Snap for 9591423 from db4553b1a39ef8ef84a097dfa2e795c0a4df60d8 to emu-33-release

Change-Id: Ice8f28ef8b14d8ae68e9fa363e93f38ecad69969
author: Android Build Coastguard Worker <android-build-coastguard-worker@google.com> 2023-02-10 17:17:42 +0000
committer: Android Build Coastguard Worker <android-build-coastguard-worker@google.com> 2023-02-10 17:17:42 +0000
commit: 910e0feba4baf9c8306748c8c792ded480cedaa0 (patch)
tree: 61f88d9a095a390cab484c109bd55ce293b117b0
parent: db1632fadc79df388a2a7248157c965ee40248a1 (diff)
parent: db4553b1a39ef8ef84a097dfa2e795c0a4df60d8 (diff)
download: google-benchmark-emu-33-release.tar.gz
158 files changed, 16170 insertions, 5042 deletions
diff --git a/.clang-format b/.clang-format
new file mode 100644
index 0000000..e7d00fe
--- /dev/null
+++ b/.clang-format
@@ -0,0 +1,5 @@
+---
+Language:        Cpp
+BasedOnStyle:  Google
+PointerAlignment: Left
+...
diff --git a/.github/ISSUE_TEMPLATE/bug_report.md b/.github/ISSUE_TEMPLATE/bug_report.md
new file mode 100644
index 0000000..6c2ced9
--- /dev/null
+++ b/.github/ISSUE_TEMPLATE/bug_report.md
@@ -0,0 +1,32 @@
+---
+name: Bug report
+about: Create a report to help us improve
+title: "[BUG]"
+labels: ''
+assignees: ''
+
+---
+
+**Describe the bug**
+A clear and concise description of what the bug is.
+
+**System**
+Which OS, compiler, and compiler version are you using:
+  - OS: 
+  - Compiler and version: 
+
+**To reproduce**
+Steps to reproduce the behavior:
+1. sync to commit ...
+2. cmake/bazel...
+3. make ...
+4. See error
+
+**Expected behavior**
+A clear and concise description of what you expected to happen.
+
+**Screenshots**
+If applicable, add screenshots to help explain your problem.
+
+**Additional context**
+Add any other context about the problem here.
diff --git a/.github/ISSUE_TEMPLATE/feature_request.md b/.github/ISSUE_TEMPLATE/feature_request.md
new file mode 100644
index 0000000..9e8ab6a
--- /dev/null
+++ b/.github/ISSUE_TEMPLATE/feature_request.md
@@ -0,0 +1,20 @@
+---
+name: Feature request
+about: Suggest an idea for this project
+title: "[FR]"
+labels: ''
+assignees: ''
+
+---
+
+**Is your feature request related to a problem? Please describe.**
+A clear and concise description of what the problem is. Ex. I'm always frustrated when [...]
+
+**Describe the solution you'd like**
+A clear and concise description of what you want to happen.
+
+**Describe alternatives you've considered**
+A clear and concise description of any alternative solutions or features you've considered.
+
+**Additional context**
+Add any other context or screenshots about the feature request here.
diff --git a/.github/workflows/bazel.yml b/.github/workflows/bazel.yml
new file mode 100644
index 0000000..d6bbe62
--- /dev/null
+++ b/.github/workflows/bazel.yml
@@ -0,0 +1,33 @@
+name: bazel
+
+on:
+  push: {}
+  pull_request: {}
+
+jobs:
+  build-and-test:
+    runs-on: ubuntu-latest
+    
+    steps:
+    - uses: actions/checkout@v1
+
+    - name: mount bazel cache
+      uses: actions/cache@v1
+      with:
+        path: "/home/runner/.cache/bazel"
+        key: bazel
+
+    - name: install bazelisk
+      run: |
+        curl -LO "https://github.com/bazelbuild/bazelisk/releases/download/v1.1.0/bazelisk-linux-amd64"
+        mkdir -p "${GITHUB_WORKSPACE}/bin/"
+        mv bazelisk-linux-amd64 "${GITHUB_WORKSPACE}/bin/bazel"
+        chmod +x "${GITHUB_WORKSPACE}/bin/bazel"
+
+    - name: build
+      run: |
+        "${GITHUB_WORKSPACE}/bin/bazel" build //...
+        
+    - name: test
+      run: |
+        "${GITHUB_WORKSPACE}/bin/bazel" test //test/...
diff --git a/.github/workflows/build-and-test.yml b/.github/workflows/build-and-test.yml
new file mode 100644
index 0000000..f0f0626
--- /dev/null
+++ b/.github/workflows/build-and-test.yml
@@ -0,0 +1,38 @@
+name: build-and-test
+
+on:
+  push:
+    branches: [ master ]
+  pull_request:
+    branches: [ master ]
+
+jobs:
+  job:
+    # TODO(dominic): Extend this to include compiler and set through env: CC/CXX.
+    name: ${{ matrix.os }}.${{ matrix.build_type }}
+    runs-on: ${{ matrix.os }}
+    strategy:
+      fail-fast: false
+      matrix:
+        os: [ubuntu-latest, ubuntu-16.04, ubuntu-20.04, macos-latest, windows-latest]
+        build_type: ['Release', 'Debug']
+    steps:
+    - uses: actions/checkout@v2
+
+    - name: create build environment
+      run: cmake -E make_directory ${{ runner.workspace }}/_build
+
+    - name: configure cmake
+      shell: bash
+      working-directory: ${{ runner.workspace }}/_build
+      run: cmake -DBENCHMARK_DOWNLOAD_DEPENDENCIES=ON $GITHUB_WORKSPACE -DCMAKE_BUILD_TYPE=${{ matrix.build_type }}
+
+    - name: build
+      shell: bash
+      working-directory: ${{ runner.workspace }}/_build
+      run: cmake --build . --config ${{ matrix.build_type }}
+
+    - name: test
+      shell: bash
+      working-directory: ${{ runner.workspace }}/_build
+      run: ctest -C ${{ matrix.build_type }}
diff --git a/.github/workflows/pylint.yml b/.github/workflows/pylint.yml
new file mode 100644
index 0000000..c869674
--- /dev/null
+++ b/.github/workflows/pylint.yml
@@ -0,0 +1,26 @@
+name: pylint
+
+on:
+  push:
+    branches: [ master ]
+  pull_request:
+    branches: [ master ]
+
+jobs:
+  pylint:
+
+    runs-on: ubuntu-latest
+
+    steps:
+    - uses: actions/checkout@v2
+    - name: Set up Python 3.8
+      uses: actions/setup-python@v1
+      with:
+        python-version: 3.8
+    - name: Install dependencies
+      run: |
+        python -m pip install --upgrade pip
+        pip install pylint pylint-exit conan
+    - name: Run pylint
+      run: |
+        pylint `find . -name '*.py'|xargs` || pylint-exit $?
diff --git a/.github/workflows/test_bindings.yml b/.github/workflows/test_bindings.yml
new file mode 100644
index 0000000..273d7f9
--- /dev/null
+++ b/.github/workflows/test_bindings.yml
@@ -0,0 +1,24 @@
+name: test-bindings
+
+on:
+  push:
+    branches: [master]
+  pull_request:
+    branches: [master]
+
+jobs:
+  python_bindings:
+    runs-on: ubuntu-latest
+
+    steps:
+      - uses: actions/checkout@v2
+      - name: Set up Python
+        uses: actions/setup-python@v1
+        with:
+          python-version: 3.8
+      - name: Install benchmark
+        run:
+          python setup.py install
+      - name: Run example bindings
+        run:
+          python bindings/python/google_benchmark/example.py
diff --git a/.gitignore b/.gitignore
index 3c1b4f2..be55d77 100644
--- a/.gitignore
+++ b/.gitignore
@@ -6,7 +6,9 @@
 *.dylib
 *.cmake
 !/cmake/*.cmake
+!/test/AssemblyTests.cmake
 *~
+*.swp
 *.pyc
 __pycache__
 
@@ -41,6 +43,24 @@ build.ninja
 install_manifest.txt
 rules.ninja
 
+# bazel output symlinks.
+bazel-*
+
 # out-of-source build top-level folders.
 build/
 _build/
+build*/
+
+# in-source dependencies
+/googletest/
+
+# Visual Studio 2015/2017 cache/options directory
+.vs/
+CMakeSettings.json
+
+# Visual Studio Code cache/options directory
+.vscode/
+
+# Python build stuff
+dist/
+*.egg-info*
diff --git a/.travis-libcxx-setup.sh b/.travis-libcxx-setup.sh
new file mode 100644
index 0000000..a591743
--- /dev/null
+++ b/.travis-libcxx-setup.sh
@@ -0,0 +1,28 @@
+#!/usr/bin/env bash
+
+# Install a newer CMake version
+curl -sSL https://cmake.org/files/v3.6/cmake-3.6.1-Linux-x86_64.sh -o install-cmake.sh
+chmod +x install-cmake.sh
+sudo ./install-cmake.sh --prefix=/usr/local --skip-license
+
+# Checkout LLVM sources
+git clone --depth=1 https://github.com/llvm-mirror/llvm.git llvm-source
+git clone --depth=1 https://github.com/llvm-mirror/libcxx.git llvm-source/projects/libcxx
+git clone --depth=1 https://github.com/llvm-mirror/libcxxabi.git llvm-source/projects/libcxxabi
+
+# Setup libc++ options
+if [ -z "$BUILD_32_BITS" ]; then
+  export BUILD_32_BITS=OFF && echo disabling 32 bit build
+fi
+
+# Build and install libc++ (Use unstable ABI for better sanitizer coverage)
+mkdir llvm-build && cd llvm-build
+cmake -DCMAKE_C_COMPILER=${C_COMPILER} -DCMAKE_CXX_COMPILER=${COMPILER} \
+      -DCMAKE_BUILD_TYPE=RelWithDebInfo -DCMAKE_INSTALL_PREFIX=/usr \
+      -DLIBCXX_ABI_UNSTABLE=ON \
+      -DLLVM_USE_SANITIZER=${LIBCXX_SANITIZER} \
+      -DLLVM_BUILD_32_BITS=${BUILD_32_BITS} \
+      ../llvm-source
+make cxx -j2
+sudo make install-cxxabi install-cxx
+cd ../
diff --git a/.travis-setup.sh b/.travis-setup.sh
deleted file mode 100644
index c900fa9..0000000
--- a/.travis-setup.sh
+++ /dev/null
@@ -1,26 +0,0 @@
-#!/usr/bin/env bash
-
-# Before install
-
-sudo add-apt-repository -y ppa:kalakris/cmake
-if [ "$STD" = "c++11" ]; then
-    sudo add-apt-repository -y ppa:ubuntu-toolchain-r/test
-    if [ "$CXX" = "clang++" ]; then
-        wget -O - http://llvm.org/apt/llvm-snapshot.gpg.key | sudo apt-key add -
-        sudo add-apt-repository -y "deb http://llvm.org/apt/precise/ llvm-toolchain-precise-3.6 main"
-    fi
-fi
-sudo apt-get update -qq
-
-# Install
-sudo apt-get install -qq cmake
-if [ "$STD" = "c++11" ] && [ "$CXX" = "g++" ]; then
-    sudo apt-get install -qq gcc-4.8 g++-4.8
-    sudo update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-4.8 90
-    sudo update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-4.8 90
-elif [ "$CXX" = "clang++" ]; then
-    sudo apt-get install -qq clang-3.6
-    sudo update-alternatives --install /usr/local/bin/clang   clang   /usr/bin/clang-3.6 90
-    sudo update-alternatives --install /usr/local/bin/clang++ clang++ /usr/bin/clang++-3.6 90
-    export PATH=/usr/local/bin:$PATH
-fi
diff --git a/.travis.yml b/.travis.yml
index bf26395..36e343d 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -1,43 +1,231 @@
+sudo: required
+dist: trusty
 language: cpp
 
-# NOTE: The COMPILER variable is unused. It simply makes the display on
-# travis-ci.org more readable.
 matrix:
-    include:
-        - compiler: gcc
-          env: COMPILER=g++-4.6     STD=c++0x BUILD_TYPE=Coverage
-        - compiler: gcc
-          env: COMPILER=g++-4.6     STD=c++0x BUILD_TYPE=Debug
-        - compiler: gcc
-          env: COMPILER=g++-4.6     STD=c++0x BUILD_TYPE=Release
-        - compiler: gcc
-          env: COMPILER=g++-4.8     STD=c++11 BUILD_TYPE=Debug
-        - compiler: gcc
-          env: COMPILER=g++-4.8     STD=c++11 BUILD_TYPE=Release
-        - compiler: clang
-          env: COMPILER=clang++-3.6 STD=c++11 BUILD_TYPE=Debug
-        - compiler: clang
-          env: COMPILER=clang++-3.6 STD=c++11 BUILD_TYPE=Release
+  include:
+    - compiler: gcc
+      addons:
+        apt:
+          packages:
+            - lcov
+      env: COMPILER=g++ C_COMPILER=gcc BUILD_TYPE=Coverage
+    - compiler: gcc
+      env: COMPILER=g++ C_COMPILER=gcc BUILD_TYPE=Debug
+    - compiler: gcc
+      env: COMPILER=g++ C_COMPILER=gcc BUILD_TYPE=Release
+    - compiler: gcc
+      addons:
+        apt:
+          packages:
+            - g++-multilib
+            - libc6:i386
+      env:
+        - COMPILER=g++
+        - C_COMPILER=gcc
+        - BUILD_TYPE=Debug
+        - BUILD_32_BITS=ON
+        - EXTRA_FLAGS="-m32"
+    - compiler: gcc
+      addons:
+        apt:
+          packages:
+            - g++-multilib
+            - libc6:i386
+      env:
+        - COMPILER=g++
+        - C_COMPILER=gcc
+        - BUILD_TYPE=Release
+        - BUILD_32_BITS=ON
+        - EXTRA_FLAGS="-m32"
+    - compiler: gcc
+      env:
+        - INSTALL_GCC6_FROM_PPA=1
+        - COMPILER=g++-6 C_COMPILER=gcc-6  BUILD_TYPE=Debug
+        - ENABLE_SANITIZER=1
+        - EXTRA_FLAGS="-fno-omit-frame-pointer -g -O2 -fsanitize=undefined,address -fuse-ld=gold"
+    - compiler: clang
+      env: COMPILER=clang++ C_COMPILER=clang BUILD_TYPE=Debug
+    - compiler: clang
+      env: COMPILER=clang++ C_COMPILER=clang BUILD_TYPE=Release
+    # Clang w/ libc++
+    - compiler: clang
+      dist: xenial
+      addons:
+        apt:
+          packages:
+            clang-3.8
+      env:
+        - INSTALL_GCC6_FROM_PPA=1
+        - COMPILER=clang++-3.8 C_COMPILER=clang-3.8 BUILD_TYPE=Debug
+        - LIBCXX_BUILD=1
+        - EXTRA_CXX_FLAGS="-stdlib=libc++"
+    - compiler: clang
+      dist: xenial
+      addons:
+        apt:
+          packages:
+            clang-3.8
+      env:
+        - INSTALL_GCC6_FROM_PPA=1
+        - COMPILER=clang++-3.8 C_COMPILER=clang-3.8 BUILD_TYPE=Release
+        - LIBCXX_BUILD=1
+        - EXTRA_CXX_FLAGS="-stdlib=libc++"
+    # Clang w/ 32bit libc++
+    - compiler: clang
+      dist: xenial
+      addons:
+        apt:
+          packages:
+            - clang-3.8
+            - g++-multilib
+            - libc6:i386
+      env:
+        - INSTALL_GCC6_FROM_PPA=1
+        - COMPILER=clang++-3.8 C_COMPILER=clang-3.8 BUILD_TYPE=Debug
+        - LIBCXX_BUILD=1
+        - BUILD_32_BITS=ON
+        - EXTRA_FLAGS="-m32"
+        - EXTRA_CXX_FLAGS="-stdlib=libc++"
+    # Clang w/ 32bit libc++
+    - compiler: clang
+      dist: xenial
+      addons:
+        apt:
+          packages:
+            - clang-3.8
+            - g++-multilib
+            - libc6:i386
+      env:
+        - INSTALL_GCC6_FROM_PPA=1
+        - COMPILER=clang++-3.8 C_COMPILER=clang-3.8 BUILD_TYPE=Release
+        - LIBCXX_BUILD=1
+        - BUILD_32_BITS=ON
+        - EXTRA_FLAGS="-m32"
+        - EXTRA_CXX_FLAGS="-stdlib=libc++"
+    # Clang w/ libc++, ASAN, UBSAN
+    - compiler: clang
+      dist: xenial
+      addons:
+        apt:
+          packages:
+            clang-3.8
+      env:
+        - INSTALL_GCC6_FROM_PPA=1
+        - COMPILER=clang++-3.8 C_COMPILER=clang-3.8 BUILD_TYPE=Debug
+        - LIBCXX_BUILD=1 LIBCXX_SANITIZER="Undefined;Address"
+        - ENABLE_SANITIZER=1
+        - EXTRA_FLAGS="-g -O2 -fno-omit-frame-pointer -fsanitize=undefined,address -fno-sanitize-recover=all"
+        - EXTRA_CXX_FLAGS="-stdlib=libc++"
+        - UBSAN_OPTIONS=print_stacktrace=1
+    # Clang w/ libc++ and MSAN
+    - compiler: clang
+      dist: xenial
+      addons:
+        apt:
+          packages:
+            clang-3.8
+      env:
+        - INSTALL_GCC6_FROM_PPA=1
+        - COMPILER=clang++-3.8 C_COMPILER=clang-3.8 BUILD_TYPE=Debug
+        - LIBCXX_BUILD=1 LIBCXX_SANITIZER=MemoryWithOrigins
+        - ENABLE_SANITIZER=1
+        - EXTRA_FLAGS="-g -O2 -fno-omit-frame-pointer -fsanitize=memory -fsanitize-memory-track-origins"
+        - EXTRA_CXX_FLAGS="-stdlib=libc++"
+    # Clang w/ libc++ and TSAN
+    - compiler: clang
+      dist: xenial
+      addons:
+        apt:
+          packages:
+            clang-3.8
+      env:
+        - INSTALL_GCC6_FROM_PPA=1
+        - COMPILER=clang++-3.8 C_COMPILER=clang-3.8 BUILD_TYPE=RelWithDebInfo
+        - LIBCXX_BUILD=1 LIBCXX_SANITIZER=Thread
+        - ENABLE_SANITIZER=1
+        - EXTRA_FLAGS="-g -O2 -fno-omit-frame-pointer -fsanitize=thread -fno-sanitize-recover=all"
+        - EXTRA_CXX_FLAGS="-stdlib=libc++"
+    - os: osx
+      osx_image: xcode8.3
+      compiler: clang
+      env:
+        - COMPILER=clang++ BUILD_TYPE=Debug
+    - os: osx
+      osx_image: xcode8.3
+      compiler: clang
+      env:
+        - COMPILER=clang++ BUILD_TYPE=Release
+    - os: osx
+      osx_image: xcode8.3
+      compiler: clang
+      env:
+        - COMPILER=clang++
+        - BUILD_TYPE=Release
+        - BUILD_32_BITS=ON
+        - EXTRA_FLAGS="-m32"
+    - os: osx
+      osx_image: xcode9.4
+      compiler: gcc
+      env:
+        - COMPILER=g++-7 C_COMPILER=gcc-7  BUILD_TYPE=Debug
 
 before_script:
-    - source .travis-setup.sh
-    - mkdir build && cd build
+  - if [ -n "${LIBCXX_BUILD}" ]; then
+      source .travis-libcxx-setup.sh;
+    fi
+  - if [ -n "${ENABLE_SANITIZER}" ]; then
+      export EXTRA_OPTIONS="-DBENCHMARK_ENABLE_ASSEMBLY_TESTS=OFF";
+    else
+      export EXTRA_OPTIONS="";
+    fi
+  - mkdir -p build && cd build
+
+before_install:
+  - if [ -z "$BUILD_32_BITS" ]; then
+      export BUILD_32_BITS=OFF && echo disabling 32 bit build;
+    fi
+  - if [ -n "${INSTALL_GCC6_FROM_PPA}" ]; then
+      sudo add-apt-repository -y "ppa:ubuntu-toolchain-r/test";
+      sudo apt-get update --option Acquire::Retries=100 --option Acquire::http::Timeout="60";
+    fi
 
 install:
+  - if [ -n "${INSTALL_GCC6_FROM_PPA}" ]; then
+      travis_wait sudo -E apt-get -yq --no-install-suggests --no-install-recommends install g++-6;
+    fi
+  - if [ "${TRAVIS_OS_NAME}" == "linux" -a "${BUILD_32_BITS}" == "OFF" ]; then
+      travis_wait sudo -E apt-get -y --no-install-suggests --no-install-recommends install llvm-3.9-tools;
+      sudo cp /usr/lib/llvm-3.9/bin/FileCheck /usr/local/bin/;
+    fi
   - if [ "${BUILD_TYPE}" == "Coverage" -a "${TRAVIS_OS_NAME}" == "linux" ]; then
       PATH=~/.local/bin:${PATH};
       pip install --user --upgrade pip;
-      pip install --user cpp-coveralls;
+      travis_wait pip install --user cpp-coveralls;
+    fi
+  - if [ "${C_COMPILER}" == "gcc-7" -a "${TRAVIS_OS_NAME}" == "osx" ]; then
+      rm -f /usr/local/include/c++;
+      brew update;
+      travis_wait brew install gcc@7;
+    fi
+  - if [ "${TRAVIS_OS_NAME}" == "linux" ]; then
+      sudo apt-get update -qq;
+      sudo apt-get install -qq unzip cmake3;
+      wget https://github.com/bazelbuild/bazel/releases/download/3.2.0/bazel-3.2.0-installer-linux-x86_64.sh --output-document bazel-installer.sh;
+      travis_wait sudo bash bazel-installer.sh;
+    fi
+  - if [ "${TRAVIS_OS_NAME}" == "osx" ]; then
+      curl -L -o bazel-installer.sh https://github.com/bazelbuild/bazel/releases/download/3.2.0/bazel-3.2.0-installer-darwin-x86_64.sh;
+      travis_wait sudo bash bazel-installer.sh;
     fi
 
 script:
-    - cmake .. -DCMAKE_BUILD_TYPE=${BUILD_TYPE} -DCMAKE_CXX_FLAGS="-std=${STD}"
-    - make
-    - make CTEST_OUTPUT_ON_FAILURE=1 test
+  - cmake -DCMAKE_C_COMPILER=${C_COMPILER} -DCMAKE_CXX_COMPILER=${COMPILER} -DCMAKE_BUILD_TYPE=${BUILD_TYPE} -DCMAKE_C_FLAGS="${EXTRA_FLAGS}" -DCMAKE_CXX_FLAGS="${EXTRA_FLAGS} ${EXTRA_CXX_FLAGS}" -DBENCHMARK_DOWNLOAD_DEPENDENCIES=ON -DBENCHMARK_BUILD_32_BITS=${BUILD_32_BITS} ${EXTRA_OPTIONS} ..
+  - make
+  - ctest -C ${BUILD_TYPE} --output-on-failure
+  - bazel test -c dbg --define google_benchmark.have_regex=posix --announce_rc --verbose_failures --test_output=errors --keep_going //test/...
 
 after_success:
   - if [ "${BUILD_TYPE}" == "Coverage" -a "${TRAVIS_OS_NAME}" == "linux" ]; then
       coveralls --include src --include include --gcov-options '\-lp' --root .. --build-root .;
     fi
-
-sudo: required
diff --git a/.ycm_extra_conf.py b/.ycm_extra_conf.py
index 8619435..5649ddc 100644
--- a/.ycm_extra_conf.py
+++ b/.ycm_extra_conf.py
@@ -7,7 +7,7 @@ import ycm_core
 flags = [
 '-Wall',
 '-Werror',
-'-pendantic-errors',
+'-pedantic-errors',
 '-std=c++0x',
 '-fno-strict-aliasing',
 '-O3',
diff --git a/AUTHORS b/AUTHORS
index 0f93e01..3068b2e 100644
--- a/AUTHORS
+++ b/AUTHORS
@@ -9,25 +9,50 @@
 # Please keep the list sorted.
 
 Albert Pretorius <pretoalb@gmail.com>
+Alex Steele <steeleal123@gmail.com>
+Andriy Berestovskyy <berestovskyy@gmail.com>
 Arne Beer <arne@twobeer.de>
+Carto
+Christian Wassermann <christian_wassermann@web.de>
 Christopher Seymour <chris.j.seymour@hotmail.com>
+Colin Braley <braley.colin@gmail.com>
+Daniel Harvey <danielharvey458@gmail.com>
 David Coeurjolly <david.coeurjolly@liris.cnrs.fr>
-Dominic Hamon <dma@stripysock.com>
+Deniz Evrenci <denizevrenci@gmail.com>
+Dirac Research 
+Dominik Czarnota <dominik.b.czarnota@gmail.com>
+Eric Backus <eric_backus@alum.mit.edu>
+Eric Fiselier <eric@efcs.ca>
 Eugene Zhuk <eugene.zhuk@gmail.com>
 Evgeny Safronov <division494@gmail.com>
+Federico Ficarelli <federico.ficarelli@gmail.com>
 Felix Homann <linuxaudio@showlabor.de>
+Gergő Szitár <szitar.gergo@gmail.com>
 Google Inc.
+International Business Machines Corporation
 Ismael Jimenez Martinez <ismael.jimenez.martinez@gmail.com>
+Jern-Kuan Leong <jernkuan@gmail.com>
 JianXiong Zhou <zhoujianxiong2@gmail.com>
+Joao Paulo Magalhaes <joaoppmagalhaes@gmail.com>
+Jordan Williams <jwillikers@protonmail.com>
 Jussi Knuuttila <jussi.knuuttila@gmail.com>
 Kaito Udagawa <umireon@gmail.com>
+Kishan Kumar <kumar.kishan@outlook.com>
 Lei Xu <eddyxu@gmail.com>
 Matt Clarkson <mattyclarkson@gmail.com>
+Maxim Vafin <maxvafin@gmail.com>
+MongoDB Inc.
+Nick Hutchinson <nshutchinson@gmail.com>
 Oleksandr Sochka <sasha.sochka@gmail.com>
+Ori Livneh <ori.livneh@gmail.com>
 Paul Redmond <paul.redmond@gmail.com>
 Radoslav Yovchev <radoslav.tm@gmail.com>
+Roman Lebedev <lebedev.ri@gmail.com>
+Sayan Bhattacharjee <aero.sayan@gmail.com>
 Shuo Chen <chenshuo@chenshuo.com>
+Steinar H. Gunderson <sgunderson@bigfoot.com>
+Stripe, Inc.
+Yixuan Qiu <yixuanq@gmail.com>
 Yusuke Suzuki <utatane.tea@gmail.com>
-Dirac Research 
 Zbigniew Skowron <zbychs@gmail.com>
-Dominik Czarnota <dominik.b.czarnota@gmail.com>
+Min-Yih Hsu <yihshyng223@gmail.com>
diff --git a/Android.bp b/Android.bp
index 14a598b..1f1a2d6 100644
--- a/Android.bp
+++ b/Android.bp
@@ -14,36 +14,57 @@
 // limitations under the License.
 //
 
-cc_library_static {
-    name: "libgoogle-benchmark",
+package {
+    default_applicable_licenses: ["external_google-benchmark_license"],
+}
+
+// Added automatically by a large-scale-change
+license {
+    name: "external_google-benchmark_license",
+    visibility: [":__subpackages__"],
+    license_kinds: [
+        "SPDX-license-identifier-Apache-2.0",
+    ],
+    license_text: [
+        "LICENSE",
+    ],
+}
+
+cc_defaults {
+    name: "libgoogle-benchmark-defaults",
     host_supported: true,
-    target: {
-        darwin: {
-            enabled: false,
-        },
-    },
     local_include_dirs: ["include"],
+    vendor_available: true,
     cflags: [
         "-DBENCHMARK_ANDROID",
         "-DHAVE_POSIX_REGEX",
+        "-Werror",
+        "-Wno-deprecated-declarations",
     ],
-
     srcs: [
-        "src/benchmark.cc",
-        "src/colorprint.cc",
-        "src/commandlineflags.cc",
-        "src/complexity.cc",
-        "src/console_reporter.cc",
-        "src/csv_reporter.cc",
-        "src/json_reporter.cc",
-        "src/log.cc",
-        "src/reporter.cc",
-        "src/re_posix.cc",
-        "src/sleep.cc",
-        "src/string_util.cc",
-        "src/sysinfo.cc",
-        "src/walltime.cc",
+        "src/*.cc",
     ],
     export_include_dirs: ["include"],
 }
 
+// For benchmarks that define their own main().
+cc_library_static {
+    name: "libgoogle-benchmark",
+    defaults: ["libgoogle-benchmark-defaults"],
+    exclude_srcs: [
+        "src/benchmark_main.cc",
+    ],
+}
+
+// For benchmarks that want to use the default main().
+// Make sure this dependency is in the whole_static_libs attribute.
+cc_library_static {
+    name: "libgoogle-benchmark-main",
+    defaults: ["libgoogle-benchmark-defaults"],
+}
+
+cc_test {
+    name: "google-benchmark-test",
+    srcs: ["test/basic_test.cc"],
+    static_libs: ["libgoogle-benchmark"],
+}
diff --git a/BUILD.bazel b/BUILD.bazel
new file mode 100644
index 0000000..eb35b62
--- /dev/null
+++ b/BUILD.bazel
@@ -0,0 +1,44 @@
+load("@rules_cc//cc:defs.bzl", "cc_library")
+
+licenses(["notice"])
+
+config_setting(
+    name = "windows",
+    values = {
+        "cpu": "x64_windows",
+    },
+    visibility = [":__subpackages__"],
+)
+
+cc_library(
+    name = "benchmark",
+    srcs = glob(
+        [
+            "src/*.cc",
+            "src/*.h",
+        ],
+        exclude = ["src/benchmark_main.cc"],
+    ),
+    hdrs = ["include/benchmark/benchmark.h"],
+    linkopts = select({
+        ":windows": ["-DEFAULTLIB:shlwapi.lib"],
+        "//conditions:default": ["-pthread"],
+    }),
+    strip_include_prefix = "include",
+    visibility = ["//visibility:public"],
+)
+
+cc_library(
+    name = "benchmark_main",
+    srcs = ["src/benchmark_main.cc"],
+    hdrs = ["include/benchmark/benchmark.h"],
+    strip_include_prefix = "include",
+    visibility = ["//visibility:public"],
+    deps = [":benchmark"],
+)
+
+cc_library(
+    name = "benchmark_internal_headers",
+    hdrs = glob(["src/*.h"]),
+    visibility = ["//test:__pkg__"],
+)
diff --git a/CMakeLists.txt b/CMakeLists.txt
index a1251e7..1007254 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1,27 +1,89 @@
-cmake_minimum_required (VERSION 2.8.11)
-project (benchmark)
+cmake_minimum_required (VERSION 3.5.1)
 
 foreach(p
+    CMP0048 # OK to clear PROJECT_VERSION on project()
     CMP0054 # CMake 3.1
     CMP0056 # export EXE_LINKER_FLAGS to try_run
+    CMP0057 # Support new if() IN_LIST operator
+    CMP0063 # Honor visibility properties for all targets
+    CMP0077 # Allow option() overrides in importing projects
     )
   if(POLICY ${p})
     cmake_policy(SET ${p} NEW)
   endif()
 endforeach()
 
+project (benchmark CXX)
+
 option(BENCHMARK_ENABLE_TESTING "Enable testing of the benchmark library." ON)
+option(BENCHMARK_ENABLE_EXCEPTIONS "Enable the use of exceptions in the benchmark library." ON)
 option(BENCHMARK_ENABLE_LTO "Enable link time optimisation of the benchmark library." OFF)
+option(BENCHMARK_USE_LIBCXX "Build and test using libc++ as the standard library." OFF)
+if(NOT MSVC)
+  option(BENCHMARK_BUILD_32_BITS "Build a 32 bit version of the library." OFF)
+else()
+  set(BENCHMARK_BUILD_32_BITS OFF CACHE BOOL "Build a 32 bit version of the library - unsupported when using MSVC)" FORCE)
+endif()
+option(BENCHMARK_ENABLE_INSTALL "Enable installation of benchmark. (Projects embedding benchmark may want to turn this OFF.)" ON)
+
+# Allow unmet dependencies to be met using CMake's ExternalProject mechanics, which
+# may require downloading the source code.
+option(BENCHMARK_DOWNLOAD_DEPENDENCIES "Allow the downloading and in-tree building of unmet dependencies" OFF)
+
+# This option can be used to disable building and running unit tests which depend on gtest
+# in cases where it is not possible to build or find a valid version of gtest.
+option(BENCHMARK_ENABLE_GTEST_TESTS "Enable building the unit tests which depend on gtest" ON)
+
+set(CMAKE_WINDOWS_EXPORT_ALL_SYMBOLS ON)
+set(ENABLE_ASSEMBLY_TESTS_DEFAULT OFF)
+function(should_enable_assembly_tests)
+  if(CMAKE_BUILD_TYPE)
+    string(TOLOWER ${CMAKE_BUILD_TYPE} CMAKE_BUILD_TYPE_LOWER)
+    if (${CMAKE_BUILD_TYPE_LOWER} MATCHES "coverage")
+      # FIXME: The --coverage flag needs to be removed when building assembly
+      # tests for this to work.
+      return()
+    endif()
+  endif()
+  if (MSVC)
+    return()
+  elseif(NOT CMAKE_SYSTEM_PROCESSOR MATCHES "x86_64")
+    return()
+  elseif(NOT CMAKE_SIZEOF_VOID_P EQUAL 8)
+    # FIXME: Make these work on 32 bit builds
+    return()
+  elseif(BENCHMARK_BUILD_32_BITS)
+     # FIXME: Make these work on 32 bit builds
+    return()
+  endif()
+  find_program(LLVM_FILECHECK_EXE FileCheck)
+  if (LLVM_FILECHECK_EXE)
+    set(LLVM_FILECHECK_EXE "${LLVM_FILECHECK_EXE}" CACHE PATH "llvm filecheck" FORCE)
+    message(STATUS "LLVM FileCheck Found: ${LLVM_FILECHECK_EXE}")
+  else()
+    message(STATUS "Failed to find LLVM FileCheck")
+    return()
+  endif()
+  set(ENABLE_ASSEMBLY_TESTS_DEFAULT ON PARENT_SCOPE)
+endfunction()
+should_enable_assembly_tests()
+
+# This option disables the building and running of the assembly verification tests
+option(BENCHMARK_ENABLE_ASSEMBLY_TESTS "Enable building and running the assembly tests"
+    ${ENABLE_ASSEMBLY_TESTS_DEFAULT})
+
 # Make sure we can import out CMake functions
+list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake/Modules")
 list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake")
 
+
 # Read the git tags to determine the project version
 include(GetGitVersion)
 get_git_version(GIT_VERSION)
 
 # Tell the user what versions we are using
 string(REGEX MATCH "[0-9]+\\.[0-9]+\\.[0-9]+" VERSION ${GIT_VERSION})
-message("-- Version: ${VERSION}")
+message(STATUS "Version: ${VERSION}")
 
 # The version of the libraries
 set(GENERIC_LIB_VERSION ${VERSION})
@@ -32,12 +94,21 @@ include(CheckCXXCompilerFlag)
 include(AddCXXCompilerFlag)
 include(CXXFeatureCheck)
 
-if ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "MSVC")
+if (BENCHMARK_BUILD_32_BITS)
+  add_required_cxx_compiler_flag(-m32)
+endif()
+
+if (MSVC)
   # Turn compiler warnings up to 11
   string(REGEX REPLACE "[-/]W[1-4]" "" CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}")
   set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /W4")
   add_definitions(-D_CRT_SECURE_NO_WARNINGS)
 
+  if (NOT BENCHMARK_ENABLE_EXCEPTIONS)
+    add_cxx_compiler_flag(-EHs-)
+    add_cxx_compiler_flag(-EHa-)
+    add_definitions(-D_HAS_EXCEPTIONS=0)
+  endif()
   # Link time optimisation
   if (BENCHMARK_ENABLE_LTO)
     set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /GL")
@@ -68,31 +139,63 @@ else()
 
   # Turn compiler warnings up to 11
   add_cxx_compiler_flag(-Wall)
-
   add_cxx_compiler_flag(-Wextra)
   add_cxx_compiler_flag(-Wshadow)
   add_cxx_compiler_flag(-Werror RELEASE)
   add_cxx_compiler_flag(-Werror RELWITHDEBINFO)
   add_cxx_compiler_flag(-Werror MINSIZEREL)
-  add_cxx_compiler_flag(-pedantic)
-  add_cxx_compiler_flag(-pedantic-errors)
+  # Disabled until googletest (gmock) stops emitting variadic macro warnings
+  #add_cxx_compiler_flag(-pedantic)
+  #add_cxx_compiler_flag(-pedantic-errors)
   add_cxx_compiler_flag(-Wshorten-64-to-32)
-  add_cxx_compiler_flag(-Wfloat-equal)
-  add_cxx_compiler_flag(-Wzero-as-null-pointer-constant)
   add_cxx_compiler_flag(-fstrict-aliasing)
+  # Disable warnings regarding deprecated parts of the library while building
+  # and testing those parts of the library.
+  add_cxx_compiler_flag(-Wno-deprecated-declarations)
+  if (CMAKE_CXX_COMPILER_ID STREQUAL "Intel")
+    # Intel silently ignores '-Wno-deprecated-declarations',
+    # warning no. 1786 must be explicitly disabled.
+    # See #631 for rationale.
+    add_cxx_compiler_flag(-wd1786)
+  endif()
+  # Disable deprecation warnings for release builds (when -Werror is enabled).
+  add_cxx_compiler_flag(-Wno-deprecated RELEASE)
+  add_cxx_compiler_flag(-Wno-deprecated RELWITHDEBINFO)
+  add_cxx_compiler_flag(-Wno-deprecated MINSIZEREL)
+  if (NOT BENCHMARK_ENABLE_EXCEPTIONS)
+    add_cxx_compiler_flag(-fno-exceptions)
+  endif()
+
   if (HAVE_CXX_FLAG_FSTRICT_ALIASING)
-    add_cxx_compiler_flag(-Wstrict-aliasing)
+    if (NOT CMAKE_CXX_COMPILER_ID STREQUAL "Intel") #ICC17u2: Many false positives for Wstrict-aliasing
+      add_cxx_compiler_flag(-Wstrict-aliasing)
+    endif()
   endif()
+  # ICC17u2: overloaded virtual function "benchmark::Fixture::SetUp" is only partially overridden
+  # (because of deprecated overload)
+  add_cxx_compiler_flag(-wd654)
   add_cxx_compiler_flag(-Wthread-safety)
-  if (HAVE_WTHREAD_SAFETY)
-    add_definitions(-DHAVE_WTHREAD_SAFETY)
+  if (HAVE_CXX_FLAG_WTHREAD_SAFETY)
     cxx_feature_check(THREAD_SAFETY_ATTRIBUTES)
   endif()
 
+  # On most UNIX like platforms g++ and clang++ define _GNU_SOURCE as a
+  # predefined macro, which turns on all of the wonderful libc extensions.
+  # However g++ doesn't do this in Cygwin so we have to define it ourselves
+  # since we depend on GNU/POSIX/BSD extensions.
+  if (CYGWIN)
+    add_definitions(-D_GNU_SOURCE=1)
+  endif()
+
+  if (QNXNTO)
+    add_definitions(-D_QNX_SOURCE)
+  endif()
+
   # Link time optimisation
   if (BENCHMARK_ENABLE_LTO)
     add_cxx_compiler_flag(-flto)
-    if ("${CMAKE_C_COMPILER_ID}" STREQUAL "GNU")
+    add_cxx_compiler_flag(-Wno-lto-type-mismatch)
+    if ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "GNU")
       find_program(GCC_AR gcc-ar)
       if (GCC_AR)
         set(CMAKE_AR ${GCC_AR})
@@ -101,38 +204,70 @@ else()
       if (GCC_RANLIB)
         set(CMAKE_RANLIB ${GCC_RANLIB})
       endif()
+    elseif("${CMAKE_CXX_COMPILER_ID}" MATCHES "Clang")
+      include(llvm-toolchain)
     endif()
   endif()
 
   # Coverage build type
-  set(CMAKE_CXX_FLAGS_COVERAGE "${CMAKE_CXX_FLAGS_DEBUG}" CACHE STRING
-    "Flags used by the C++ compiler during coverage builds."
+  set(BENCHMARK_CXX_FLAGS_COVERAGE "${CMAKE_CXX_FLAGS_DEBUG}"
+    CACHE STRING "Flags used by the C++ compiler during coverage builds."
     FORCE)
-  set(CMAKE_EXE_LINKER_FLAGS_COVERAGE
-    "${CMAKE_EXE_LINKER_FLAGS_DEBUG}" CACHE STRING
-    "Flags used for linking binaries during coverage builds."
+  set(BENCHMARK_EXE_LINKER_FLAGS_COVERAGE "${CMAKE_EXE_LINKER_FLAGS_DEBUG}"
+    CACHE STRING "Flags used for linking binaries during coverage builds."
     FORCE)
-  set(CMAKE_SHARED_LINKER_FLAGS_COVERAGE
-    "${CMAKE_SHARED_LINKER_FLAGS_DEBUG}" CACHE STRING
-    "Flags used by the shared libraries linker during coverage builds."
+  set(BENCHMARK_SHARED_LINKER_FLAGS_COVERAGE "${CMAKE_SHARED_LINKER_FLAGS_DEBUG}"
+    CACHE STRING "Flags used by the shared libraries linker during coverage builds."
     FORCE)
   mark_as_advanced(
-    CMAKE_CXX_FLAGS_COVERAGE
-    CMAKE_EXE_LINKER_FLAGS_COVERAGE
-    CMAKE_SHARED_LINKER_FLAGS_COVERAGE)
+    BENCHMARK_CXX_FLAGS_COVERAGE
+    BENCHMARK_EXE_LINKER_FLAGS_COVERAGE
+    BENCHMARK_SHARED_LINKER_FLAGS_COVERAGE)
   set(CMAKE_BUILD_TYPE "${CMAKE_BUILD_TYPE}" CACHE STRING
-    "Choose the type of build, options are: None Debug Release RelWithDebInfo MinSizeRel Coverage."
-    FORCE)
+    "Choose the type of build, options are: None Debug Release RelWithDebInfo MinSizeRel Coverage.")
   add_cxx_compiler_flag(--coverage COVERAGE)
 endif()
 
+if (BENCHMARK_USE_LIBCXX)
+  if ("${CMAKE_CXX_COMPILER_ID}" MATCHES "Clang")
+    add_cxx_compiler_flag(-stdlib=libc++)
+  elseif ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "GNU" OR
+          "${CMAKE_CXX_COMPILER_ID}" STREQUAL "Intel")
+    add_cxx_compiler_flag(-nostdinc++)
+    message(WARNING "libc++ header path must be manually specified using CMAKE_CXX_FLAGS")
+    # Adding -nodefaultlibs directly to CMAKE_<TYPE>_LINKER_FLAGS will break
+    # configuration checks such as 'find_package(Threads)'
+    list(APPEND BENCHMARK_CXX_LINKER_FLAGS -nodefaultlibs)
+    # -lc++ cannot be added directly to CMAKE_<TYPE>_LINKER_FLAGS because
+    # linker flags appear before all linker inputs and -lc++ must appear after.
+    list(APPEND BENCHMARK_CXX_LIBRARIES c++)
+  else()
+    message(FATAL_ERROR "-DBENCHMARK_USE_LIBCXX:BOOL=ON is not supported for compiler")
+  endif()
+endif(BENCHMARK_USE_LIBCXX)
+
+set(EXTRA_CXX_FLAGS "")
+if (WIN32 AND "${CMAKE_CXX_COMPILER_ID}" MATCHES "Clang")
+  # Clang on Windows fails to compile the regex feature check under C++11
+  set(EXTRA_CXX_FLAGS "-DCMAKE_CXX_STANDARD=14")
+endif()
+
 # C++ feature checks
-cxx_feature_check(STD_REGEX)
-cxx_feature_check(GNU_POSIX_REGEX)
-cxx_feature_check(POSIX_REGEX)
-cxx_feature_check(STEADY_CLOCK)
+# Determine the correct regular expression engine to use
+cxx_feature_check(STD_REGEX ${EXTRA_CXX_FLAGS})
+cxx_feature_check(GNU_POSIX_REGEX ${EXTRA_CXX_FLAGS})
+cxx_feature_check(POSIX_REGEX ${EXTRA_CXX_FLAGS})
+if(NOT HAVE_STD_REGEX AND NOT HAVE_GNU_POSIX_REGEX AND NOT HAVE_POSIX_REGEX)
+  message(FATAL_ERROR "Failed to determine the source files for the regular expression backend")
+endif()
+if (NOT BENCHMARK_ENABLE_EXCEPTIONS AND HAVE_STD_REGEX
+        AND NOT HAVE_GNU_POSIX_REGEX AND NOT HAVE_POSIX_REGEX)
+  message(WARNING "Using std::regex with exceptions disabled is not fully supported")
+endif()
 
+cxx_feature_check(STEADY_CLOCK)
 # Ensure we have pthreads
+set(THREADS_PREFER_PTHREAD_FLAG ON)
 find_package(Threads REQUIRED)
 
 # Set up directories
@@ -143,5 +278,10 @@ add_subdirectory(src)
 
 if (BENCHMARK_ENABLE_TESTING)
   enable_testing()
+  if (BENCHMARK_ENABLE_GTEST_TESTS AND
+      NOT (TARGET gtest AND TARGET gtest_main AND
+           TARGET gmock AND TARGET gmock_main))
+    include(GoogleTest)
+  endif()
   add_subdirectory(test)
 endif()
diff --git a/CONTRIBUTORS b/CONTRIBUTORS
index 4bff126..b5e1aa4 100644
--- a/CONTRIBUTORS
+++ b/CONTRIBUTORS
@@ -22,31 +22,62 @@
 #
 # Please keep the list sorted.
 
+Abhina Sreeskantharajan <abhina.sreeskantharajan@ibm.com>
 Albert Pretorius <pretoalb@gmail.com>
+Alex Steele <steelal123@gmail.com>
+Andriy Berestovskyy <berestovskyy@gmail.com>
 Arne Beer <arne@twobeer.de>
 Billy Robert O'Neal III <billy.oneal@gmail.com> <bion@microsoft.com>
 Chris Kennelly <ckennelly@google.com> <ckennelly@ckennelly.com>
+Christian Wassermann <christian_wassermann@web.de>
 Christopher Seymour <chris.j.seymour@hotmail.com>
+Colin Braley <braley.colin@gmail.com>
+Cyrille Faucheux <cyrille.faucheux@gmail.com>
+Daniel Harvey <danielharvey458@gmail.com>
 David Coeurjolly <david.coeurjolly@liris.cnrs.fr>
-Dominic Hamon <dma@stripysock.com>
+Deniz Evrenci <denizevrenci@gmail.com>
+Dominic Hamon <dma@stripysock.com> <dominic@google.com>
+Dominik Czarnota <dominik.b.czarnota@gmail.com>
+Eric Backus <eric_backus@alum.mit.edu>
 Eric Fiselier <eric@efcs.ca>
 Eugene Zhuk <eugene.zhuk@gmail.com>
 Evgeny Safronov <division494@gmail.com>
+Fanbo Meng <fanbo.meng@ibm.com>
+Federico Ficarelli <federico.ficarelli@gmail.com>
 Felix Homann <linuxaudio@showlabor.de>
+Geoffrey Martin-Noble <gcmn@google.com> <gmngeoffrey@gmail.com>
+Gergő Szitár <szitar.gergo@gmail.com>
+Hannes Hauswedell <h2@fsfe.org>
 Ismael Jimenez Martinez <ismael.jimenez.martinez@gmail.com>
+Jern-Kuan Leong <jernkuan@gmail.com>
 JianXiong Zhou <zhoujianxiong2@gmail.com>
+Joao Paulo Magalhaes <joaoppmagalhaes@gmail.com>
+John Millikin <jmillikin@stripe.com>
+Jordan Williams <jwillikers@protonmail.com>
 Jussi Knuuttila <jussi.knuuttila@gmail.com>
-Kaito Udagawa <umireon@gmail.com>
 Kai Wolf <kai.wolf@gmail.com>
+Kaito Udagawa <umireon@gmail.com>
+Kishan Kumar <kumar.kishan@outlook.com>
 Lei Xu <eddyxu@gmail.com>
 Matt Clarkson <mattyclarkson@gmail.com>
+Maxim Vafin <maxvafin@gmail.com>
+Nick Hutchinson <nshutchinson@gmail.com>
 Oleksandr Sochka <sasha.sochka@gmail.com>
+Ori Livneh <ori.livneh@gmail.com>
 Pascal Leroy <phl@google.com>
 Paul Redmond <paul.redmond@gmail.com>
 Pierre Phaneuf <pphaneuf@google.com>
 Radoslav Yovchev <radoslav.tm@gmail.com>
+Raul Marin <rmrodriguez@cartodb.com>
+Ray Glover <ray.glover@uk.ibm.com>
+Robert Guo <robert.guo@mongodb.com>
+Roman Lebedev <lebedev.ri@gmail.com>
+Sayan Bhattacharjee <aero.sayan@gmail.com>
 Shuo Chen <chenshuo@chenshuo.com>
-Yusuke Suzuki <utatane.tea@gmail.com>
+Steven Wan <wan.yu@ibm.com>
 Tobias Ulvgård <tobias.ulvgard@dirac.se>
+Tom Madams <tom.ej.madams@gmail.com> <tmadams@google.com>
+Yixuan Qiu <yixuanq@gmail.com>
+Yusuke Suzuki <utatane.tea@gmail.com>
 Zbigniew Skowron <zbychs@gmail.com>
-Dominik Czarnota <dominik.b.czarnota@gmail.com>
+Min-Yih Hsu <yihshyng223@gmail.com>
diff --git a/METADATA b/METADATA
new file mode 100644
index 0000000..0584c04
--- /dev/null
+++ b/METADATA
@@ -0,0 +1,19 @@
+name: "google-benchmark"
+description: "A library to support the benchmarking of functions, similar to unit-tests."
+third_party {
+  url {
+    type: HOMEPAGE
+    value: "https://github.com/google/benchmark"
+  }
+  url {
+    type: GIT
+    value: "https://github.com/google/benchmark.git"
+  }
+  version: "ea5a5bbff491fd625c6e3458f6edd680b8bd5452"
+  license_type: NOTICE
+  last_upgrade_date {
+    year: 2021
+    month: 2
+    day: 12
+  }
+}
diff --git a/NOTICE b/NOTICE
deleted file mode 120000
index 7a694c9..0000000
--- a/NOTICE
+++ /dev/null
@@ -1 +0,0 @@
-LICENSE
-\ No newline at end of file
diff --git a/OWNERS b/OWNERS
new file mode 100644
index 0000000..7529cb9
--- /dev/null
+++ b/OWNERS
@@ -0,0 +1 @@
+include platform/system/core:/janitors/OWNERS
diff --git a/README.md b/README.md
index a0bcc61..6c09b9d 100644
--- a/README.md
+++ b/README.md
@@ -1,21 +1,166 @@
-# benchmark
+# Benchmark
+
+[![build-and-test](https://github.com/google/benchmark/workflows/build-and-test/badge.svg)](https://github.com/google/benchmark/actions?query=workflow%3Abuild-and-test)
+[![pylint](https://github.com/google/benchmark/workflows/pylint/badge.svg)](https://github.com/google/benchmark/actions?query=workflow%3Apylint)
+[![test-bindings](https://github.com/google/benchmark/workflows/test-bindings/badge.svg)](https://github.com/google/benchmark/actions?query=workflow%3Atest-bindings)
+
 [![Build Status](https://travis-ci.org/google/benchmark.svg?branch=master)](https://travis-ci.org/google/benchmark)
 [![Build status](https://ci.appveyor.com/api/projects/status/u0qsyp7t1tk7cpxs/branch/master?svg=true)](https://ci.appveyor.com/project/google/benchmark/branch/master)
 [![Coverage Status](https://coveralls.io/repos/google/benchmark/badge.svg)](https://coveralls.io/r/google/benchmark)
 
-A library to support the benchmarking of functions, similar to unit-tests.
 
-Discussion group: https://groups.google.com/d/forum/benchmark-discuss
+A library to benchmark code snippets, similar to unit tests. Example:
+
+```c++
+#include <benchmark/benchmark.h>
+
+static void BM_SomeFunction(benchmark::State& state) {
+  // Perform setup here
+  for (auto _ : state) {
+    // This code gets timed
+    SomeFunction();
+  }
+}
+// Register the function as a benchmark
+BENCHMARK(BM_SomeFunction);
+// Run the benchmark
+BENCHMARK_MAIN();
+```
+
+To get started, see [Requirements](#requirements) and
+[Installation](#installation). See [Usage](#usage) for a full example and the
+[User Guide](#user-guide) for a more comprehensive feature overview.
+
+It may also help to read the [Google Test documentation](https://github.com/google/googletest/blob/master/googletest/docs/primer.md)
+as some of the structural aspects of the APIs are similar.
+
+### Resources
+
+[Discussion group](https://groups.google.com/d/forum/benchmark-discuss)
+
+IRC channel: [freenode](https://freenode.net) #googlebenchmark
+
+[Additional Tooling Documentation](docs/tools.md)
+
+[Assembly Testing Documentation](docs/AssemblyTests.md)
+
+## Requirements
+
+The library can be used with C++03. However, it requires C++11 to build,
+including compiler and standard library support.
+
+The following minimum versions are required to build the library:
+
+* GCC 4.8
+* Clang 3.4
+* Visual Studio 14 2015
+* Intel 2015 Update 1
+
+See [Platform-Specific Build Instructions](#platform-specific-build-instructions).
+
+## Installation
+
+This describes the installation process using cmake. As pre-requisites, you'll
+need git and cmake installed.
+
+_See [dependencies.md](dependencies.md) for more details regarding supported
+versions of build tools._
+
+```bash
+# Check out the library.
+$ git clone https://github.com/google/benchmark.git
+# Benchmark requires Google Test as a dependency. Add the source tree as a subdirectory.
+$ git clone https://github.com/google/googletest.git benchmark/googletest
+# Go to the library root directory
+$ cd benchmark
+# Make a build directory to place the build output.
+$ cmake -E make_directory "build"
+# Generate build system files with cmake.
+$ cmake -E chdir "build" cmake -DCMAKE_BUILD_TYPE=Release ../
+# or, starting with CMake 3.13, use a simpler form:
+# cmake -DCMAKE_BUILD_TYPE=Release -S . -B "build"
+# Build the library.
+$ cmake --build "build" --config Release
+```
+This builds the `benchmark` and `benchmark_main` libraries and tests.
+On a unix system, the build directory should now look something like this:
+
+```
+/benchmark
+  /build
+    /src
+      /libbenchmark.a
+      /libbenchmark_main.a
+    /test
+      ...
+```
+
+Next, you can run the tests to check the build.
+
+```bash
+$ cmake -E chdir "build" ctest --build-config Release
+```
+
+If you want to install the library globally, also run:
+
+```
+sudo cmake --build "build" --config Release --target install
+```
+
+Note that Google Benchmark requires Google Test to build and run the tests. This
+dependency can be provided two ways:
+
+* Checkout the Google Test sources into `benchmark/googletest` as above.
+* Otherwise, if `-DBENCHMARK_DOWNLOAD_DEPENDENCIES=ON` is specified during
+  configuration, the library will automatically download and build any required
+  dependencies.
+
+If you do not wish to build and run the tests, add `-DBENCHMARK_ENABLE_GTEST_TESTS=OFF`
+to `CMAKE_ARGS`.
+
+### Debug vs Release
+
+By default, benchmark builds as a debug library. You will see a warning in the
+output when this is the case. To build it as a release library instead, add
+`-DCMAKE_BUILD_TYPE=Release` when generating the build system files, as shown
+above. The use of `--config Release` in build commands is needed to properly
+support multi-configuration tools (like Visual Studio for example) and can be
+skipped for other build systems (like Makefile).
+
+To enable link-time optimisation, also add `-DBENCHMARK_ENABLE_LTO=true` when
+generating the build system files.
+
+If you are using gcc, you might need to set `GCC_AR` and `GCC_RANLIB` cmake
+cache variables, if autodetection fails.
 
-IRC channel: https://freenode.net #googlebenchmark
+If you are using clang, you may need to set `LLVMAR_EXECUTABLE`,
+`LLVMNM_EXECUTABLE` and `LLVMRANLIB_EXECUTABLE` cmake cache variables.
+
+### Stable and Experimental Library Versions
+
+The main branch contains the latest stable version of the benchmarking library,
+the API of which can be considered largely stable, with source-breaking changes
+being made only upon the release of a new major version.
+
+Newer, experimental, features are implemented and tested on the
+[`v2` branch](https://github.com/google/benchmark/tree/v2). Users who wish
+to use, test, and provide feedback on the new features are encouraged to try
+this branch. However, this branch provides no stability guarantees and reserves
+the right to change and break the API at any time.
+
+## Usage
 
-## Example usage
 ### Basic usage
-Define a function that executes the code to be measured.
+
+Define a function that executes the code to measure, register it as a benchmark
+function using the `BENCHMARK` macro, and ensure an appropriate `main` function
+is available:
 
 ```c++
+#include <benchmark/benchmark.h>
+
 static void BM_StringCreation(benchmark::State& state) {
-  while (state.KeepRunning())
+  for (auto _ : state)
     std::string empty_string;
 }
 // Register the function as a benchmark
@@ -24,7 +169,7 @@ BENCHMARK(BM_StringCreation);
 // Define another benchmark
 static void BM_StringCopy(benchmark::State& state) {
   std::string x = "hello";
-  while (state.KeepRunning())
+  for (auto _ : state)
     std::string copy(x);
 }
 BENCHMARK(BM_StringCopy);
@@ -32,7 +177,291 @@ BENCHMARK(BM_StringCopy);
 BENCHMARK_MAIN();
 ```
 
-### Passing arguments
+To run the benchmark, compile and link against the `benchmark` library
+(libbenchmark.a/.so). If you followed the build steps above, this library will 
+be under the build directory you created.
+
+```bash
+# Example on linux after running the build steps above. Assumes the
+# `benchmark` and `build` directories are under the current directory.
+$ g++ mybenchmark.cc -std=c++11 -isystem benchmark/include \
+  -Lbenchmark/build/src -lbenchmark -lpthread -o mybenchmark
+```
+
+Alternatively, link against the `benchmark_main` library and remove
+`BENCHMARK_MAIN();` above to get the same behavior.
+
+The compiled executable will run all benchmarks by default. Pass the `--help`
+flag for option information or see the guide below.
+
+### Usage with CMake
+
+If using CMake, it is recommended to link against the project-provided
+`benchmark::benchmark` and `benchmark::benchmark_main` targets using
+`target_link_libraries`.
+It is possible to use ```find_package``` to import an installed version of the
+library.
+```cmake
+find_package(benchmark REQUIRED)
+```
+Alternatively, ```add_subdirectory``` will incorporate the library directly
+into one's CMake project.
+```cmake
+add_subdirectory(benchmark)
+```
+Either way, link to the library as follows.
+```cmake
+target_link_libraries(MyTarget benchmark::benchmark)
+```
+
+## Platform Specific Build Instructions
+
+### Building with GCC
+
+When the library is built using GCC it is necessary to link with the pthread
+library due to how GCC implements `std::thread`. Failing to link to pthread will
+lead to runtime exceptions (unless you're using libc++), not linker errors. See
+[issue #67](https://github.com/google/benchmark/issues/67) for more details. You
+can link to pthread by adding `-pthread` to your linker command. Note, you can
+also use `-lpthread`, but there are potential issues with ordering of command
+line parameters if you use that.
+
+### Building with Visual Studio 2015 or 2017
+
+The `shlwapi` library (`-lshlwapi`) is required to support a call to `CPUInfo` which reads the registry. Either add `shlwapi.lib` under `[ Configuration Properties > Linker > Input ]`, or use the following:
+
+```
+// Alternatively, can add libraries using linker options.
+#ifdef _WIN32
+#pragma comment ( lib, "Shlwapi.lib" )
+#ifdef _DEBUG
+#pragma comment ( lib, "benchmarkd.lib" )
+#else
+#pragma comment ( lib, "benchmark.lib" )
+#endif
+#endif
+```
+
+You can also use the graphical version of CMake:
+* Open `CMake GUI`.
+* Under `Where to build the binaries`, same path as source plus `build`.
+* Under `CMAKE_INSTALL_PREFIX`, same path as source plus `install`.
+* Click `Configure`, `Generate`, `Open Project`.
+* If build fails, try deleting entire directory and starting again, or unticking options to build less.
+
+### Building with Intel 2015 Update 1 or Intel System Studio Update 4
+
+See instructions for building with Visual Studio. Once built, right click on the solution and change the build to Intel.
+
+### Building on Solaris
+
+If you're running benchmarks on Solaris, you'll want the kstat library linked in
+too (`-lkstat`).
+
+## User Guide
+
+### Command Line
+
+[Output Formats](#output-formats)
+
+[Output Files](#output-files)
+
+[Running Benchmarks](#running-benchmarks)
+
+[Running a Subset of Benchmarks](#running-a-subset-of-benchmarks)
+
+[Result Comparison](#result-comparison)
+
+### Library
+
+[Runtime and Reporting Considerations](#runtime-and-reporting-considerations)
+
+[Passing Arguments](#passing-arguments)
+
+[Calculating Asymptotic Complexity](#asymptotic-complexity)
+
+[Templated Benchmarks](#templated-benchmarks)
+
+[Fixtures](#fixtures)
+
+[Custom Counters](#custom-counters)
+
+[Multithreaded Benchmarks](#multithreaded-benchmarks)
+
+[CPU Timers](#cpu-timers)
+
+[Manual Timing](#manual-timing)
+
+[Setting the Time Unit](#setting-the-time-unit)
+
+[Preventing Optimization](#preventing-optimization)
+
+[Reporting Statistics](#reporting-statistics)
+
+[Custom Statistics](#custom-statistics)
+
+[Using RegisterBenchmark](#using-register-benchmark)
+
+[Exiting with an Error](#exiting-with-an-error)
+
+[A Faster KeepRunning Loop](#a-faster-keep-running-loop)
+
+[Disabling CPU Frequency Scaling](#disabling-cpu-frequency-scaling)
+
+
+<a name="output-formats" />
+
+### Output Formats
+
+The library supports multiple output formats. Use the
+`--benchmark_format=<console|json|csv>` flag (or set the
+`BENCHMARK_FORMAT=<console|json|csv>` environment variable) to set
+the format type. `console` is the default format.
+
+The Console format is intended to be a human readable format. By default
+the format generates color output. Context is output on stderr and the
+tabular data on stdout. Example tabular output looks like:
+
+```
+Benchmark                               Time(ns)    CPU(ns) Iterations
+----------------------------------------------------------------------
+BM_SetInsert/1024/1                        28928      29349      23853  133.097kB/s   33.2742k items/s
+BM_SetInsert/1024/8                        32065      32913      21375  949.487kB/s   237.372k items/s
+BM_SetInsert/1024/10                       33157      33648      21431  1.13369MB/s   290.225k items/s
+```
+
+The JSON format outputs human readable json split into two top level attributes.
+The `context` attribute contains information about the run in general, including
+information about the CPU and the date.
+The `benchmarks` attribute contains a list of every benchmark run. Example json
+output looks like:
+
+```json
+{
+  "context": {
+    "date": "2015/03/17-18:40:25",
+    "num_cpus": 40,
+    "mhz_per_cpu": 2801,
+    "cpu_scaling_enabled": false,
+    "build_type": "debug"
+  },
+  "benchmarks": [
+    {
+      "name": "BM_SetInsert/1024/1",
+      "iterations": 94877,
+      "real_time": 29275,
+      "cpu_time": 29836,
+      "bytes_per_second": 134066,
+      "items_per_second": 33516
+    },
+    {
+      "name": "BM_SetInsert/1024/8",
+      "iterations": 21609,
+      "real_time": 32317,
+      "cpu_time": 32429,
+      "bytes_per_second": 986770,
+      "items_per_second": 246693
+    },
+    {
+      "name": "BM_SetInsert/1024/10",
+      "iterations": 21393,
+      "real_time": 32724,
+      "cpu_time": 33355,
+      "bytes_per_second": 1199226,
+      "items_per_second": 299807
+    }
+  ]
+}
+```
+
+The CSV format outputs comma-separated values. The `context` is output on stderr
+and the CSV itself on stdout. Example CSV output looks like:
+
+```
+name,iterations,real_time,cpu_time,bytes_per_second,items_per_second,label
+"BM_SetInsert/1024/1",65465,17890.7,8407.45,475768,118942,
+"BM_SetInsert/1024/8",116606,18810.1,9766.64,3.27646e+06,819115,
+"BM_SetInsert/1024/10",106365,17238.4,8421.53,4.74973e+06,1.18743e+06,
+```
+
+<a name="output-files" />
+
+### Output Files
+
+Write benchmark results to a file with the `--benchmark_out=<filename>` option
+(or set `BENCHMARK_OUT`). Specify the output format with
+`--benchmark_out_format={json|console|csv}` (or set
+`BENCHMARK_OUT_FORMAT={json|console|csv}`). Note that specifying
+`--benchmark_out` does not suppress the console output.
+
+<a name="running-benchmarks" />
+
+### Running Benchmarks
+
+Benchmarks are executed by running the produced binaries. Benchmark binaries,
+by default, accept options that may be specified either through their command
+line interface or by setting environment variables before execution. For every
+`--option_flag=<value>` CLI switch, a corresponding environment variable
+`OPTION_FLAG=<value>` exists and is used as default if set (CLI switches always
+prevail). A complete list of CLI options is available by running benchmarks
+with the `--help` switch.
+
+<a name="running-a-subset-of-benchmarks" />
+
+### Running a Subset of Benchmarks
+
+The `--benchmark_filter=<regex>` option (or `BENCHMARK_FILTER=<regex>`
+environment variable) can be used to only run the benchmarks that match
+the specified `<regex>`. For example:
+
+```bash
+$ ./run_benchmarks.x --benchmark_filter=BM_memcpy/32
+Run on (1 X 2300 MHz CPU )
+2016-06-25 19:34:24
+Benchmark              Time           CPU Iterations
+----------------------------------------------------
+BM_memcpy/32          11 ns         11 ns   79545455
+BM_memcpy/32k       2181 ns       2185 ns     324074
+BM_memcpy/32          12 ns         12 ns   54687500
+BM_memcpy/32k       1834 ns       1837 ns     357143
+```
+
+<a name="result-comparison" />
+
+### Result Comparison
+
+It is possible to compare the benchmarking results.
+See [Additional Tooling Documentation](docs/tools.md)
+
+<a name="runtime-and-reporting-considerations" />
+
+### Runtime and Reporting Considerations
+
+When the benchmark binary is executed, each benchmark function is run serially.
+The number of iterations to run is determined dynamically by running the
+benchmark a few times and measuring the time taken and ensuring that the
+ultimate result will be statistically stable. As such, faster benchmark
+functions will be run for more iterations than slower benchmark functions, and
+the number of iterations is thus reported.
+
+In all cases, the number of iterations for which the benchmark is run is
+governed by the amount of time the benchmark takes. Concretely, the number of
+iterations is at least one, not more than 1e9, until CPU time is greater than
+the minimum time, or the wallclock time is 5x minimum time. The minimum time is
+set per benchmark by calling `MinTime` on the registered benchmark object.
+
+Average timings are then reported over the iterations run. If multiple
+repetitions are requested using the `--benchmark_repetitions` command-line
+option, or at registration time, the benchmark function will be run several
+times and statistical results across these repetitions will also be reported.
+
+As well as the per-benchmark entries, a preamble in the report will include
+information about the machine on which the benchmarks are run.
+
+<a name="passing-arguments" />
+
+### Passing Arguments
+
 Sometimes a family of benchmarks can be implemented with just one routine that
 takes an extra argument to specify which one of the family of benchmarks to
 run. For example, the following code defines a family of benchmarks for
@@ -40,13 +469,13 @@ measuring the speed of `memcpy()` calls of different lengths:
 
 ```c++
 static void BM_memcpy(benchmark::State& state) {
-  char* src = new char[state.range_x()];
-  char* dst = new char[state.range_x()];
-  memset(src, 'x', state.range_x());
-  while (state.KeepRunning())
-    memcpy(dst, src, state.range_x());
+  char* src = new char[state.range(0)];
+  char* dst = new char[state.range(0)];
+  memset(src, 'x', state.range(0));
+  for (auto _ : state)
+    memcpy(dst, src, state.range(0));
   state.SetBytesProcessed(int64_t(state.iterations()) *
-                          int64_t(state.range_x()));
+                          int64_t(state.range(0)));
   delete[] src;
   delete[] dst;
 }
@@ -68,31 +497,50 @@ range multiplier is changed to multiples of two.
 ```c++
 BENCHMARK(BM_memcpy)->RangeMultiplier(2)->Range(8, 8<<10);
 ```
+
 Now arguments generated are [ 8, 16, 32, 64, 128, 256, 512, 1024, 2k, 4k, 8k ].
 
-You might have a benchmark that depends on two inputs. For example, the
+The preceding code shows a method of defining a sparse range.  The following
+example shows a method of defining a dense range. It is then used to benchmark
+the performance of `std::vector` initialization for uniformly increasing sizes.
+
+```c++
+static void BM_DenseRange(benchmark::State& state) {
+  for(auto _ : state) {
+    std::vector<int> v(state.range(0), state.range(0));
+    benchmark::DoNotOptimize(v.data());
+    benchmark::ClobberMemory();
+  }
+}
+BENCHMARK(BM_DenseRange)->DenseRange(0, 1024, 128);
+```
+
+Now arguments generated are [ 0, 128, 256, 384, 512, 640, 768, 896, 1024 ].
+
+You might have a benchmark that depends on two or more inputs. For example, the
 following code defines a family of benchmarks for measuring the speed of set
 insertion.
 
 ```c++
 static void BM_SetInsert(benchmark::State& state) {
-  while (state.KeepRunning()) {
+  std::set<int> data;
+  for (auto _ : state) {
     state.PauseTiming();
-    std::set<int> data = ConstructRandomSet(state.range_x());
+    data = ConstructRandomSet(state.range(0));
     state.ResumeTiming();
-    for (int j = 0; j < state.range_y(); ++j)
+    for (int j = 0; j < state.range(1); ++j)
       data.insert(RandomNumber());
   }
 }
 BENCHMARK(BM_SetInsert)
-    ->ArgPair(1<<10, 1)
-    ->ArgPair(1<<10, 8)
-    ->ArgPair(1<<10, 64)
-    ->ArgPair(1<<10, 512)
-    ->ArgPair(8<<10, 1)
-    ->ArgPair(8<<10, 8)
-    ->ArgPair(8<<10, 64)
-    ->ArgPair(8<<10, 512);
+    ->Args({1<<10, 128})
+    ->Args({2<<10, 128})
+    ->Args({4<<10, 128})
+    ->Args({8<<10, 128})
+    ->Args({1<<10, 512})
+    ->Args({2<<10, 512})
+    ->Args({4<<10, 512})
+    ->Args({8<<10, 512});
 ```
 
 The preceding code is quite repetitive, and can be replaced with the following
@@ -101,7 +549,30 @@ product of the two specified ranges and will generate a benchmark for each such
 pair.
 
 ```c++
-BENCHMARK(BM_SetInsert)->RangePair(1<<10, 8<<10, 1, 512);
+BENCHMARK(BM_SetInsert)->Ranges({{1<<10, 8<<10}, {128, 512}});
+```
+
+Some benchmarks may require specific argument values that cannot be expressed
+with `Ranges`. In this case, `ArgsProduct` offers the ability to generate a
+benchmark input for each combination in the product of the supplied vectors.
+
+```c++
+BENCHMARK(BM_SetInsert)
+    ->ArgsProduct({{1<<10, 3<<10, 8<<10}, {20, 40, 60, 80}})
+// would generate the same benchmark arguments as
+BENCHMARK(BM_SetInsert)
+    ->Args({1<<10, 20})
+    ->Args({3<<10, 20})
+    ->Args({8<<10, 20})
+    ->Args({3<<10, 40})
+    ->Args({8<<10, 40})
+    ->Args({1<<10, 40})
+    ->Args({1<<10, 60})
+    ->Args({3<<10, 60})
+    ->Args({8<<10, 60})
+    ->Args({1<<10, 80})
+    ->Args({3<<10, 80})
+    ->Args({8<<10, 80});
 ```
 
 For more complex patterns of inputs, passing a custom function to `Apply` allows
@@ -113,24 +584,49 @@ and a sparse range on the second.
 static void CustomArguments(benchmark::internal::Benchmark* b) {
   for (int i = 0; i <= 10; ++i)
     for (int j = 32; j <= 1024*1024; j *= 8)
-      b->ArgPair(i, j);
+      b->Args({i, j});
 }
 BENCHMARK(BM_SetInsert)->Apply(CustomArguments);
 ```
 
-### Calculate asymptotic complexity (Big O)
+#### Passing Arbitrary Arguments to a Benchmark
+
+In C++11 it is possible to define a benchmark that takes an arbitrary number
+of extra arguments. The `BENCHMARK_CAPTURE(func, test_case_name, ...args)`
+macro creates a benchmark that invokes `func`  with the `benchmark::State` as
+the first argument followed by the specified `args...`.
+The `test_case_name` is appended to the name of the benchmark and
+should describe the values passed.
+
+```c++
+template <class ...ExtraArgs>
+void BM_takes_args(benchmark::State& state, ExtraArgs&&... extra_args) {
+  [...]
+}
+// Registers a benchmark named "BM_takes_args/int_string_test" that passes
+// the specified values to `extra_args`.
+BENCHMARK_CAPTURE(BM_takes_args, int_string_test, 42, std::string("abc"));
+```
+
+Note that elements of `...args` may refer to global variables. Users should
+avoid modifying global state inside of a benchmark.
+
+<a name="asymptotic-complexity" />
+
+### Calculating Asymptotic Complexity (Big O)
+
 Asymptotic complexity might be calculated for a family of benchmarks. The
 following code will calculate the coefficient for the high-order term in the
 running time and the normalized root-mean square error of string comparison.
 
 ```c++
 static void BM_StringCompare(benchmark::State& state) {
-  std::string s1(state.range_x(), '-');
-  std::string s2(state.range_x(), '-');
-  while (state.KeepRunning()) {
+  std::string s1(state.range(0), '-');
+  std::string s2(state.range(0), '-');
+  for (auto _ : state) {
     benchmark::DoNotOptimize(s1.compare(s2));
   }
-  state.SetComplexityN(state.range_x());
+  state.SetComplexityN(state.range(0));
 }
 BENCHMARK(BM_StringCompare)
     ->RangeMultiplier(2)->Range(1<<10, 1<<18)->Complexity(benchmark::oN);
@@ -149,27 +645,29 @@ that might be used to customize high-order term calculation.
 
 ```c++
 BENCHMARK(BM_StringCompare)->RangeMultiplier(2)
-    ->Range(1<<10, 1<<18)->Complexity([](int n)->double{return n; });
+    ->Range(1<<10, 1<<18)->Complexity([](benchmark::IterationCount n)->double{return n; });
 ```
 
-### Templated benchmarks
-Templated benchmarks work the same way: This example produces and consumes
-messages of size `sizeof(v)` `range_x` times. It also outputs throughput in the
-absence of multiprogramming.
+<a name="templated-benchmarks" />
+
+### Templated Benchmarks
+
+This example produces and consumes messages of size `sizeof(v)` `state.range(0)`
+times. It also outputs throughput in the absence of multiprogramming.
 
 ```c++
-template <class Q> int BM_Sequential(benchmark::State& state) {
+template <class Q> void BM_Sequential(benchmark::State& state) {
   Q q;
   typename Q::value_type v;
-  while (state.KeepRunning()) {
-    for (int i = state.range_x(); i--; )
+  for (auto _ : state) {
+    for (int i = state.range(0); i--; )
       q.push(v);
-    for (int e = state.range_x(); e--; )
+    for (int e = state.range(0); e--; )
       q.Wait(&v);
   }
   // actually messages, not bytes:
   state.SetBytesProcessed(
-      static_cast<int64_t>(state.iterations())*state.range_x());
+      static_cast<int64_t>(state.iterations())*state.range(0));
 }
 BENCHMARK_TEMPLATE(BM_Sequential, WaitQueue<int>)->Range(1<<0, 1<<10);
 ```
@@ -177,7 +675,7 @@ BENCHMARK_TEMPLATE(BM_Sequential, WaitQueue<int>)->Range(1<<0, 1<<10);
 Three macros are provided for adding benchmark templates.
 
 ```c++
-#if __cplusplus >= 201103L // C++11 and greater.
+#ifdef BENCHMARK_HAS_CXX11
 #define BENCHMARK_TEMPLATE(func, ...) // Takes any number of parameters.
 #else // C++ < C++11
 #define BENCHMARK_TEMPLATE(func, arg1)
@@ -186,31 +684,223 @@ Three macros are provided for adding benchmark templates.
 #define BENCHMARK_TEMPLATE2(func, arg1, arg2)
 ```
 
-## Passing arbitrary arguments to a benchmark
-In C++11 it is possible to define a benchmark that takes an arbitrary number
-of extra arguments. The `BENCHMARK_CAPTURE(func, test_case_name, ...args)`
-macro creates a benchmark that invokes `func`  with the `benchmark::State` as
-the first argument followed by the specified `args...`.
-The `test_case_name` is appended to the name of the benchmark and
-should describe the values passed.
+<a name="fixtures" />
+
+### Fixtures
+
+Fixture tests are created by first defining a type that derives from
+`::benchmark::Fixture` and then creating/registering the tests using the
+following macros:
+
+* `BENCHMARK_F(ClassName, Method)`
+* `BENCHMARK_DEFINE_F(ClassName, Method)`
+* `BENCHMARK_REGISTER_F(ClassName, Method)`
+
+For Example:
 
 ```c++
-template <class ...ExtraArgs>`
-void BM_takes_args(benchmark::State& state, ExtraArgs&&... extra_args) {
-  [...]
+class MyFixture : public benchmark::Fixture {
+public:
+  void SetUp(const ::benchmark::State& state) {
+  }
+
+  void TearDown(const ::benchmark::State& state) {
+  }
+};
+
+BENCHMARK_F(MyFixture, FooTest)(benchmark::State& st) {
+   for (auto _ : st) {
+     ...
+  }
 }
-// Registers a benchmark named "BM_takes_args/int_string_test` that passes
-// the specified values to `extra_args`.
-BENCHMARK_CAPTURE(BM_takes_args, int_string_test, 42, std::string("abc"));
+
+BENCHMARK_DEFINE_F(MyFixture, BarTest)(benchmark::State& st) {
+   for (auto _ : st) {
+     ...
+  }
+}
+/* BarTest is NOT registered */
+BENCHMARK_REGISTER_F(MyFixture, BarTest)->Threads(2);
+/* BarTest is now registered */
 ```
-Note that elements of `...args` may refer to global variables. Users should
-avoid modifying global state inside of a benchmark.
 
-### Multithreaded benchmarks
+#### Templated Fixtures
+
+You can also create templated fixtures by using the following macros:
+
+* `BENCHMARK_TEMPLATE_F(ClassName, Method, ...)`
+* `BENCHMARK_TEMPLATE_DEFINE_F(ClassName, Method, ...)`
+
+For example:
+
+```c++
+template<typename T>
+class MyFixture : public benchmark::Fixture {};
+
+BENCHMARK_TEMPLATE_F(MyFixture, IntTest, int)(benchmark::State& st) {
+   for (auto _ : st) {
+     ...
+  }
+}
+
+BENCHMARK_TEMPLATE_DEFINE_F(MyFixture, DoubleTest, double)(benchmark::State& st) {
+   for (auto _ : st) {
+     ...
+  }
+}
+
+BENCHMARK_REGISTER_F(MyFixture, DoubleTest)->Threads(2);
+```
+
+<a name="custom-counters" />
+
+### Custom Counters
+
+You can add your own counters with user-defined names. The example below
+will add columns "Foo", "Bar" and "Baz" in its output:
+
+```c++
+static void UserCountersExample1(benchmark::State& state) {
+  double numFoos = 0, numBars = 0, numBazs = 0;
+  for (auto _ : state) {
+    // ... count Foo,Bar,Baz events
+  }
+  state.counters["Foo"] = numFoos;
+  state.counters["Bar"] = numBars;
+  state.counters["Baz"] = numBazs;
+}
+```
+
+The `state.counters` object is a `std::map` with `std::string` keys
+and `Counter` values. The latter is a `double`-like class, via an implicit
+conversion to `double&`. Thus you can use all of the standard arithmetic
+assignment operators (`=,+=,-=,*=,/=`) to change the value of each counter.
+
+In multithreaded benchmarks, each counter is set on the calling thread only.
+When the benchmark finishes, the counters from each thread will be summed;
+the resulting sum is the value which will be shown for the benchmark.
+
+The `Counter` constructor accepts three parameters: the value as a `double`;
+a bit flag which allows you to show counters as rates, and/or as per-thread
+iteration, and/or as per-thread averages, and/or iteration invariants,
+and/or finally inverting the result; and a flag specifying the 'unit', i.e.
+whether 1k means 1000 (default, `benchmark::Counter::OneK::kIs1000`) or 1024
+(`benchmark::Counter::OneK::kIs1024`).
+
+```c++
+  // sets a simple counter
+  state.counters["Foo"] = numFoos;
+
+  // Set the counter as a rate. It will be presented divided
+  // by the duration of the benchmark.
+  // Meaning: per one second, how many 'foo's are processed?
+  state.counters["FooRate"] = Counter(numFoos, benchmark::Counter::kIsRate);
+
+  // Set the counter as a rate. It will be presented divided
+  // by the duration of the benchmark, and the result inverted.
+  // Meaning: how many seconds it takes to process one 'foo'?
+  state.counters["FooInvRate"] = Counter(numFoos, benchmark::Counter::kIsRate | benchmark::Counter::kInvert);
+
+  // Set the counter as a thread-average quantity. It will
+  // be presented divided by the number of threads.
+  state.counters["FooAvg"] = Counter(numFoos, benchmark::Counter::kAvgThreads);
+
+  // There's also a combined flag:
+  state.counters["FooAvgRate"] = Counter(numFoos,benchmark::Counter::kAvgThreadsRate);
+
+  // This says that we process with the rate of state.range(0) bytes every iteration:
+  state.counters["BytesProcessed"] = Counter(state.range(0), benchmark::Counter::kIsIterationInvariantRate, benchmark::Counter::OneK::kIs1024);
+```
+
+When you're compiling in C++11 mode or later you can use `insert()` with
+`std::initializer_list`:
+
+```c++
+  // With C++11, this can be done:
+  state.counters.insert({{"Foo", numFoos}, {"Bar", numBars}, {"Baz", numBazs}});
+  // ... instead of:
+  state.counters["Foo"] = numFoos;
+  state.counters["Bar"] = numBars;
+  state.counters["Baz"] = numBazs;
+```
+
+#### Counter Reporting
+
+When using the console reporter, by default, user counters are printed at
+the end after the table, the same way as ``bytes_processed`` and
+``items_processed``. This is best for cases in which there are few counters,
+or where there are only a couple of lines per benchmark. Here's an example of
+the default output:
+
+```
+------------------------------------------------------------------------------
+Benchmark                        Time           CPU Iterations UserCounters...
+------------------------------------------------------------------------------
+BM_UserCounter/threads:8      2248 ns      10277 ns      68808 Bar=16 Bat=40 Baz=24 Foo=8
+BM_UserCounter/threads:1      9797 ns       9788 ns      71523 Bar=2 Bat=5 Baz=3 Foo=1024m
+BM_UserCounter/threads:2      4924 ns       9842 ns      71036 Bar=4 Bat=10 Baz=6 Foo=2
+BM_UserCounter/threads:4      2589 ns      10284 ns      68012 Bar=8 Bat=20 Baz=12 Foo=4
+BM_UserCounter/threads:8      2212 ns      10287 ns      68040 Bar=16 Bat=40 Baz=24 Foo=8
+BM_UserCounter/threads:16     1782 ns      10278 ns      68144 Bar=32 Bat=80 Baz=48 Foo=16
+BM_UserCounter/threads:32     1291 ns      10296 ns      68256 Bar=64 Bat=160 Baz=96 Foo=32
+BM_UserCounter/threads:4      2615 ns      10307 ns      68040 Bar=8 Bat=20 Baz=12 Foo=4
+BM_Factorial                    26 ns         26 ns   26608979 40320
+BM_Factorial/real_time          26 ns         26 ns   26587936 40320
+BM_CalculatePiRange/1           16 ns         16 ns   45704255 0
+BM_CalculatePiRange/8           73 ns         73 ns    9520927 3.28374
+BM_CalculatePiRange/64         609 ns        609 ns    1140647 3.15746
+BM_CalculatePiRange/512       4900 ns       4901 ns     142696 3.14355
+```
+
+If this doesn't suit you, you can print each counter as a table column by
+passing the flag `--benchmark_counters_tabular=true` to the benchmark
+application. This is best for cases in which there are a lot of counters, or
+a lot of lines per individual benchmark. Note that this will trigger a
+reprinting of the table header any time the counter set changes between
+individual benchmarks. Here's an example of corresponding output when
+`--benchmark_counters_tabular=true` is passed:
+
+```
+---------------------------------------------------------------------------------------
+Benchmark                        Time           CPU Iterations    Bar   Bat   Baz   Foo
+---------------------------------------------------------------------------------------
+BM_UserCounter/threads:8      2198 ns       9953 ns      70688     16    40    24     8
+BM_UserCounter/threads:1      9504 ns       9504 ns      73787      2     5     3     1
+BM_UserCounter/threads:2      4775 ns       9550 ns      72606      4    10     6     2
+BM_UserCounter/threads:4      2508 ns       9951 ns      70332      8    20    12     4
+BM_UserCounter/threads:8      2055 ns       9933 ns      70344     16    40    24     8
+BM_UserCounter/threads:16     1610 ns       9946 ns      70720     32    80    48    16
+BM_UserCounter/threads:32     1192 ns       9948 ns      70496     64   160    96    32
+BM_UserCounter/threads:4      2506 ns       9949 ns      70332      8    20    12     4
+--------------------------------------------------------------
+Benchmark                        Time           CPU Iterations
+--------------------------------------------------------------
+BM_Factorial                    26 ns         26 ns   26392245 40320
+BM_Factorial/real_time          26 ns         26 ns   26494107 40320
+BM_CalculatePiRange/1           15 ns         15 ns   45571597 0
+BM_CalculatePiRange/8           74 ns         74 ns    9450212 3.28374
+BM_CalculatePiRange/64         595 ns        595 ns    1173901 3.15746
+BM_CalculatePiRange/512       4752 ns       4752 ns     147380 3.14355
+BM_CalculatePiRange/4k       37970 ns      37972 ns      18453 3.14184
+BM_CalculatePiRange/32k     303733 ns     303744 ns       2305 3.14162
+BM_CalculatePiRange/256k   2434095 ns    2434186 ns        288 3.1416
+BM_CalculatePiRange/1024k  9721140 ns    9721413 ns         71 3.14159
+BM_CalculatePi/threads:8      2255 ns       9943 ns      70936
+```
+
+Note above the additional header printed when the benchmark changes from
+``BM_UserCounter`` to ``BM_Factorial``. This is because ``BM_Factorial`` does
+not have the same counter set as ``BM_UserCounter``.
+
+<a name="multithreaded-benchmarks"/>
+
+### Multithreaded Benchmarks
+
 In a multithreaded test (benchmark invoked by multiple threads simultaneously),
-it is guaranteed that none of the threads will start until all have called
-`KeepRunning`, and all will have finished before KeepRunning returns false. As
-such, any global setup or teardown can be wrapped in a check against the thread
+it is guaranteed that none of the threads will start until all have reached
+the start of the benchmark loop, and all will have finished before any thread
+exits the benchmark loop. (This behavior is also provided by the `KeepRunning()`
+API.) As such, any global setup or teardown can be wrapped in a check against the thread
 index:
 
 ```c++
@@ -218,7 +908,7 @@ static void BM_MultiThreaded(benchmark::State& state) {
   if (state.thread_index == 0) {
     // Setup code here.
   }
-  while (state.KeepRunning()) {
+  for (auto _ : state) {
     // Run the test as normal.
   }
   if (state.thread_index == 0) {
@@ -238,14 +928,84 @@ BENCHMARK(BM_test)->Range(8, 8<<10)->UseRealTime();
 
 Without `UseRealTime`, CPU time is used by default.
 
+<a name="cpu-timers" />
+
+### CPU Timers
+
+By default, the CPU timer only measures the time spent by the main thread.
+If the benchmark itself uses threads internally, this measurement may not
+be what you are looking for. Instead, there is a way to measure the total
+CPU usage of the process, by all the threads.
+
+```c++
+void callee(int i);
+
+static void MyMain(int size) {
+#pragma omp parallel for
+  for(int i = 0; i < size; i++)
+    callee(i);
+}
+
+static void BM_OpenMP(benchmark::State& state) {
+  for (auto _ : state)
+    MyMain(state.range(0));
+}
+
+// Measure the time spent by the main thread, use it to decide for how long to
+// run the benchmark loop. Depending on internal implementation details, this
+// may measure anywhere from near-zero (the overhead spent before/after work
+// handoff to worker thread[s]) to the whole single-thread time.
+BENCHMARK(BM_OpenMP)->Range(8, 8<<10);
+
+// Measure the user-visible time, the wall clock (literally, the time that
+// has passed on the clock on the wall), use it to decide for how long to
+// run the benchmark loop. This will always be meaningful, and will match the
+// time spent by the main thread in single-threaded case, in general decreasing
+// with the number of internal threads doing the work.
+BENCHMARK(BM_OpenMP)->Range(8, 8<<10)->UseRealTime();
+
+// Measure the total CPU consumption, use it to decide for how long to
+// run the benchmark loop. This will always measure to no less than the
+// time spent by the main thread in single-threaded case.
+BENCHMARK(BM_OpenMP)->Range(8, 8<<10)->MeasureProcessCPUTime();
+
+// A mixture of the last two. Measure the total CPU consumption, but use the
+// wall clock to decide for how long to run the benchmark loop.
+BENCHMARK(BM_OpenMP)->Range(8, 8<<10)->MeasureProcessCPUTime()->UseRealTime();
+```
+
+#### Controlling Timers
+
+Normally, the entire duration of the work loop (`for (auto _ : state) {}`)
+is measured. But sometimes, it is necessary to do some work inside of
+that loop, every iteration, but without counting that time to the benchmark time.
+That is possible, although it is not recommended, since it has high overhead.
+
+```c++
+static void BM_SetInsert_With_Timer_Control(benchmark::State& state) {
+  std::set<int> data;
+  for (auto _ : state) {
+    state.PauseTiming(); // Stop timers. They will not count until they are resumed.
+    data = ConstructRandomSet(state.range(0)); // Do something that should not be measured
+    state.ResumeTiming(); // And resume timers. They are now counting again.
+    // The rest will be measured.
+    for (int j = 0; j < state.range(1); ++j)
+      data.insert(RandomNumber());
+  }
+}
+BENCHMARK(BM_SetInsert_With_Timer_Control)->Ranges({{1<<10, 8<<10}, {128, 512}});
+```
+
+<a name="manual-timing" />
+
+### Manual Timing
 
-## Manual timing
 For benchmarking something for which neither CPU time nor real-time are
 correct or accurate enough, completely manual timing is supported using
-the `UseManualTime` function. 
+the `UseManualTime` function.
 
 When `UseManualTime` is used, the benchmarked code must call
-`SetIterationTime` once per iteration of the `KeepRunning` loop to
+`SetIterationTime` once per iteration of the benchmark loop to
 report the manually measured time.
 
 An example use case for this is benchmarking GPU execution (e.g. OpenCL
@@ -256,16 +1016,16 @@ can be reported back with `SetIterationTime`.
 
 ```c++
 static void BM_ManualTiming(benchmark::State& state) {
-  int microseconds = state.range_x();
+  int microseconds = state.range(0);
   std::chrono::duration<double, std::micro> sleep_duration {
     static_cast<double>(microseconds)
   };
 
-  while (state.KeepRunning()) {
+  for (auto _ : state) {
     auto start = std::chrono::high_resolution_clock::now();
     // Simulate some useful workload with a sleep
     std::this_thread::sleep_for(sleep_duration);
-    auto end   = std::chrono::high_resolution_clock::now();
+    auto end = std::chrono::high_resolution_clock::now();
 
     auto elapsed_seconds =
       std::chrono::duration_cast<std::chrono::duration<double>>(
@@ -277,14 +1037,29 @@ static void BM_ManualTiming(benchmark::State& state) {
 BENCHMARK(BM_ManualTiming)->Range(1, 1<<17)->UseManualTime();
 ```
 
-### Preventing optimisation
+<a name="setting-the-time-unit" />
+
+### Setting the Time Unit
+
+If a benchmark runs for a few milliseconds it may be hard to visually compare
+the measured times, since the output data is given in nanoseconds by default.
+To change the time unit, specify it explicitly when registering the benchmark:
+
+```c++
+BENCHMARK(BM_test)->Unit(benchmark::kMillisecond);
+```
+
+<a name="preventing-optimization" />
+
+### Preventing Optimization
+
 To prevent a value or expression from being optimized away by the compiler
 the `benchmark::DoNotOptimize(...)` and `benchmark::ClobberMemory()`
 functions can be used.
 
 ```c++
 static void BM_test(benchmark::State& state) {
-  while (state.KeepRunning()) {
+  for (auto _ : state) {
       int x = 0;
       for (int i=0; i < 64; ++i) {
         benchmark::DoNotOptimize(x += i);
@@ -323,7 +1098,7 @@ away.
 
 ```c++
 static void BM_vector_push_back(benchmark::State& state) {
-  while (state.KeepRunning()) {
+  for (auto _ : state) {
     std::vector<int> v;
     v.reserve(1);
     benchmark::DoNotOptimize(v.data()); // Allow v.data() to be clobbered.
@@ -333,26 +1108,12 @@ static void BM_vector_push_back(benchmark::State& state) {
 }
 ```
 
-Note that `ClobberMemory()` is only available for GNU based compilers.
+Note that `ClobberMemory()` is only available for GNU or MSVC based compilers.
 
-### Set time unit manually
-If a benchmark runs a few milliseconds it may be hard to visually compare the
-measured times, since the output data is given in nanoseconds per default. In
-order to manually set the time unit, you can specify it manually:
+<a name="reporting-statistics" />
 
-```c++
-BENCHMARK(BM_test)->Unit(benchmark::kMillisecond);
-```
+### Statistics: Reporting the Mean, Median and Standard Deviation of Repeated Benchmarks
 
-## Controlling number of iterations
-In all cases, the number of iterations for which the benchmark is run is
-governed by the amount of time the benchmark takes. Concretely, the number of
-iterations is at least one, not more than 1e9, until CPU time is greater than
-the minimum time, or the wallclock time is 5x minimum time. The minimum time is
-set as a flag `--benchmark_min_time` or per benchmark by calling `MinTime` on
-the registered benchmark object.
-
-## Reporting the mean and standard devation by repeated benchmarks
 By default each benchmark is run once and that single result is reported.
 However benchmarks are often noisy and a single result may not be representative
 of the overall behavior. For this reason it's possible to repeatedly rerun the
@@ -360,50 +1121,95 @@ benchmark.
 
 The number of runs of each benchmark is specified globally by the
 `--benchmark_repetitions` flag or on a per benchmark basis by calling
-`Repetitions` on the registered benchmark object. When a benchmark is run
-more than once the mean and standard deviation of the runs will be reported.
+`Repetitions` on the registered benchmark object. When a benchmark is run more
+than once the mean, median and standard deviation of the runs will be reported.
+
+Additionally the `--benchmark_report_aggregates_only={true|false}`,
+`--benchmark_display_aggregates_only={true|false}` flags or
+`ReportAggregatesOnly(bool)`, `DisplayAggregatesOnly(bool)` functions can be
+used to change how repeated tests are reported. By default the result of each
+repeated run is reported. When the `report aggregates only` option is `true`,
+only the aggregates (i.e. mean, median and standard deviation, plus complexity
+measurements if they were requested) of the runs are reported, to both
+reporters: standard output (console) and the file.
+However when only the `display aggregates only` option is `true`,
+only the aggregates are displayed in the standard output, while the file
+output still contains everything.
+Calling `ReportAggregatesOnly(bool)` / `DisplayAggregatesOnly(bool)` on a
+registered benchmark object overrides the value of the appropriate flag for that
+benchmark.
 
-## Fixtures
-Fixture tests are created by
-first defining a type that derives from ::benchmark::Fixture and then
-creating/registering the tests using the following macros:
+<a name="custom-statistics" />
 
-* `BENCHMARK_F(ClassName, Method)`
-* `BENCHMARK_DEFINE_F(ClassName, Method)`
-* `BENCHMARK_REGISTER_F(ClassName, Method)`
+### Custom Statistics
 
-For Example:
+While having mean, median and standard deviation is nice, this may not be
+enough for everyone. For example you may want to know what the largest
+observation is, e.g. because you have some real-time constraints. This is easy.
+The following code will specify a custom statistic to be calculated, defined
+by a lambda function.
 
 ```c++
-class MyFixture : public benchmark::Fixture {};
-
-BENCHMARK_F(MyFixture, FooTest)(benchmark::State& st) {
-   while (st.KeepRunning()) {
-     ...
+void BM_spin_empty(benchmark::State& state) {
+  for (auto _ : state) {
+    for (int x = 0; x < state.range(0); ++x) {
+      benchmark::DoNotOptimize(x);
+    }
   }
 }
 
-BENCHMARK_DEFINE_F(MyFixture, BarTest)(benchmark::State& st) {
-   while (st.KeepRunning()) {
-     ...
-  }
+BENCHMARK(BM_spin_empty)
+  ->ComputeStatistics("max", [](const std::vector<double>& v) -> double {
+    return *(std::max_element(std::begin(v), std::end(v)));
+  })
+  ->Arg(512);
+```
+
+<a name="using-register-benchmark" />
+
+### Using RegisterBenchmark(name, fn, args...)
+
+The `RegisterBenchmark(name, func, args...)` function provides an alternative
+way to create and register benchmarks.
+`RegisterBenchmark(name, func, args...)` creates, registers, and returns a
+pointer to a new benchmark with the specified `name` that invokes
+`func(st, args...)` where `st` is a `benchmark::State` object.
+
+Unlike the `BENCHMARK` registration macros, which can only be used at the global
+scope, the `RegisterBenchmark` can be called anywhere. This allows for
+benchmark tests to be registered programmatically.
+
+Additionally `RegisterBenchmark` allows any callable object to be registered
+as a benchmark, including capturing lambdas and function objects.
+
+For Example:
+```c++
+auto BM_test = [](benchmark::State& st, auto Inputs) { /* ... */ };
+
+int main(int argc, char** argv) {
+  for (auto& test_input : { /* ... */ })
+      benchmark::RegisterBenchmark(test_input.name(), BM_test, test_input);
+  benchmark::Initialize(&argc, argv);
+  benchmark::RunSpecifiedBenchmarks();
 }
-/* BarTest is NOT registered */
-BENCHMARK_REGISTER_F(MyFixture, BarTest)->Threads(2);
-/* BarTest is now registered */
 ```
 
-## Exiting Benchmarks in Error
+<a name="exiting-with-an-error" />
+
+### Exiting with an Error
 
 When errors caused by external influences, such as file I/O and network
 communication, occur within a benchmark the
 `State::SkipWithError(const char* msg)` function can be used to skip that run
 of benchmark and report the error. Note that only future iterations of the
-`KeepRunning()` are skipped. Users may explicitly return to exit the
-benchmark immediately.
+`KeepRunning()` loop are skipped. For the ranged-for version of the benchmark loop,
+users must explicitly exit the loop; otherwise all iterations will be performed.
+Users may explicitly return to exit the benchmark immediately.
 
 The `SkipWithError(...)` function may be used at any point within the benchmark,
-including before and after the `KeepRunning()` loop.
+including before and after the benchmark loop. Moreover, if `SkipWithError(...)`
+has been used, it is not required to reach the benchmark loop and one may return
+from the benchmark function early.
 
 For example:
 
@@ -411,100 +1217,107 @@ For example:
 static void BM_test(benchmark::State& state) {
   auto resource = GetResource();
   if (!resource.good()) {
-      state.SkipWithError("Resource is not good!");
-      // KeepRunning() loop will not be entered.
+    state.SkipWithError("Resource is not good!");
+    // KeepRunning() loop will not be entered.
   }
   while (state.KeepRunning()) {
-      auto data = resource.read_data();
-      if (!resource.good()) {
-        state.SkipWithError("Failed to read data!");
-        break; // Needed to skip the rest of the iteration.
-     }
-     do_stuff(data);
+    auto data = resource.read_data();
+    if (!resource.good()) {
+      state.SkipWithError("Failed to read data!");
+      break; // Needed to skip the rest of the iteration.
+    }
+    do_stuff(data);
+  }
+}
+
+static void BM_test_ranged_fo(benchmark::State & state) {
+  auto resource = GetResource();
+  if (!resource.good()) {
+    state.SkipWithError("Resource is not good!");
+    return; // Early return is allowed when SkipWithError() has been used.
+  }
+  for (auto _ : state) {
+    auto data = resource.read_data();
+    if (!resource.good()) {
+      state.SkipWithError("Failed to read data!");
+      break; // REQUIRED to prevent all further iterations.
+    }
+    do_stuff(data);
   }
 }
 ```
+<a name="a-faster-keep-running-loop" />
 
-## Output Formats
-The library supports multiple output formats. Use the
-`--benchmark_format=<tabular|json|csv>` flag to set the format type. `tabular` is
-the default format.
+### A Faster KeepRunning Loop
 
-The Tabular format is intended to be a human readable format. By default
-the format generates color output. Context is output on stderr and the 
-tabular data on stdout. Example tabular output looks like:
-```
-Benchmark                               Time(ns)    CPU(ns) Iterations
-----------------------------------------------------------------------
-BM_SetInsert/1024/1                        28928      29349      23853  133.097kB/s   33.2742k items/s
-BM_SetInsert/1024/8                        32065      32913      21375  949.487kB/s   237.372k items/s
-BM_SetInsert/1024/10                       33157      33648      21431  1.13369MB/s   290.225k items/s
-```
+In C++11 mode, a range-based for loop should be used in preference to
+the `KeepRunning` loop for running the benchmarks. For example:
 
-The JSON format outputs human readable json split into two top level attributes.
-The `context` attribute contains information about the run in general, including
-information about the CPU and the date.
-The `benchmarks` attribute contains a list of ever benchmark run. Example json
-output looks like:
-``` json
-{
-  "context": {
-    "date": "2015/03/17-18:40:25",
-    "num_cpus": 40,
-    "mhz_per_cpu": 2801,
-    "cpu_scaling_enabled": false,
-    "build_type": "debug"
-  },
-  "benchmarks": [
-    {
-      "name": "BM_SetInsert/1024/1",
-      "iterations": 94877,
-      "real_time": 29275,
-      "cpu_time": 29836,
-      "bytes_per_second": 134066,
-      "items_per_second": 33516
-    },
-    {
-      "name": "BM_SetInsert/1024/8",
-      "iterations": 21609,
-      "real_time": 32317,
-      "cpu_time": 32429,
-      "bytes_per_second": 986770,
-      "items_per_second": 246693
-    },
-    {
-      "name": "BM_SetInsert/1024/10",
-      "iterations": 21393,
-      "real_time": 32724,
-      "cpu_time": 33355,
-      "bytes_per_second": 1199226,
-      "items_per_second": 299807
-    }
-  ]
+```c++
+static void BM_Fast(benchmark::State &state) {
+  for (auto _ : state) {
+    FastOperation();
+  }
 }
+BENCHMARK(BM_Fast);
 ```
 
-The CSV format outputs comma-separated values. The `context` is output on stderr
-and the CSV itself on stdout. Example CSV output looks like:
+The ranged-for loop is faster than using `KeepRunning` because
+`KeepRunning` requires a memory load and store of the iteration count
+every iteration, whereas the ranged-for variant is able to keep the iteration count
+in a register.
+
+For example, an empty inner loop using the range-based for method looks like:
+
+```asm
+# Loop Init
+  mov rbx, qword ptr [r14 + 104]
+  call benchmark::State::StartKeepRunning()
+  test rbx, rbx
+  je .LoopEnd
+.LoopHeader: # =>This Inner Loop Header: Depth=1
+  add rbx, -1
+  jne .LoopHeader
+.LoopEnd:
 ```
-name,iterations,real_time,cpu_time,bytes_per_second,items_per_second,label
-"BM_SetInsert/1024/1",65465,17890.7,8407.45,475768,118942,
-"BM_SetInsert/1024/8",116606,18810.1,9766.64,3.27646e+06,819115,
-"BM_SetInsert/1024/10",106365,17238.4,8421.53,4.74973e+06,1.18743e+06,
+
+Compared to an empty `KeepRunning` loop, which looks like:
+
+```asm
+.LoopHeader: # in Loop: Header=BB0_3 Depth=1
+  cmp byte ptr [rbx], 1
+  jne .LoopInit
+.LoopBody: # =>This Inner Loop Header: Depth=1
+  mov rax, qword ptr [rbx + 8]
+  lea rcx, [rax + 1]
+  mov qword ptr [rbx + 8], rcx
+  cmp rax, qword ptr [rbx + 104]
+  jb .LoopHeader
+  jmp .LoopEnd
+.LoopInit:
+  mov rdi, rbx
+  call benchmark::State::StartKeepRunning()
+  jmp .LoopBody
+.LoopEnd:
 ```
 
-## Debug vs Release
-By default, benchmark builds as a debug library. You will see a warning in the output when this is the case. To build it as a release library instead, use:
+Unless C++03 compatibility is required, the ranged-for variant of writing
+the benchmark loop should be preferred.
 
-```
-cmake -DCMAKE_BUILD_TYPE=Release
-```
+<a name="disabling-cpu-frequency-scaling" />
 
-To enable link-time optimisation, use
+### Disabling CPU Frequency Scaling
+
+If you see this warning:
 
 ```
-cmake -DCMAKE_BUILD_TYPE=Release -DBENCHMARK_ENABLE_LTO=true
+***WARNING*** CPU scaling is enabled, the benchmark real time measurements may be noisy and will incur extra overhead.
 ```
 
-## Linking against the library
-When using gcc, it is necessary to link against pthread to avoid runtime exceptions. This is due to how gcc implements std::thread. See [issue #67](https://github.com/google/benchmark/issues/67) for more details.
+you might want to disable the CPU frequency scaling while running the benchmark:
+
+```bash
+sudo cpupower frequency-set --governor performance
+./mybench
+sudo cpupower frequency-set --governor powersave
+```
diff --git a/README.version b/README.version
deleted file mode 100644
index ab86d09..0000000
--- a/README.version
+++ /dev/null
@@ -1,4 +0,0 @@
-URL: https://github.com/google/benchmark
-Version: 8da907c2c2786685c7da9f4759de052e3990f6f1
-BugComponent: 119451
-Owners: enh, android-bionic
diff --git a/WORKSPACE b/WORKSPACE
new file mode 100644
index 0000000..631f3ba
--- /dev/null
+++ b/WORKSPACE
@@ -0,0 +1,51 @@
+workspace(name = "com_github_google_benchmark")
+
+load("@bazel_tools//tools/build_defs/repo:http.bzl", "http_archive")
+
+http_archive(
+    name = "rules_cc",
+    strip_prefix = "rules_cc-a508235df92e71d537fcbae0c7c952ea6957a912",
+    urls = ["https://github.com/bazelbuild/rules_cc/archive/a508235df92e71d537fcbae0c7c952ea6957a912.zip"],
+    sha256 = "d7dc12c1d5bc1a87474de8e3d17b7731a4dcebcfb8aa3990fe8ac7734ef12f2f",
+)
+
+http_archive(
+    name = "com_google_absl",
+    sha256 = "f41868f7a938605c92936230081175d1eae87f6ea2c248f41077c8f88316f111",
+    strip_prefix = "abseil-cpp-20200225.2",
+    urls = ["https://github.com/abseil/abseil-cpp/archive/20200225.2.tar.gz"],
+)
+
+http_archive(
+    name = "com_google_googletest",
+    strip_prefix = "googletest-3f0cf6b62ad1eb50d8736538363d3580dd640c3e",
+    urls = ["https://github.com/google/googletest/archive/3f0cf6b62ad1eb50d8736538363d3580dd640c3e.zip"],
+    sha256 = "8f827dd550db8b4fdf73904690df0be9fccc161017c9038a724bc9a0617a1bc8",
+)
+
+http_archive(
+    name = "pybind11",
+    build_file = "@//bindings/python:pybind11.BUILD",
+    sha256 = "1eed57bc6863190e35637290f97a20c81cfe4d9090ac0a24f3bbf08f265eb71d",
+    strip_prefix = "pybind11-2.4.3",
+    urls = ["https://github.com/pybind/pybind11/archive/v2.4.3.tar.gz"],
+)
+
+new_local_repository(
+    name = "python_headers",
+    build_file = "@//bindings/python:python_headers.BUILD",
+    path = "/usr/include/python3.6",  # May be overwritten by setup.py.
+)
+
+http_archive(
+    name = "rules_python",
+    url = "https://github.com/bazelbuild/rules_python/releases/download/0.1.0/rules_python-0.1.0.tar.gz",
+    sha256 = "b6d46438523a3ec0f3cead544190ee13223a52f6a6765a29eae7b7cc24cc83a0",
+)
+
+load("@rules_python//python:pip.bzl", pip3_install="pip_install")
+
+pip3_install(
+   name = "py_deps",
+   requirements = "//:requirements.txt",
+)
diff --git a/_config.yml b/_config.yml
new file mode 100644
index 0000000..1885487
--- /dev/null
+++ b/_config.yml
@@ -0,0 +1 @@
+theme: jekyll-theme-midnight
+\ No newline at end of file
diff --git a/appveyor.yml b/appveyor.yml
index 13be7fa..81da955 100644
--- a/appveyor.yml
+++ b/appveyor.yml
@@ -1,127 +1,50 @@
 version: '{build}'
 
-configuration:
-  - Static Debug
-  - Static Release
-#  - Shared Debug
-#  - Shared Release
+image: Visual Studio 2017
 
-platform:
-  - x86
-  - x64
+configuration:
+  - Debug
+  - Release
 
 environment:
   matrix:
-    - compiler: msvc-12-seh
+    - compiler: msvc-15-seh
+      generator: "Visual Studio 15 2017"
+
+    - compiler: msvc-15-seh
+      generator: "Visual Studio 15 2017 Win64"
+
     - compiler: msvc-14-seh
-    - compiler: gcc-4.9.2-posix
-#    - compiler: gcc-4.8.4-posix
+      generator: "Visual Studio 14 2015"
 
-artifacts:
-  - path: '_build/CMakeFiles/*.log'
-    name: logs
-  - path: '_build/Testing/**/*.xml'
-    name: test_results
+    - compiler: msvc-14-seh
+      generator: "Visual Studio 14 2015 Win64"
 
-install:
-  # derive some extra information
-  - for /f "tokens=1-2" %%a in ("%configuration%") do (@set "linkage=%%a")
-  - for /f "tokens=1-2" %%a in ("%configuration%") do (@set "variant=%%b")
-  - if "%linkage%"=="Shared" (set shared=YES) else (set shared=NO)
-  - for /f "tokens=1-3 delims=-" %%a in ("%compiler%") do (@set "compiler_name=%%a")
-  - for /f "tokens=1-3 delims=-" %%a in ("%compiler%") do (@set "compiler_version=%%b")
-  - for /f "tokens=1-3 delims=-" %%a in ("%compiler%") do (@set "compiler_threading=%%c")
-  - if "%platform%"=="x64" (set arch=x86_64)
-  - if "%platform%"=="x86" (set arch=i686)
-  # download the specific version of MinGW
-  - if "%compiler_name%"=="gcc" (for /f %%a in ('python mingw.py --quiet --version "%compiler_version%" --arch "%arch%" --threading "%compiler_threading%" --location "C:\mingw-builds"') do @set "compiler_path=%%a")
+    - compiler: gcc-5.3.0-posix
+      generator: "MinGW Makefiles"
+      cxx_path: 'C:\mingw-w64\i686-5.3.0-posix-dwarf-rt_v4-rev0\mingw32\bin'
+      APPVEYOR_BUILD_WORKER_IMAGE: Visual Studio 2015
 
-before_build:
-  # Set up mingw commands
-  - if "%compiler_name%"=="gcc" (set "generator=MinGW Makefiles")
-  - if "%compiler_name%"=="gcc" (set "build=mingw32-make -j4")
-  - if "%compiler_name%"=="gcc" (set "test=mingw32-make CTEST_OUTPUT_ON_FAILURE=1 test")
-  # msvc specific commands
-  - if "%compiler_name%"=="msvc" if "%compiler_version%"=="12" if "%platform%"=="x86" (set "generator=Visual Studio 12 2013")
-  - if "%compiler_name%"=="msvc" if "%compiler_version%"=="12" if "%platform%"=="x64" (set "generator=Visual Studio 12 2013 Win64")
-  - if "%compiler_name%"=="msvc" if "%compiler_version%"=="14" if "%platform%"=="x86" (set "generator=Visual Studio 14 2015")
-  - if "%compiler_name%"=="msvc" if "%compiler_version%"=="14" if "%platform%"=="x64" (set "generator=Visual Studio 14 2015 Win64")
-  - if "%compiler_name%"=="msvc" (set "build=cmake --build . --config %variant%")
-  - if "%compiler_name%"=="msvc" (set "test=ctest -c Release -D CTEST_OUTPUT_ON_FAILURE:STRING=1")
-  # add the compiler path if needed
-  - if not "%compiler_path%"=="" (set "PATH=%PATH%;%compiler_path%")
+matrix:
+  fast_finish: true
+
+install:
   # git bash conflicts with MinGW makefiles
   - if "%generator%"=="MinGW Makefiles" (set "PATH=%PATH:C:\Program Files\Git\usr\bin;=%")
+  - if not "%cxx_path%"=="" (set "PATH=%PATH%;%cxx_path%")
 
 build_script:
-- ps: |
-    md _build -Force
-    cd _build
-    & cmake -G "$env:generator" "-DCMAKE_BUILD_TYPE=$env:variant" "-DBUILD_SHARED_LIBS=$env:shared" ..
-    if ($LastExitCode -ne 0) {
-        throw "Exec: $ErrorMessage"
-    }
-    iex "& $env:build"
-    if ($LastExitCode -ne 0) {
-        throw "Exec: $ErrorMessage"
-    }
+  - md _build -Force
+  - cd _build
+  - echo %configuration%
+  - cmake -G "%generator%" "-DCMAKE_BUILD_TYPE=%configuration%" -DBENCHMARK_DOWNLOAD_DEPENDENCIES=ON ..
+  - cmake --build . --config %configuration%
 
 test_script:
-- ps: |
-    iex "& $env:test"
-    if ($LastExitCode -ne 0) {
-        throw "Exec: $ErrorMessage"
-    }
+  - ctest --build-config %configuration% --timeout 300 --output-on-failure
 
-    function Add-CTest-Result($testResult)
-    {
-        $tests = ([xml](get-content $testResult)).Site.Testing
-        $testsCount = 0
-        $anyFailures = $FALSE
-
-        foreach ($test in $tests.test) {
-            $testsCount++
-            $testName = $test.Name
-            $testpath = $test.Path
-            $timeNode = $test.SelectSingleNode('Results/NamedMeasurement[@name="Execution Time"]/Value')
-            if ($test.status -eq "failure") {
-                $time = ([double]$timeNode.InnerText * 1000)
-                Add-AppveyorTest $testName -Outcome Failed -FileName $testpath -Duration $time -ErrorMessage $($test.results.measurement.value)
-                Add-AppveyorMessage `"$testName failed`" -Category Error
-                $anyFailures = $TRUE
-            }
-            elseif ($test.status -eq "skipped") {
-                Add-AppveyorTest $testName -Outcome Ignored -Filename $testpath
-            }
-            else {
-                $time = ([double]$timeNode.InnerText * 1000)
-                Add-AppveyorTest $testName -Outcome Passed -FileName $testpath -Duration $time -StdOut $($test.results.measurement.value)
-            }
-        }
-        return $testsCount, $anyFailures
-    }
-
-    $testsCount = 0
-    $anyFailures = $FALSE
-
-    # Run tests and upload results to AppVeyor one by one
-    Get-ChildItem ".\Testing\*.xml" -Recurse | foreach {
-        $testfile = $_.fullname
-        $count, $testsResult = Add-CTest-Result $testfile
-        Write-Host "Found $testfile with $count tests"
-        $testsCount = $testsCount + $count
-        $anyFailures = $anyFailures -or $testsResult
-    }
-
-    Write-Host "There are $testsCount tests found"
-
-    if ($anyFailures -eq $TRUE){
-        Write-Host "Failing build as there are broken tests"
-        $host.SetShouldExit(1)
-    }
-
-matrix:
-  fast_finish: true
-
-cache:
-  - C:\mingw-builds
+artifacts:
+  - path: '_build/CMakeFiles/*.log'
+    name: logs
+  - path: '_build/Testing/**/*.xml'
+    name: test_results
diff --git a/bindings/python/BUILD b/bindings/python/BUILD
new file mode 100644
index 0000000..9559a76
--- /dev/null
+++ b/bindings/python/BUILD
@@ -0,0 +1,3 @@
+exports_files(glob(["*.BUILD"]))
+exports_files(["build_defs.bzl"])
+
diff --git a/bindings/python/build_defs.bzl b/bindings/python/build_defs.bzl
new file mode 100644
index 0000000..45907aa
--- /dev/null
+++ b/bindings/python/build_defs.bzl
@@ -0,0 +1,25 @@
+_SHARED_LIB_SUFFIX = {
+    "//conditions:default": ".so",
+    "//:windows": ".dll",
+}
+
+def py_extension(name, srcs, hdrs = [], copts = [], features = [], deps = []):
+    for shared_lib_suffix in _SHARED_LIB_SUFFIX.values():
+        shared_lib_name = name + shared_lib_suffix
+        native.cc_binary(
+            name = shared_lib_name,
+            linkshared = 1,
+            linkstatic = 1,
+            srcs = srcs + hdrs,
+            copts = copts,
+            features = features,
+            deps = deps,
+        )
+
+    return native.py_library(
+        name = name,
+        data = select({
+            platform: [name + shared_lib_suffix]
+            for platform, shared_lib_suffix in _SHARED_LIB_SUFFIX.items()
+        }),
+    )
diff --git a/bindings/python/google_benchmark/BUILD b/bindings/python/google_benchmark/BUILD
new file mode 100644
index 0000000..3c1561f
--- /dev/null
+++ b/bindings/python/google_benchmark/BUILD
@@ -0,0 +1,38 @@
+load("//bindings/python:build_defs.bzl", "py_extension")
+
+py_library(
+    name = "google_benchmark",
+    srcs = ["__init__.py"],
+    visibility = ["//visibility:public"],
+    deps = [
+        ":_benchmark",
+        # pip; absl:app
+    ],
+)
+
+py_extension(
+    name = "_benchmark",
+    srcs = ["benchmark.cc"],
+    copts = [
+        "-fexceptions",
+        "-fno-strict-aliasing",
+    ],
+    features = ["-use_header_modules"],
+    deps = [
+        "//:benchmark",
+        "@pybind11",
+        "@python_headers",
+    ],
+)
+
+py_test(
+    name = "example",
+    srcs = ["example.py"],
+    python_version = "PY3",
+    srcs_version = "PY3",
+    visibility = ["//visibility:public"],
+    deps = [
+        ":google_benchmark",
+    ],
+)
+
diff --git a/bindings/python/google_benchmark/__init__.py b/bindings/python/google_benchmark/__init__.py
new file mode 100644
index 0000000..f31285e
--- /dev/null
+++ b/bindings/python/google_benchmark/__init__.py
@@ -0,0 +1,158 @@
+# Copyright 2020 Google Inc. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Python benchmarking utilities.
+
+Example usage:
+  import google_benchmark as benchmark
+
+  @benchmark.register
+  def my_benchmark(state):
+      ...  # Code executed outside `while` loop is not timed.
+
+      while state:
+        ...  # Code executed within `while` loop is timed.
+
+  if __name__ == '__main__':
+    benchmark.main()
+"""
+
+from absl import app
+from google_benchmark import _benchmark
+from google_benchmark._benchmark import (
+    Counter,
+    kNanosecond,
+    kMicrosecond,
+    kMillisecond,
+    kSecond,
+    oNone,
+    o1,
+    oN,
+    oNSquared,
+    oNCubed,
+    oLogN,
+    oNLogN,
+    oAuto,
+    oLambda,
+)
+
+
+__all__ = [
+    "register",
+    "main",
+    "Counter",
+    "kNanosecond",
+    "kMicrosecond",
+    "kMillisecond",
+    "kSecond",
+    "oNone",
+    "o1",
+    "oN",
+    "oNSquared",
+    "oNCubed",
+    "oLogN",
+    "oNLogN",
+    "oAuto",
+    "oLambda",
+]
+
+__version__ = "0.2.0"
+
+
+class __OptionMaker:
+    """A stateless class to collect benchmark options.
+
+    Collect all decorator calls like @option.range(start=0, limit=1<<5).
+    """
+
+    class Options:
+        """Pure data class to store options calls, along with the benchmarked function."""
+
+        def __init__(self, func):
+            self.func = func
+            self.builder_calls = []
+
+    @classmethod
+    def make(cls, func_or_options):
+        """Make Options from Options or the benchmarked function."""
+        if isinstance(func_or_options, cls.Options):
+            return func_or_options
+        return cls.Options(func_or_options)
+
+    def __getattr__(self, builder_name):
+        """Append option call in the Options."""
+
+        # The function that gets returned on @option.range(start=0, limit=1<<5).
+        def __builder_method(*args, **kwargs):
+
+            # The decorator that gets called, either with the benchmarked function
+            # or the previous Options.
+            def __decorator(func_or_options):
+                options = self.make(func_or_options)
+                options.builder_calls.append((builder_name, args, kwargs))
+                # The decorator returns Options so it is not technically a decorator
+                # and needs a final call to @register.
+                return options
+
+            return __decorator
+
+        return __builder_method
+
+
+# Alias for nicer API.
+# We have to instantiate an object, even if stateless, to be able to use __getattr__
+# on option.range
+option = __OptionMaker()
+
+
+def register(undefined=None, *, name=None):
+    """Register function for benchmarking."""
+    if undefined is None:
+        # Called with parentheses, e.g. @register(name=...): return the real decorator.
+        return lambda f: register(f, name=name)
+
+    # We have either the function to benchmark (simple case) or an instance of Options
+    # (@option._ case).
+    options = __OptionMaker.make(undefined)
+
+    if name is None:
+        name = options.func.__name__
+
+    # We register the benchmark and reproduce all the @option._ calls onto the
+    # benchmark builder pattern
+    benchmark = _benchmark.RegisterBenchmark(name, options.func)
+    for name, args, kwargs in options.builder_calls[::-1]:
+        getattr(benchmark, name)(*args, **kwargs)
+
+    # return the benchmarked function because the decorator does not modify it
+    return options.func
+
+
+def _flags_parser(argv):
+    argv = _benchmark.Initialize(argv)
+    return app.parse_flags_with_usage(argv)
+
+
+def _run_benchmarks(argv):
+    if len(argv) > 1:
+        raise app.UsageError("Too many command-line arguments.")
+    return _benchmark.RunSpecifiedBenchmarks()
+
+
+def main(argv=None):
+    return app.run(_run_benchmarks, argv=argv, flags_parser=_flags_parser)
+
+
+# Methods for use with custom main function.
+initialize = _benchmark.Initialize
+run_benchmarks = _benchmark.RunSpecifiedBenchmarks
diff --git a/bindings/python/google_benchmark/benchmark.cc b/bindings/python/google_benchmark/benchmark.cc
new file mode 100644
index 0000000..d80816e
--- /dev/null
+++ b/bindings/python/google_benchmark/benchmark.cc
@@ -0,0 +1,181 @@
+// Benchmark for Python.
+
+#include <map>
+#include <string>
+#include <vector>
+
+#include "pybind11/operators.h"
+#include "pybind11/pybind11.h"
+#include "pybind11/stl.h"
+#include "pybind11/stl_bind.h"
+
+#include "benchmark/benchmark.h"
+
+PYBIND11_MAKE_OPAQUE(benchmark::UserCounters);
+
+namespace {
+namespace py = ::pybind11;
+
+std::vector<std::string> Initialize(const std::vector<std::string>& argv) {
+  // The `argv` pointers here become invalid when this function returns, but
+  // benchmark holds the pointer to `argv[0]`. We create a static copy of it
+  // so it persists, and replace the pointer below.
+  static std::string executable_name(argv[0]);
+  std::vector<char*> ptrs;
+  ptrs.reserve(argv.size());
+  for (auto& arg : argv) {
+    ptrs.push_back(const_cast<char*>(arg.c_str()));
+  }
+  ptrs[0] = const_cast<char*>(executable_name.c_str());
+  int argc = static_cast<int>(argv.size());
+  benchmark::Initialize(&argc, ptrs.data());
+  std::vector<std::string> remaining_argv;
+  remaining_argv.reserve(argc);
+  for (int i = 0; i < argc; ++i) {
+    remaining_argv.emplace_back(ptrs[i]);
+  }
+  return remaining_argv;
+}
+
+benchmark::internal::Benchmark* RegisterBenchmark(const char* name,
+                                                  py::function f) {
+  return benchmark::RegisterBenchmark(
+      name, [f](benchmark::State& state) { f(&state); });
+}
+
+PYBIND11_MODULE(_benchmark, m) {
+  using benchmark::TimeUnit;
+  py::enum_<TimeUnit>(m, "TimeUnit")
+      .value("kNanosecond", TimeUnit::kNanosecond)
+      .value("kMicrosecond", TimeUnit::kMicrosecond)
+      .value("kMillisecond", TimeUnit::kMillisecond)
+      .value("kSecond", TimeUnit::kSecond)
+      .export_values();
+
+  using benchmark::BigO;  // Complexity classes for Benchmark::Complexity.
+  py::enum_<BigO>(m, "BigO")
+      .value("oNone", BigO::oNone)
+      .value("o1", BigO::o1)
+      .value("oN", BigO::oN)
+      .value("oNSquared", BigO::oNSquared)
+      .value("oNCubed", BigO::oNCubed)
+      .value("oLogN", BigO::oLogN)
+      .value("oNLogN", BigO::oNLogN)
+      .value("oAuto", BigO::oAuto)
+      .value("oLambda", BigO::oLambda)
+      .export_values();
+
+  using benchmark::internal::Benchmark;
+  py::class_<Benchmark>(m, "Benchmark")
+      // For methods returning a pointer to the current object, reference
+      // return policy is used to ask pybind not to take ownership of the
+      // returned object and avoid calling delete on it.
+      // https://pybind11.readthedocs.io/en/stable/advanced/functions.html#return-value-policies
+      //
+      // For methods taking a const std::vector<...>&, a copy is created
+      // because it is bound to a Python list.
+      // https://pybind11.readthedocs.io/en/stable/advanced/cast/stl.html
+      .def("unit", &Benchmark::Unit, py::return_value_policy::reference)
+      .def("arg", &Benchmark::Arg, py::return_value_policy::reference)
+      .def("args", &Benchmark::Args, py::return_value_policy::reference)
+      .def("range", &Benchmark::Range, py::return_value_policy::reference,
+           py::arg("start"), py::arg("limit"))
+      .def("dense_range", &Benchmark::DenseRange,
+           py::return_value_policy::reference, py::arg("start"),
+           py::arg("limit"), py::arg("step") = 1)
+      .def("ranges", &Benchmark::Ranges, py::return_value_policy::reference)
+      .def("args_product", &Benchmark::ArgsProduct,
+           py::return_value_policy::reference)
+      .def("arg_name", &Benchmark::ArgName, py::return_value_policy::reference)
+      .def("arg_names", &Benchmark::ArgNames,
+           py::return_value_policy::reference)
+      .def("range_pair", &Benchmark::RangePair,
+           py::return_value_policy::reference, py::arg("lo1"), py::arg("hi1"),
+           py::arg("lo2"), py::arg("hi2"))
+      .def("range_multiplier", &Benchmark::RangeMultiplier,
+           py::return_value_policy::reference)
+      .def("min_time", &Benchmark::MinTime, py::return_value_policy::reference)
+      .def("iterations", &Benchmark::Iterations,
+           py::return_value_policy::reference)
+      .def("repetitions", &Benchmark::Repetitions,
+           py::return_value_policy::reference)
+      .def("report_aggregates_only", &Benchmark::ReportAggregatesOnly,
+           py::return_value_policy::reference, py::arg("value") = true)
+      .def("display_aggregates_only", &Benchmark::DisplayAggregatesOnly,
+           py::return_value_policy::reference, py::arg("value") = true)
+      .def("measure_process_cpu_time", &Benchmark::MeasureProcessCPUTime,
+           py::return_value_policy::reference)
+      .def("use_real_time", &Benchmark::UseRealTime,
+           py::return_value_policy::reference)
+      .def("use_manual_time", &Benchmark::UseManualTime,
+           py::return_value_policy::reference)
+      .def(
+          "complexity",
+          (Benchmark * (Benchmark::*)(benchmark::BigO)) & Benchmark::Complexity,
+          py::return_value_policy::reference,
+          py::arg("complexity") = benchmark::oAuto);
+
+  using benchmark::Counter;
+  py::class_<Counter> py_counter(m, "Counter");
+
+  py::enum_<Counter::Flags>(py_counter, "Flags")
+      .value("kDefaults", Counter::Flags::kDefaults)
+      .value("kIsRate", Counter::Flags::kIsRate)
+      .value("kAvgThreads", Counter::Flags::kAvgThreads)
+      .value("kAvgThreadsRate", Counter::Flags::kAvgThreadsRate)
+      .value("kIsIterationInvariant", Counter::Flags::kIsIterationInvariant)
+      .value("kIsIterationInvariantRate",
+             Counter::Flags::kIsIterationInvariantRate)
+      .value("kAvgIterations", Counter::Flags::kAvgIterations)
+      .value("kAvgIterationsRate", Counter::Flags::kAvgIterationsRate)
+      .value("kInvert", Counter::Flags::kInvert)
+      .export_values()
+      .def(py::self | py::self);
+
+  py::enum_<Counter::OneK>(py_counter, "OneK")
+      .value("kIs1000", Counter::OneK::kIs1000)
+      .value("kIs1024", Counter::OneK::kIs1024)
+      .export_values();
+
+  py_counter
+      .def(py::init<double, Counter::Flags, Counter::OneK>(),
+           py::arg("value") = 0., py::arg("flags") = Counter::kDefaults,
+           py::arg("k") = Counter::kIs1000)
+      .def(py::init([](double value) { return Counter(value); }))
+      .def_readwrite("value", &Counter::value)
+      .def_readwrite("flags", &Counter::flags)
+      .def_readwrite("oneK", &Counter::oneK);
+  py::implicitly_convertible<py::float_, Counter>();
+  py::implicitly_convertible<py::int_, Counter>();
+
+  py::bind_map<benchmark::UserCounters>(m, "UserCounters");
+
+  using benchmark::State;
+  py::class_<State>(m, "State")
+      .def("__bool__", &State::KeepRunning)
+      .def_property_readonly("keep_running", &State::KeepRunning)
+      .def("pause_timing", &State::PauseTiming)
+      .def("resume_timing", &State::ResumeTiming)
+      .def("skip_with_error", &State::SkipWithError)
+      .def_property_readonly("error_occured", &State::error_occurred)
+      .def("set_iteration_time", &State::SetIterationTime)
+      .def_property("bytes_processed", &State::bytes_processed,
+                    &State::SetBytesProcessed)
+      .def_property("complexity_n", &State::complexity_length_n,
+                    &State::SetComplexityN)
+      .def_property("items_processed", &State::items_processed,
+                    &State::SetItemsProcessed)
+      .def("set_label", (void (State::*)(const char*)) & State::SetLabel)
+      .def("range", &State::range, py::arg("pos") = 0)
+      .def_property_readonly("iterations", &State::iterations)
+      .def_readwrite("counters", &State::counters)
+      .def_readonly("thread_index", &State::thread_index)
+      .def_readonly("threads", &State::threads);
+
+  m.def("Initialize", Initialize);
+  m.def("RegisterBenchmark", RegisterBenchmark,
+        py::return_value_policy::reference);
+  m.def("RunSpecifiedBenchmarks",
+        []() { benchmark::RunSpecifiedBenchmarks(); });
+};
+}  // namespace
diff --git a/bindings/python/google_benchmark/example.py b/bindings/python/google_benchmark/example.py
new file mode 100644
index 0000000..9134e8c
--- /dev/null
+++ b/bindings/python/google_benchmark/example.py
@@ -0,0 +1,136 @@
+# Copyright 2020 Google Inc. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Example of Python using C++ benchmark framework.
+
+To run this example, you must first install the `google_benchmark` Python package.
+
+To install using `setup.py`, download and extract the `google_benchmark` source.
+In the extracted directory, execute:
+  python setup.py install
+"""
+
+import random
+import time
+
+import google_benchmark as benchmark
+from google_benchmark import Counter
+
+
+@benchmark.register
+def empty(state):
+    while state:
+        pass
+
+
+@benchmark.register
+def sum_million(state):
+    while state:
+        sum(range(1_000_000))
+
+@benchmark.register
+def pause_timing(state):
+    """Pause timing every iteration."""
+    while state:
+        # Construct a list of random ints every iteration without timing it
+        state.pause_timing()
+        random_list = [random.randint(0, 100) for _ in range(100)]
+        state.resume_timing()
+        # Time the in place sorting algorithm
+        random_list.sort()
+
+
+@benchmark.register
+def skipped(state):
+    if True:  # Test some predicate here.
+        state.skip_with_error("some error")
+        return  # NOTE: You must explicitly return, or benchmark will continue.
+
+    ...  # Benchmark code would be here.
+
+
+@benchmark.register
+def manual_timing(state):
+    while state:
+        # Manually count Python CPU time
+        start = time.perf_counter()  # perf_counter_ns() in Python 3.7+
+        # Something to benchmark
+        time.sleep(0.01)
+        end = time.perf_counter()
+        state.set_iteration_time(end - start)
+
+
+@benchmark.register
+def custom_counters(state):
+    """Collect custom metric using benchmark.Counter."""
+    num_foo = 0.0
+    while state:
+        # Benchmark some code here
+        pass
+        # Collect some custom metric named foo
+        num_foo += 0.13
+
+    # Automatic Counter from numbers.
+    state.counters["foo"] = num_foo
+    # Set a counter as a rate.
+    state.counters["foo_rate"] = Counter(num_foo, Counter.kIsRate)
+    # Set a counter as an inverse of rate.
+    state.counters["foo_inv_rate"] = Counter(num_foo, Counter.kIsRate | Counter.kInvert)
+    # Set a counter as a thread-average quantity.
+    state.counters["foo_avg"] = Counter(num_foo, Counter.kAvgThreads)
+    # There's also a combined flag:
+    state.counters["foo_avg_rate"] = Counter(num_foo, Counter.kAvgThreadsRate)
+
+
+@benchmark.register
+@benchmark.option.measure_process_cpu_time()
+@benchmark.option.use_real_time()
+def with_options(state):
+    while state:
+        sum(range(1_000_000))
+
+
+@benchmark.register(name="sum_million_microseconds")
+@benchmark.option.unit(benchmark.kMicrosecond)
+def with_options(state):
+    while state:
+        sum(range(1_000_000))
+
+
+@benchmark.register
+@benchmark.option.arg(100)
+@benchmark.option.arg(1000)
+def passing_argument(state):
+    while state:
+        sum(range(state.range(0)))
+
+
+@benchmark.register
+@benchmark.option.range(8, limit=8 << 10)
+def using_range(state):
+    while state:
+        sum(range(state.range(0)))
+
+
+@benchmark.register
+@benchmark.option.range_multiplier(2)
+@benchmark.option.range(1 << 10, 1 << 18)
+@benchmark.option.complexity(benchmark.oN)
+def computing_complexity(state):
+    while state:
+        sum(range(state.range(0)))
+    state.complexity_n = state.range(0)
+
+
+if __name__ == "__main__":
+    benchmark.main()
diff --git a/bindings/python/pybind11.BUILD b/bindings/python/pybind11.BUILD
new file mode 100644
index 0000000..bc83350
--- /dev/null
+++ b/bindings/python/pybind11.BUILD
@@ -0,0 +1,20 @@
+cc_library(
+    name = "pybind11",
+    hdrs = glob(
+        include = [
+            "include/pybind11/*.h",
+            "include/pybind11/detail/*.h",
+        ],
+        exclude = [
+            "include/pybind11/common.h",
+            "include/pybind11/eigen.h",
+        ],
+    ),
+    copts = [
+        "-fexceptions",
+        "-Wno-undefined-inline",
+        "-Wno-pragma-once-outside-header",
+    ],
+    includes = ["include"],
+    visibility = ["//visibility:public"],
+)
diff --git a/bindings/python/python_headers.BUILD b/bindings/python/python_headers.BUILD
new file mode 100644
index 0000000..9c34cf6
--- /dev/null
+++ b/bindings/python/python_headers.BUILD
@@ -0,0 +1,6 @@
+cc_library(
+    name = "python_headers",
+    hdrs = glob(["**/*.h"]),
+    includes = ["."],
+    visibility = ["//visibility:public"],
+)
diff --git a/bindings/python/requirements.txt b/bindings/python/requirements.txt
new file mode 100644
index 0000000..f5bbe7e
--- /dev/null
+++ b/bindings/python/requirements.txt
@@ -0,0 +1,2 @@
+absl-py>=0.7.1
+
diff --git a/cmake/AddCXXCompilerFlag.cmake b/cmake/AddCXXCompilerFlag.cmake
index 870f11a..d0d2099 100644
--- a/cmake/AddCXXCompilerFlag.cmake
+++ b/cmake/AddCXXCompilerFlag.cmake
@@ -19,19 +19,56 @@ set(__add_cxx_compiler_flag INCLUDED)
 
 include(CheckCXXCompilerFlag)
 
-function(add_cxx_compiler_flag FLAG)
+function(mangle_compiler_flag FLAG OUTPUT)
   string(TOUPPER "HAVE_CXX_FLAG_${FLAG}" SANITIZED_FLAG)
   string(REPLACE "+" "X" SANITIZED_FLAG ${SANITIZED_FLAG})
   string(REGEX REPLACE "[^A-Za-z_0-9]" "_" SANITIZED_FLAG ${SANITIZED_FLAG})
   string(REGEX REPLACE "_+" "_" SANITIZED_FLAG ${SANITIZED_FLAG})
-  set(CMAKE_REQUIRED_FLAGS "${FLAG}")
-  check_cxx_compiler_flag("" ${SANITIZED_FLAG})
-  if(${SANITIZED_FLAG})
+  set(${OUTPUT} "${SANITIZED_FLAG}" PARENT_SCOPE)
+endfunction(mangle_compiler_flag)
+
+function(add_cxx_compiler_flag FLAG)
+  mangle_compiler_flag("${FLAG}" MANGLED_FLAG)
+  set(OLD_CMAKE_REQUIRED_FLAGS "${CMAKE_REQUIRED_FLAGS}")
+  set(CMAKE_REQUIRED_FLAGS "${CMAKE_REQUIRED_FLAGS} ${FLAG}")
+  check_cxx_compiler_flag("${FLAG}" ${MANGLED_FLAG})
+  set(CMAKE_REQUIRED_FLAGS "${OLD_CMAKE_REQUIRED_FLAGS}")
+  if(${MANGLED_FLAG})
+    set(VARIANT ${ARGV1})
+    if(ARGV1)
+      string(TOUPPER "_${VARIANT}" VARIANT)
+    endif()
+    set(CMAKE_CXX_FLAGS${VARIANT} "${CMAKE_CXX_FLAGS${VARIANT}} ${BENCHMARK_CXX_FLAGS${VARIANT}} ${FLAG}" PARENT_SCOPE)
+  endif()
+endfunction()
+
+function(add_required_cxx_compiler_flag FLAG)
+  mangle_compiler_flag("${FLAG}" MANGLED_FLAG)
+  set(OLD_CMAKE_REQUIRED_FLAGS "${CMAKE_REQUIRED_FLAGS}")
+  set(CMAKE_REQUIRED_FLAGS "${CMAKE_REQUIRED_FLAGS} ${FLAG}")
+  check_cxx_compiler_flag("${FLAG}" ${MANGLED_FLAG})
+  set(CMAKE_REQUIRED_FLAGS "${OLD_CMAKE_REQUIRED_FLAGS}")
+  if(${MANGLED_FLAG})
     set(VARIANT ${ARGV1})
     if(ARGV1)
       string(TOUPPER "_${VARIANT}" VARIANT)
     endif()
     set(CMAKE_CXX_FLAGS${VARIANT} "${CMAKE_CXX_FLAGS${VARIANT}} ${FLAG}" PARENT_SCOPE)
+    set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${FLAG}" PARENT_SCOPE)
+    set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} ${FLAG}" PARENT_SCOPE)
+    set(CMAKE_MODULE_LINKER_FLAGS "${CMAKE_MODULE_LINKER_FLAGS} ${FLAG}" PARENT_SCOPE)
+    set(CMAKE_REQUIRED_FLAGS "${CMAKE_REQUIRED_FLAGS} ${FLAG}" PARENT_SCOPE)
+  else()
+    message(FATAL_ERROR "Required flag '${FLAG}' is not supported by the compiler")
   endif()
 endfunction()
 
+function(check_cxx_warning_flag FLAG)
+  mangle_compiler_flag("${FLAG}" MANGLED_FLAG)
+  set(OLD_CMAKE_REQUIRED_FLAGS "${CMAKE_REQUIRED_FLAGS}")
+  # Add -Werror to ensure the compiler generates an error if the warning flag
+  # doesn't exist.
+  set(CMAKE_REQUIRED_FLAGS "${CMAKE_REQUIRED_FLAGS} -Werror ${FLAG}")
+  check_cxx_compiler_flag("${FLAG}" ${MANGLED_FLAG})
+  set(CMAKE_REQUIRED_FLAGS "${OLD_CMAKE_REQUIRED_FLAGS}")
+endfunction()
diff --git a/cmake/CXXFeatureCheck.cmake b/cmake/CXXFeatureCheck.cmake
index 3059024..62e6741 100644
--- a/cmake/CXXFeatureCheck.cmake
+++ b/cmake/CXXFeatureCheck.cmake
@@ -10,7 +10,7 @@
 #
 # include(CXXFeatureCheck)
 # cxx_feature_check(STD_REGEX)
-# Requires CMake 2.6+
+# Requires CMake 2.8.12+
 
 if(__cxx_feature_check)
   return()
@@ -22,21 +22,48 @@ function(cxx_feature_check FILE)
   string(TOUPPER ${FILE} VAR)
   string(TOUPPER "HAVE_${VAR}" FEATURE)
   if (DEFINED HAVE_${VAR})
+    set(HAVE_${VAR} 1 PARENT_SCOPE)
+    add_definitions(-DHAVE_${VAR})
     return()
   endif()
-  message("-- Performing Test ${FEATURE}")
-  try_run(RUN_${FEATURE} COMPILE_${FEATURE}
-          ${CMAKE_BINARY_DIR} ${CMAKE_CURRENT_SOURCE_DIR}/cmake/${FILE}.cpp)
+
+  if (ARGC GREATER 1)
+    message(STATUS "Enabling additional flags: ${ARGV1}")
+    list(APPEND BENCHMARK_CXX_LINKER_FLAGS ${ARGV1})
+  endif()
+
+  if (NOT DEFINED COMPILE_${FEATURE})
+    message(STATUS "Performing Test ${FEATURE}")
+    if(CMAKE_CROSSCOMPILING)
+      try_compile(COMPILE_${FEATURE}
+              ${CMAKE_BINARY_DIR} ${CMAKE_CURRENT_SOURCE_DIR}/cmake/${FILE}.cpp
+              CMAKE_FLAGS ${BENCHMARK_CXX_LINKER_FLAGS}
+              LINK_LIBRARIES ${BENCHMARK_CXX_LIBRARIES})
+      if(COMPILE_${FEATURE})
+        message(WARNING
+              "If you see build failures due to cross compilation, try setting HAVE_${VAR} to 0")
+        set(RUN_${FEATURE} 0 CACHE INTERNAL "")  # compiled: record as a successful run
+      else()
+        set(RUN_${FEATURE} 1 CACHE INTERNAL "")  # did not compile: record as failed
+      endif()
+    else()
+      # Native build: compile and run the test (status already announced above).
+      try_run(RUN_${FEATURE} COMPILE_${FEATURE}
+              ${CMAKE_BINARY_DIR} ${CMAKE_CURRENT_SOURCE_DIR}/cmake/${FILE}.cpp
+              CMAKE_FLAGS ${BENCHMARK_CXX_LINKER_FLAGS}
+              LINK_LIBRARIES ${BENCHMARK_CXX_LIBRARIES})
+    endif()
+  endif()
+
   if(RUN_${FEATURE} EQUAL 0)
-    message("-- Performing Test ${FEATURE} -- success")
-    set(HAVE_${VAR} 1 CACHE INTERNAL "Feature test for ${FILE}" PARENT_SCOPE)
+    message(STATUS "Performing Test ${FEATURE} -- success")
+    set(HAVE_${VAR} 1 PARENT_SCOPE)
     add_definitions(-DHAVE_${VAR})
   else()
     if(NOT COMPILE_${FEATURE})
-      message("-- Performing Test ${FEATURE} -- failed to compile")
+      message(STATUS "Performing Test ${FEATURE} -- failed to compile")
     else()
-      message("-- Performing Test ${FEATURE} -- compiled but failed to run")
+      message(STATUS "Performing Test ${FEATURE} -- compiled but failed to run")
     endif()
   endif()
 endfunction()
-
diff --git a/cmake/Config.cmake.in b/cmake/Config.cmake.in
new file mode 100644
index 0000000..6e9256e
--- /dev/null
+++ b/cmake/Config.cmake.in
@@ -0,0 +1 @@
+include("${CMAKE_CURRENT_LIST_DIR}/@targets_export_name@.cmake")
diff --git a/cmake/GetGitVersion.cmake b/cmake/GetGitVersion.cmake
index 8dd9480..4f10f22 100644
--- a/cmake/GetGitVersion.cmake
+++ b/cmake/GetGitVersion.cmake
@@ -21,6 +21,7 @@ set(__get_git_version INCLUDED)
 function(get_git_version var)
   if(GIT_EXECUTABLE)
       execute_process(COMMAND ${GIT_EXECUTABLE} describe --match "v[0-9]*.[0-9]*.[0-9]*" --abbrev=8
+          WORKING_DIRECTORY ${PROJECT_SOURCE_DIR}
           RESULT_VARIABLE status
           OUTPUT_VARIABLE GIT_VERSION
           ERROR_QUIET)
@@ -33,9 +34,11 @@ function(get_git_version var)
 
       # Work out if the repository is dirty
       execute_process(COMMAND ${GIT_EXECUTABLE} update-index -q --refresh
+          WORKING_DIRECTORY ${PROJECT_SOURCE_DIR}
           OUTPUT_QUIET
           ERROR_QUIET)
       execute_process(COMMAND ${GIT_EXECUTABLE} diff-index --name-only HEAD --
+          WORKING_DIRECTORY ${PROJECT_SOURCE_DIR}
           OUTPUT_VARIABLE GIT_DIFF_INDEX
           ERROR_QUIET)
       string(COMPARE NOTEQUAL "${GIT_DIFF_INDEX}" "" GIT_DIRTY)
@@ -46,6 +49,6 @@ function(get_git_version var)
       set(GIT_VERSION "v0.0.0")
   endif()
 
-  message("-- git Version: ${GIT_VERSION}")
+  message(STATUS "git Version: ${GIT_VERSION}")
   set(${var} ${GIT_VERSION} PARENT_SCOPE)
 endfunction()
diff --git a/cmake/GoogleTest.cmake b/cmake/GoogleTest.cmake
new file mode 100644
index 0000000..dd611fc
--- /dev/null
+++ b/cmake/GoogleTest.cmake
@@ -0,0 +1,41 @@
+# Download and unpack googletest at configure time
+set(GOOGLETEST_PREFIX "${benchmark_BINARY_DIR}/third_party/googletest")
+configure_file(${benchmark_SOURCE_DIR}/cmake/GoogleTest.cmake.in ${GOOGLETEST_PREFIX}/CMakeLists.txt @ONLY)
+
+set(GOOGLETEST_PATH "${CMAKE_CURRENT_SOURCE_DIR}/googletest" CACHE PATH "") # Mind the quotes
+execute_process(COMMAND ${CMAKE_COMMAND} -G "${CMAKE_GENERATOR}"
+  -DALLOW_DOWNLOADING_GOOGLETEST=${BENCHMARK_DOWNLOAD_DEPENDENCIES} -DGOOGLETEST_PATH:PATH=${GOOGLETEST_PATH} .
+  RESULT_VARIABLE result
+  WORKING_DIRECTORY ${GOOGLETEST_PREFIX}
+)
+
+if(result)
+  message(FATAL_ERROR "CMake step for googletest failed: ${result}")
+endif()
+
+execute_process(
+  COMMAND ${CMAKE_COMMAND} --build .
+  RESULT_VARIABLE result
+  WORKING_DIRECTORY ${GOOGLETEST_PREFIX}
+)
+
+if(result)
+  message(FATAL_ERROR "Build step for googletest failed: ${result}")
+endif()
+
+# Prevent overriding the parent project's compiler/linker
+# settings on Windows
+set(gtest_force_shared_crt ON CACHE BOOL "" FORCE)
+
+include(${GOOGLETEST_PREFIX}/googletest-paths.cmake)
+
+# Add googletest directly to our build. This defines
+# the gtest and gtest_main targets.
+add_subdirectory(${GOOGLETEST_SOURCE_DIR}
+                 ${GOOGLETEST_BINARY_DIR}
+                 EXCLUDE_FROM_ALL)
+
+set_target_properties(gtest PROPERTIES INTERFACE_SYSTEM_INCLUDE_DIRECTORIES $<TARGET_PROPERTY:gtest,INTERFACE_INCLUDE_DIRECTORIES>)
+set_target_properties(gtest_main PROPERTIES INTERFACE_SYSTEM_INCLUDE_DIRECTORIES $<TARGET_PROPERTY:gtest_main,INTERFACE_INCLUDE_DIRECTORIES>)
+set_target_properties(gmock PROPERTIES INTERFACE_SYSTEM_INCLUDE_DIRECTORIES $<TARGET_PROPERTY:gmock,INTERFACE_INCLUDE_DIRECTORIES>)
+set_target_properties(gmock_main PROPERTIES INTERFACE_SYSTEM_INCLUDE_DIRECTORIES $<TARGET_PROPERTY:gmock_main,INTERFACE_INCLUDE_DIRECTORIES>)
diff --git a/cmake/GoogleTest.cmake.in b/cmake/GoogleTest.cmake.in
new file mode 100644
index 0000000..fd957ff
--- /dev/null
+++ b/cmake/GoogleTest.cmake.in
@@ -0,0 +1,58 @@
+cmake_minimum_required(VERSION 2.8.12)
+
+project(googletest-download NONE)
+
+# Enable ExternalProject CMake module
+include(ExternalProject)
+
+option(ALLOW_DOWNLOADING_GOOGLETEST "If googletest src tree is not found in location specified by GOOGLETEST_PATH, do fetch the archive from internet" OFF)
+set(GOOGLETEST_PATH "/usr/src/googletest" CACHE PATH
+                    "Path to the googletest root tree. Should contain googletest and googlemock subdirs. And CMakeLists.txt in root, and in both of these subdirs")
+
+# Download and install GoogleTest
+
+message(STATUS "Looking for Google Test sources")
+message(STATUS "Looking for Google Test sources in ${GOOGLETEST_PATH}")
+if(EXISTS "${GOOGLETEST_PATH}"            AND IS_DIRECTORY "${GOOGLETEST_PATH}"            AND EXISTS "${GOOGLETEST_PATH}/CMakeLists.txt" AND
+   EXISTS "${GOOGLETEST_PATH}/googletest" AND IS_DIRECTORY "${GOOGLETEST_PATH}/googletest" AND EXISTS "${GOOGLETEST_PATH}/googletest/CMakeLists.txt" AND
+   EXISTS "${GOOGLETEST_PATH}/googlemock" AND IS_DIRECTORY "${GOOGLETEST_PATH}/googlemock" AND EXISTS "${GOOGLETEST_PATH}/googlemock/CMakeLists.txt")
+  message(STATUS "Found Google Test in ${GOOGLETEST_PATH}")
+
+  ExternalProject_Add(
+    googletest
+    PREFIX            "${CMAKE_BINARY_DIR}"
+    DOWNLOAD_DIR      "${CMAKE_BINARY_DIR}/download"
+    SOURCE_DIR        "${GOOGLETEST_PATH}" # use existing src dir.
+    BINARY_DIR        "${CMAKE_BINARY_DIR}/build"
+    CONFIGURE_COMMAND ""
+    BUILD_COMMAND     ""
+    INSTALL_COMMAND   ""
+    TEST_COMMAND      ""
+  )
+else()
+  if(NOT ALLOW_DOWNLOADING_GOOGLETEST)
+    message(SEND_ERROR "Did not find Google Test sources! Either pass correct path in GOOGLETEST_PATH, or enable BENCHMARK_DOWNLOAD_DEPENDENCIES, or disable BENCHMARK_ENABLE_GTEST_TESTS / BENCHMARK_ENABLE_TESTING.")
+  else()
+    message(WARNING "Did not find Google Test sources! Fetching from web...")
+    ExternalProject_Add(
+      googletest
+      GIT_REPOSITORY    https://github.com/google/googletest.git
+      GIT_TAG           master
+      PREFIX            "${CMAKE_BINARY_DIR}"
+      STAMP_DIR         "${CMAKE_BINARY_DIR}/stamp"
+      DOWNLOAD_DIR      "${CMAKE_BINARY_DIR}/download"
+      SOURCE_DIR        "${CMAKE_BINARY_DIR}/src"
+      BINARY_DIR        "${CMAKE_BINARY_DIR}/build"
+      CONFIGURE_COMMAND ""
+      BUILD_COMMAND     ""
+      INSTALL_COMMAND   ""
+      TEST_COMMAND      ""
+    )
+  endif()
+endif()
+
+ExternalProject_Get_Property(googletest SOURCE_DIR BINARY_DIR)
+file(WRITE googletest-paths.cmake
+"set(GOOGLETEST_SOURCE_DIR \"${SOURCE_DIR}\")
+set(GOOGLETEST_BINARY_DIR \"${BINARY_DIR}\")
+")
diff --git a/cmake/Modules/FindLLVMAr.cmake b/cmake/Modules/FindLLVMAr.cmake
new file mode 100644
index 0000000..2346981
--- /dev/null
+++ b/cmake/Modules/FindLLVMAr.cmake
@@ -0,0 +1,16 @@
+include(FeatureSummary)
+
+find_program(LLVMAR_EXECUTABLE
+  NAMES llvm-ar
+  DOC "The llvm-ar executable"
+  )
+
+include(FindPackageHandleStandardArgs)
+find_package_handle_standard_args(LLVMAr
+  DEFAULT_MSG
+  LLVMAR_EXECUTABLE)
+
+SET_PACKAGE_PROPERTIES(LLVMAr PROPERTIES
+  URL https://llvm.org/docs/CommandGuide/llvm-ar.html
+  DESCRIPTION "create, modify, and extract from archives"
+)
diff --git a/cmake/Modules/FindLLVMNm.cmake b/cmake/Modules/FindLLVMNm.cmake
new file mode 100644
index 0000000..e56430a
--- /dev/null
+++ b/cmake/Modules/FindLLVMNm.cmake
@@ -0,0 +1,16 @@
+include(FeatureSummary)
+
+find_program(LLVMNM_EXECUTABLE
+  NAMES llvm-nm
+  DOC "The llvm-nm executable"
+  )
+
+include(FindPackageHandleStandardArgs)
+find_package_handle_standard_args(LLVMNm
+  DEFAULT_MSG
+  LLVMNM_EXECUTABLE)
+
+SET_PACKAGE_PROPERTIES(LLVMNm PROPERTIES
+  URL https://llvm.org/docs/CommandGuide/llvm-nm.html
+  DESCRIPTION "list LLVM bitcode and object file’s symbol table"
+)
diff --git a/cmake/Modules/FindLLVMRanLib.cmake b/cmake/Modules/FindLLVMRanLib.cmake
new file mode 100644
index 0000000..7b53e1a
--- /dev/null
+++ b/cmake/Modules/FindLLVMRanLib.cmake
@@ -0,0 +1,15 @@
+include(FeatureSummary)
+
+find_program(LLVMRANLIB_EXECUTABLE
+  NAMES llvm-ranlib
+  DOC "The llvm-ranlib executable"
+  )
+
+include(FindPackageHandleStandardArgs)
+find_package_handle_standard_args(LLVMRanLib
+  DEFAULT_MSG
+  LLVMRANLIB_EXECUTABLE)
+
+SET_PACKAGE_PROPERTIES(LLVMRanLib PROPERTIES
+  DESCRIPTION "generate index for LLVM archive"
+)
diff --git a/cmake/benchmark.pc.in b/cmake/benchmark.pc.in
new file mode 100644
index 0000000..34beb01
--- /dev/null
+++ b/cmake/benchmark.pc.in
@@ -0,0 +1,12 @@
+prefix=@CMAKE_INSTALL_PREFIX@
+exec_prefix=${prefix}
+libdir=${prefix}/@CMAKE_INSTALL_LIBDIR@
+includedir=${prefix}/@CMAKE_INSTALL_INCLUDEDIR@
+
+Name: @PROJECT_NAME@
+Description: Google microbenchmark framework
+Version: @VERSION@
+
+Libs: -L${libdir} -lbenchmark
+Libs.private: -lpthread
+Cflags: -I${includedir}
diff --git a/cmake/llvm-toolchain.cmake b/cmake/llvm-toolchain.cmake
new file mode 100644
index 0000000..fc119e5
--- /dev/null
+++ b/cmake/llvm-toolchain.cmake
@@ -0,0 +1,8 @@
+find_package(LLVMAr REQUIRED)
+set(CMAKE_AR "${LLVMAR_EXECUTABLE}" CACHE FILEPATH "" FORCE)
+
+find_package(LLVMNm REQUIRED)
+set(CMAKE_NM "${LLVMNM_EXECUTABLE}" CACHE FILEPATH "" FORCE)
+
+find_package(LLVMRanLib REQUIRED)
+set(CMAKE_RANLIB "${LLVMRANLIB_EXECUTABLE}" CACHE FILEPATH "" FORCE)
diff --git a/cmake/split_list.cmake b/cmake/split_list.cmake
new file mode 100644
index 0000000..67aed3f
--- /dev/null
+++ b/cmake/split_list.cmake
@@ -0,0 +1,3 @@
+macro(split_list listname)
+  string(REPLACE ";" " " ${listname} "${${listname}}")
+endmacro()
diff --git a/conan/CMakeLists.txt b/conan/CMakeLists.txt
new file mode 100644
index 0000000..15b92ca
--- /dev/null
+++ b/conan/CMakeLists.txt
@@ -0,0 +1,7 @@
+cmake_minimum_required(VERSION 2.8.11)
+project(cmake_wrapper)
+
+include(conanbuildinfo.cmake)
+conan_basic_setup()
+
+include(${CMAKE_SOURCE_DIR}/CMakeListsOriginal.txt)
diff --git a/conan/test_package/CMakeLists.txt b/conan/test_package/CMakeLists.txt
new file mode 100644
index 0000000..089a6c7
--- /dev/null
+++ b/conan/test_package/CMakeLists.txt
@@ -0,0 +1,10 @@
+cmake_minimum_required(VERSION 2.8.11)
+project(test_package)
+
+set(CMAKE_VERBOSE_MAKEFILE TRUE)
+
+include(${CMAKE_BINARY_DIR}/conanbuildinfo.cmake)
+conan_basic_setup()
+
+add_executable(${PROJECT_NAME} test_package.cpp)
+target_link_libraries(${PROJECT_NAME} ${CONAN_LIBS})
diff --git a/conan/test_package/conanfile.py b/conan/test_package/conanfile.py
new file mode 100644
index 0000000..d63f408
--- /dev/null
+++ b/conan/test_package/conanfile.py
@@ -0,0 +1,19 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+from conans import ConanFile, CMake
+import os
+
+
+class TestPackageConan(ConanFile):
+    settings = "os", "compiler", "build_type", "arch"
+    generators = "cmake"
+
+    def build(self):
+        cmake = CMake(self)
+        cmake.configure()
+        cmake.build()
+
+    def test(self):
+        bin_path = os.path.join("bin", "test_package")
+        self.run(bin_path, run_environment=True)
diff --git a/conan/test_package/test_package.cpp b/conan/test_package/test_package.cpp
new file mode 100644
index 0000000..4fa7ec0
--- /dev/null
+++ b/conan/test_package/test_package.cpp
@@ -0,0 +1,18 @@
+#include "benchmark/benchmark.h"
+
+void BM_StringCreation(benchmark::State& state) {
+    while (state.KeepRunning())
+        std::string empty_string;
+}
+
+BENCHMARK(BM_StringCreation);
+
+void BM_StringCopy(benchmark::State& state) {
+    std::string x = "hello";
+    while (state.KeepRunning())
+        std::string copy(x);
+}
+
+BENCHMARK(BM_StringCopy);
+
+BENCHMARK_MAIN();
diff --git a/conanfile.py b/conanfile.py
new file mode 100644
index 0000000..e31fc52
--- /dev/null
+++ b/conanfile.py
@@ -0,0 +1,79 @@
+from conans import ConanFile, CMake, tools
+from conans.errors import ConanInvalidConfiguration
+import shutil
+import os
+
+
+class GoogleBenchmarkConan(ConanFile):
+    name = "benchmark"
+    description = "A microbenchmark support library."
+    topics = ("conan", "benchmark", "google", "microbenchmark")
+    url = "https://github.com/google/benchmark"
+    homepage = "https://github.com/google/benchmark"
+    author = "Google Inc."
+    license = "Apache-2.0"
+    exports_sources = ["*"]
+    generators = "cmake"
+
+    settings = "arch", "build_type", "compiler", "os"
+    options = {
+        "shared": [True, False],
+        "fPIC": [True, False],
+        "enable_lto": [True, False],
+        "enable_exceptions": [True, False]
+    }
+    default_options = {"shared": False, "fPIC": True, "enable_lto": False, "enable_exceptions": True}
+
+    _build_subfolder = "."
+
+    def source(self):
+        # Wrap the original CMake file to call conan_basic_setup
+        shutil.move("CMakeLists.txt", "CMakeListsOriginal.txt")
+        shutil.move(os.path.join("conan", "CMakeLists.txt"), "CMakeLists.txt")
+
+    def config_options(self):
+        if self.settings.os == "Windows":
+            if self.settings.compiler == "Visual Studio" and float(self.settings.compiler.version.value) <= 12:
+                raise ConanInvalidConfiguration("{} {} does not support Visual Studio <= 12".format(self.name, self.version))
+            del self.options.fPIC
+
+    def configure(self):
+        if self.settings.os == "Windows" and self.options.shared:
+            raise ConanInvalidConfiguration("Windows shared builds are not supported right now, see issue #639")
+
+    def _configure_cmake(self):
+        cmake = CMake(self)
+
+        cmake.definitions["BENCHMARK_ENABLE_TESTING"] = "OFF"
+        cmake.definitions["BENCHMARK_ENABLE_GTEST_TESTS"] = "OFF"
+        cmake.definitions["BENCHMARK_ENABLE_LTO"] = "ON" if self.options.enable_lto else "OFF"
+        cmake.definitions["BENCHMARK_ENABLE_EXCEPTIONS"] = "ON" if self.options.enable_exceptions else "OFF"
+
+        # See https://github.com/google/benchmark/pull/638 for Windows 32 build explanation
+        if self.settings.os != "Windows":
+            cmake.definitions["BENCHMARK_BUILD_32_BITS"] = "ON" if "64" not in str(self.settings.arch) else "OFF"
+            cmake.definitions["BENCHMARK_USE_LIBCXX"] = "ON" if (str(self.settings.compiler.libcxx) == "libc++") else "OFF"
+        else:
+            cmake.definitions["BENCHMARK_USE_LIBCXX"] = "OFF"
+
+        cmake.configure(build_folder=self._build_subfolder)
+        return cmake
+
+    def build(self):
+        cmake = self._configure_cmake()
+        cmake.build()
+
+    def package(self):
+        cmake = self._configure_cmake()
+        cmake.install()
+
+        self.copy(pattern="LICENSE", dst="licenses")
+
+    def package_info(self):
+        self.cpp_info.libs = tools.collect_libs(self)
+        if self.settings.os == "Linux":
+            self.cpp_info.libs.extend(["pthread", "rt"])
+        elif self.settings.os == "Windows":
+            self.cpp_info.libs.append("shlwapi")
+        elif self.settings.os == "SunOS":
+            self.cpp_info.libs.append("kstat")
diff --git a/dependencies.md b/dependencies.md
new file mode 100644
index 0000000..6289b4e
--- /dev/null
+++ b/dependencies.md
@@ -0,0 +1,18 @@
+# Build tool dependency policy
+
+To ensure the broadest compatibility when building the benchmark library, but
+still allow forward progress, we require any build tooling to be available for:
+
+* Debian stable AND
+* The last two Ubuntu LTS releases
+
+Currently, this means using build tool versions that are available for Ubuntu
+16.04 (Xenial), Ubuntu 18.04 (Bionic), and Debian stretch.
+
+_Note, [travis](.travis.yml) runs under Ubuntu 14.04 (Trusty) for linux builds._
+
+## cmake
+The current supported version is cmake 3.5.1 as of 2018-06-06.
+
+_Note, this version is also available for Ubuntu 14.04, the previous Ubuntu LTS
+release, as `cmake3`._
diff --git a/docs/AssemblyTests.md b/docs/AssemblyTests.md
new file mode 100644
index 0000000..1fbdc26
--- /dev/null
+++ b/docs/AssemblyTests.md
@@ -0,0 +1,147 @@
+# Assembly Tests
+
+The Benchmark library provides a number of functions whose primary
+purpose is to affect assembly generation, including `DoNotOptimize`
+and `ClobberMemory`. In addition there are other functions,
+such as `KeepRunning`, for which generating good assembly is paramount.
+
+For these functions it's important to have tests that verify the
+correctness and quality of the implementation. This requires testing
+the code generated by the compiler.
+
+This document describes how the Benchmark library tests compiler output,
+as well as how to properly write new tests.
+
+
+## Anatomy of a Test
+
+Writing a test has two steps:
+
+* Write the code you want to generate assembly for.
+* Add `// CHECK` lines to match against the verified assembly.
+
+Example:
+```c++
+
+// CHECK-LABEL: test_add:
+extern "C" int test_add() {
+    extern int ExternInt;
+    return ExternInt + 1;
+
+    // CHECK: movl ExternInt(%rip), %eax
+    // CHECK: addl %eax
+    // CHECK: ret
+}
+
+```
+
+#### LLVM Filecheck
+
+[LLVM's Filecheck](https://llvm.org/docs/CommandGuide/FileCheck.html)
+is used to test the generated assembly against the `// CHECK` lines
+specified in the tests source file. Please see the documentation
+linked above for information on how to write `CHECK` directives.
+
+#### Tips and Tricks:
+
+* Tests should match the minimal amount of output required to establish
+correctness. `CHECK` directives don't have to match on the exact next line
+after the previous match, so tests should omit checks for unimportant
+bits of assembly. ([`CHECK-NEXT`](https://llvm.org/docs/CommandGuide/FileCheck.html#the-check-next-directive)
+can be used to ensure a match occurs exactly after the previous match).
+
+* The tests are compiled with `-O3 -g0`. So we're only testing the
+optimized output.
+
+* The assembly output is further cleaned up using `tools/strip_asm.py`.
+This removes comments, assembler directives, and unused labels before
+the test is run.
+
+* The generated and stripped assembly file for a test is output under
+`<build-directory>/test/<test-name>.s`
+
+* Filecheck supports using [`CHECK` prefixes](https://llvm.org/docs/CommandGuide/FileCheck.html#cmdoption-check-prefixes)
+to specify lines that should only match in certain situations.
+The Benchmark tests use `CHECK-CLANG` and `CHECK-GNU` for lines that
+are only expected to match Clang or GCC's output respectively. Normal
+`CHECK` lines match against all compilers. (Note: `CHECK-NOT` and
+`CHECK-LABEL` are NOT prefixes. They are versions of non-prefixed
+`CHECK` lines)
+
+* Use `extern "C"` to disable name mangling for specific functions. This
+makes them easier to name in the `CHECK` lines.
+
+
+## Problems Writing Portable Tests
+
+Tests which check the code generated by a compiler are
+inherently non-portable. Different compilers and even different compiler
+versions may generate entirely different code. The Benchmark tests
+must tolerate this.
+
+LLVM Filecheck provides a number of mechanisms to help write
+"more portable" tests; including [matching using regular expressions](https://llvm.org/docs/CommandGuide/FileCheck.html#filecheck-pattern-matching-syntax),
+allowing the creation of [named variables](https://llvm.org/docs/CommandGuide/FileCheck.html#filecheck-variables)
+for later matching, and [checking non-sequential matches](https://llvm.org/docs/CommandGuide/FileCheck.html#the-check-dag-directive).
+
+#### Capturing Variables
+
+For example, say GCC stores a variable in a register but Clang stores
+it in memory. To write a test that tolerates both cases we "capture"
+the destination of the store, and then use the captured expression
+to write the remainder of the test.
+
+```c++
+// CHECK-LABEL: test_div_no_op_into_shr:
+extern "C" int test_div_no_op_into_shr(int value) {
+    int divisor = 2;
+    benchmark::DoNotOptimize(divisor); // hide the value from the optimizer
+    return value / divisor;
+
+    // CHECK: movl $2, [[DEST:.*]]
+    // CHECK: idivl [[DEST]]
+    // CHECK: ret
+}
+```
+
+#### Using Regular Expressions to Match Differing Output
+
+Often tests require testing assembly lines which may subtly differ
+between compilers or compiler versions. A common example of this
+is matching stack frame addresses. In this case regular expressions
+can be used to match the differing bits of output. For example:
+
+```c++
+int ExternInt;
+struct Point { int x, y, z; };
+
+// CHECK-LABEL: test_store_point:
+extern "C" void test_store_point() {
+    Point p{ExternInt, ExternInt, ExternInt};
+    benchmark::DoNotOptimize(p);
+
+    // CHECK: movl ExternInt(%rip), %eax
+    // CHECK: movl %eax, -{{[0-9]+}}(%rsp)
+    // CHECK: movl %eax, -{{[0-9]+}}(%rsp)
+    // CHECK: movl %eax, -{{[0-9]+}}(%rsp)
+    // CHECK: ret
+}
+```
+
+## Current Requirements and Limitations
+
+The tests require Filecheck to be installed along the `PATH` of the
+build machine. Otherwise the tests will be disabled.
+
+Additionally, as mentioned in the previous section, codegen tests are
+inherently non-portable. Currently the tests are limited to:
+
+* x86_64 targets.
+* Compiled with GCC or Clang
+
+Further work could be done, at least on a limited basis, to extend the
+tests to other architectures and compilers (using `CHECK` prefixes).
+
+Furthermore, the tests fail for builds which specify additional flags
+that modify code generation, including `--coverage` or `-fsanitize=`.
+
diff --git a/docs/_config.yml b/docs/_config.yml
new file mode 100644
index 0000000..1885487
--- /dev/null
+++ b/docs/_config.yml
@@ -0,0 +1 @@
+theme: jekyll-theme-midnight
+\ No newline at end of file
diff --git a/docs/releasing.md b/docs/releasing.md
new file mode 100644
index 0000000..f0cd701
--- /dev/null
+++ b/docs/releasing.md
@@ -0,0 +1,16 @@
+# How to release
+
+* Make sure you're on master and synced to HEAD
+* Ensure the project builds and tests run (sanity check only, obviously)
+    * `parallel -j0 exec ::: test/*_test` can help ensure everything at least
+      passes
+* Prepare release notes
+    * `git log $(git describe --abbrev=0 --tags)..HEAD` gives you the list of
+      commits between the last annotated tag and HEAD
+    * Pick the most interesting.
+* Create a release through github's interface
+    * Note this will create a lightweight tag.
+    * Update this to an annotated tag:
+      * `git pull --tags`
+      * `git tag -a -f <tag> <tag>`
+      * `git push --force origin`
diff --git a/docs/tools.md b/docs/tools.md
new file mode 100644
index 0000000..f2d0c49
--- /dev/null
+++ b/docs/tools.md
@@ -0,0 +1,203 @@
+# Benchmark Tools
+
+## compare.py
+
+The `compare.py` script can be used to compare the results of benchmarks.
+
+### Dependencies
+The utility relies on the [scipy](https://www.scipy.org) package which can be installed using pip:
+```bash
+pip3 install -r requirements.txt
+```
+
+### Displaying aggregates only
+
+The switch `-a` / `--display_aggregates_only` can be used to control the
+display of the normal iterations vs the aggregates. When passed, it will
+be passed through to the benchmark binaries to be run, and will be accounted for
+in the tool itself; only the aggregates will be displayed, but not normal runs.
+It only affects the display; the separate runs will still be used to calculate
+the U test.
+
+### Modes of operation
+
+There are three modes of operation:
+
+1. Just compare two benchmarks
+The program is invoked like:
+
+``` bash
+$ compare.py benchmarks <benchmark_baseline> <benchmark_contender> [benchmark options]...
+```
+Where `<benchmark_baseline>` and `<benchmark_contender>` either specify a benchmark executable file, or a JSON output file. The type of the input file is automatically detected. If a benchmark executable is specified then the benchmark is run to obtain the results. Otherwise the results are simply loaded from the output file.
+
+`[benchmark options]` will be passed to the benchmarks invocations. They can be anything that binary accepts, be it either normal `--benchmark_*` parameters, or some custom parameters your binary takes.
+
+Example output:
+```
+$ ./compare.py benchmarks ./a.out ./a.out
+RUNNING: ./a.out --benchmark_out=/tmp/tmprBT5nW
+Run on (8 X 4000 MHz CPU s)
+2017-11-07 21:16:44
+------------------------------------------------------
+Benchmark               Time           CPU Iterations
+------------------------------------------------------
+BM_memcpy/8            36 ns         36 ns   19101577   211.669MB/s
+BM_memcpy/64           76 ns         76 ns    9412571   800.199MB/s
+BM_memcpy/512          84 ns         84 ns    8249070   5.64771GB/s
+BM_memcpy/1024        116 ns        116 ns    6181763   8.19505GB/s
+BM_memcpy/8192        643 ns        643 ns    1062855   11.8636GB/s
+BM_copy/8             222 ns        222 ns    3137987   34.3772MB/s
+BM_copy/64           1608 ns       1608 ns     432758   37.9501MB/s
+BM_copy/512         12589 ns      12589 ns      54806   38.7867MB/s
+BM_copy/1024        25169 ns      25169 ns      27713   38.8003MB/s
+BM_copy/8192       201165 ns     201112 ns       3486   38.8466MB/s
+RUNNING: ./a.out --benchmark_out=/tmp/tmpt1wwG_
+Run on (8 X 4000 MHz CPU s)
+2017-11-07 21:16:53
+------------------------------------------------------
+Benchmark               Time           CPU Iterations
+------------------------------------------------------
+BM_memcpy/8            36 ns         36 ns   19397903   211.255MB/s
+BM_memcpy/64           73 ns         73 ns    9691174   839.635MB/s
+BM_memcpy/512          85 ns         85 ns    8312329   5.60101GB/s
+BM_memcpy/1024        118 ns        118 ns    6438774   8.11608GB/s
+BM_memcpy/8192        656 ns        656 ns    1068644   11.6277GB/s
+BM_copy/8             223 ns        223 ns    3146977   34.2338MB/s
+BM_copy/64           1611 ns       1611 ns     435340   37.8751MB/s
+BM_copy/512         12622 ns      12622 ns      54818   38.6844MB/s
+BM_copy/1024        25257 ns      25239 ns      27779   38.6927MB/s
+BM_copy/8192       205013 ns     205010 ns       3479    38.108MB/s
+Comparing ./a.out to ./a.out
+Benchmark                 Time             CPU      Time Old      Time New       CPU Old       CPU New
+------------------------------------------------------------------------------------------------------
+BM_memcpy/8            +0.0020         +0.0020            36            36            36            36
+BM_memcpy/64           -0.0468         -0.0470            76            73            76            73
+BM_memcpy/512          +0.0081         +0.0083            84            85            84            85
+BM_memcpy/1024         +0.0098         +0.0097           116           118           116           118
+BM_memcpy/8192         +0.0200         +0.0203           643           656           643           656
+BM_copy/8              +0.0046         +0.0042           222           223           222           223
+BM_copy/64             +0.0020         +0.0020          1608          1611          1608          1611
+BM_copy/512            +0.0027         +0.0026         12589         12622         12589         12622
+BM_copy/1024           +0.0035         +0.0028         25169         25257         25169         25239
+BM_copy/8192           +0.0191         +0.0194        201165        205013        201112        205010
+```
+
+For every benchmark from the first run, it looks for the benchmark with exactly the same name in the second run, and then compares the results. If the names differ, the benchmark is omitted from the diff.
+As you can note, the values in `Time` and `CPU` columns are calculated as `(new - old) / |old|`.
+
+2. Compare two different filters of one benchmark
+The program is invoked like:
+
+``` bash
+$ compare.py filters <benchmark> <filter_baseline> <filter_contender> [benchmark options]...
+```
+Where `<benchmark>` either specifies a benchmark executable file, or a JSON output file. The type of the input file is automatically detected. If a benchmark executable is specified then the benchmark is run to obtain the results. Otherwise the results are simply loaded from the output file.
+
+Where `<filter_baseline>` and `<filter_contender>` are the same regex filters that you would pass to the `[--benchmark_filter=<regex>]` parameter of the benchmark binary.
+
+`[benchmark options]` will be passed to the benchmarks invocations. They can be anything that binary accepts, be it either normal `--benchmark_*` parameters, or some custom parameters your binary takes.
+
+Example output:
+```
+$ ./compare.py filters ./a.out BM_memcpy BM_copy
+RUNNING: ./a.out --benchmark_filter=BM_memcpy --benchmark_out=/tmp/tmpBWKk0k
+Run on (8 X 4000 MHz CPU s)
+2017-11-07 21:37:28
+------------------------------------------------------
+Benchmark               Time           CPU Iterations
+------------------------------------------------------
+BM_memcpy/8            36 ns         36 ns   17891491   211.215MB/s
+BM_memcpy/64           74 ns         74 ns    9400999   825.646MB/s
+BM_memcpy/512          87 ns         87 ns    8027453   5.46126GB/s
+BM_memcpy/1024        111 ns        111 ns    6116853    8.5648GB/s
+BM_memcpy/8192        657 ns        656 ns    1064679   11.6247GB/s
+RUNNING: ./a.out --benchmark_filter=BM_copy --benchmark_out=/tmp/tmpAvWcOM
+Run on (8 X 4000 MHz CPU s)
+2017-11-07 21:37:33
+----------------------------------------------------
+Benchmark             Time           CPU Iterations
+----------------------------------------------------
+BM_copy/8           227 ns        227 ns    3038700   33.6264MB/s
+BM_copy/64         1640 ns       1640 ns     426893   37.2154MB/s
+BM_copy/512       12804 ns      12801 ns      55417   38.1444MB/s
+BM_copy/1024      25409 ns      25407 ns      27516   38.4365MB/s
+BM_copy/8192     202986 ns     202990 ns       3454   38.4871MB/s
+Comparing BM_memcpy to BM_copy (from ./a.out)
+Benchmark                               Time             CPU      Time Old      Time New       CPU Old       CPU New
+--------------------------------------------------------------------------------------------------------------------
+[BM_memcpy vs. BM_copy]/8            +5.2829         +5.2812            36           227            36           227
+[BM_memcpy vs. BM_copy]/64          +21.1719        +21.1856            74          1640            74          1640
+[BM_memcpy vs. BM_copy]/512        +145.6487       +145.6097            87         12804            87         12801
+[BM_memcpy vs. BM_copy]/1024       +227.1860       +227.1776           111         25409           111         25407
+[BM_memcpy vs. BM_copy]/8192       +308.1664       +308.2898           657        202986           656        202990
+```
+
+As you can see, it applies the filters to the benchmarks, both when running the benchmark, and before doing the diff. And to make the diff work, the matches are replaced with some common string. Thus, you can compare two different benchmark families within one benchmark binary.
+As you can note, the values in `Time` and `CPU` columns are calculated as `(new - old) / |old|`.
+
+3. Compare filter one from benchmark one to filter two from benchmark two:
+The program is invoked like:
+
+``` bash
+$ compare.py benchmarksfiltered <benchmark_baseline> <filter_baseline> <benchmark_contender> <filter_contender> [benchmark options]...
+```
+
+Where `<benchmark_baseline>` and `<benchmark_contender>` either specify a benchmark executable file, or a JSON output file. The type of the input file is automatically detected. If a benchmark executable is specified then the benchmark is run to obtain the results. Otherwise the results are simply loaded from the output file.
+
+Where `<filter_baseline>` and `<filter_contender>` are the same regex filters that you would pass to the `[--benchmark_filter=<regex>]` parameter of the benchmark binary.
+
+`[benchmark options]` will be passed to the benchmarks invocations. They can be anything that binary accepts, be it either normal `--benchmark_*` parameters, or some custom parameters your binary takes.
+
+Example output:
+```
+$ ./compare.py benchmarksfiltered ./a.out BM_memcpy ./a.out BM_copy
+RUNNING: ./a.out --benchmark_filter=BM_memcpy --benchmark_out=/tmp/tmp_FvbYg
+Run on (8 X 4000 MHz CPU s)
+2017-11-07 21:38:27
+------------------------------------------------------
+Benchmark               Time           CPU Iterations
+------------------------------------------------------
+BM_memcpy/8            37 ns         37 ns   18953482   204.118MB/s
+BM_memcpy/64           74 ns         74 ns    9206578   828.245MB/s
+BM_memcpy/512          91 ns         91 ns    8086195   5.25476GB/s
+BM_memcpy/1024        120 ns        120 ns    5804513   7.95662GB/s
+BM_memcpy/8192        664 ns        664 ns    1028363   11.4948GB/s
+RUNNING: ./a.out --benchmark_filter=BM_copy --benchmark_out=/tmp/tmpDfL5iE
+Run on (8 X 4000 MHz CPU s)
+2017-11-07 21:38:32
+----------------------------------------------------
+Benchmark             Time           CPU Iterations
+----------------------------------------------------
+BM_copy/8           230 ns        230 ns    2985909   33.1161MB/s
+BM_copy/64         1654 ns       1653 ns     419408   36.9137MB/s
+BM_copy/512       13122 ns      13120 ns      53403   37.2156MB/s
+BM_copy/1024      26679 ns      26666 ns      26575   36.6218MB/s
+BM_copy/8192     215068 ns     215053 ns       3221   36.3283MB/s
+Comparing BM_memcpy (from ./a.out) to BM_copy (from ./a.out)
+Benchmark                               Time             CPU      Time Old      Time New       CPU Old       CPU New
+--------------------------------------------------------------------------------------------------------------------
+[BM_memcpy vs. BM_copy]/8            +5.1649         +5.1637            37           230            37           230
+[BM_memcpy vs. BM_copy]/64          +21.4352        +21.4374            74          1654            74          1653
+[BM_memcpy vs. BM_copy]/512        +143.6022       +143.5865            91         13122            91         13120
+[BM_memcpy vs. BM_copy]/1024       +221.5903       +221.4790           120         26679           120         26666
+[BM_memcpy vs. BM_copy]/8192       +322.9059       +323.0096           664        215068           664        215053
+```
+This is a mix of the previous two modes, two (potentially different) benchmark binaries are run, and a different filter is applied to each one.
+As you can note, the values in `Time` and `CPU` columns are calculated as `(new - old) / |old|`.
+
+### U test
+
+If there is a sufficient repetition count of the benchmarks, the tool can do
+a [U Test](https://en.wikipedia.org/wiki/Mann%E2%80%93Whitney_U_test), of the
+null hypothesis that it is equally likely that a randomly selected value from
+one sample will be less than or greater than a randomly selected value from a
+second sample.
+
+If the calculated p-value is lower than the significance
+level alpha, then the result is said to be statistically significant and the
+null hypothesis is rejected. In other words, this means that the two benchmarks
+aren't identical.
+
+**WARNING**: requires **LARGE** (no less than 9) number of repetitions to be
+meaningful!
diff --git a/include/benchmark/benchmark.h b/include/benchmark/benchmark.h
index 18aa9e6..f57e3e7 100644
--- a/include/benchmark/benchmark.h
+++ b/include/benchmark/benchmark.h
@@ -11,11 +11,1598 @@
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.
+
+// Support for registering benchmarks for functions.
+
+/* Example usage:
+// Define a function that executes the code to be measured a
+// specified number of times:
+static void BM_StringCreation(benchmark::State& state) {
+  for (auto _ : state)
+    std::string empty_string;
+}
+
+// Register the function as a benchmark
+BENCHMARK(BM_StringCreation);
+
+// Define another benchmark
+static void BM_StringCopy(benchmark::State& state) {
+  std::string x = "hello";
+  for (auto _ : state)
+    std::string copy(x);
+}
+BENCHMARK(BM_StringCopy);
+
+// Augment the main() program to invoke benchmarks if specified
+// via the --benchmark_filter command line flag.  E.g.,
+//       my_unittest --benchmark_filter=all
+//       my_unittest --benchmark_filter=BM_StringCreation
+//       my_unittest --benchmark_filter=String
+//       my_unittest --benchmark_filter='Copy|Creation'
+int main(int argc, char** argv) {
+  benchmark::Initialize(&argc, argv);
+  benchmark::RunSpecifiedBenchmarks();
+  return 0;
+}
+
+// Sometimes a family of microbenchmarks can be implemented with
+// just one routine that takes an extra argument to specify which
+// one of the family of benchmarks to run.  For example, the following
+// code defines a family of microbenchmarks for measuring the speed
+// of memcpy() calls of different lengths:
+
+static void BM_memcpy(benchmark::State& state) {
+  char* src = new char[state.range(0)]; char* dst = new char[state.range(0)];
+  memset(src, 'x', state.range(0));
+  for (auto _ : state)
+    memcpy(dst, src, state.range(0));
+  state.SetBytesProcessed(state.iterations() * state.range(0));
+  delete[] src; delete[] dst;
+}
+BENCHMARK(BM_memcpy)->Arg(8)->Arg(64)->Arg(512)->Arg(1<<10)->Arg(8<<10);
+
+// The preceding code is quite repetitive, and can be replaced with the
+// following short-hand.  The following invocation will pick a few
+// appropriate arguments in the specified range and will generate a
+// microbenchmark for each such argument.
+BENCHMARK(BM_memcpy)->Range(8, 8<<10);
+
+// You might have a microbenchmark that depends on two inputs.  For
+// example, the following code defines a family of microbenchmarks for
+// measuring the speed of set insertion.
+static void BM_SetInsert(benchmark::State& state) {
+  set<int> data;
+  for (auto _ : state) {
+    state.PauseTiming();
+    data = ConstructRandomSet(state.range(0));
+    state.ResumeTiming();
+    for (int j = 0; j < state.range(1); ++j)
+      data.insert(RandomNumber());
+  }
+}
+BENCHMARK(BM_SetInsert)
+   ->Args({1<<10, 128})
+   ->Args({2<<10, 128})
+   ->Args({4<<10, 128})
+   ->Args({8<<10, 128})
+   ->Args({1<<10, 512})
+   ->Args({2<<10, 512})
+   ->Args({4<<10, 512})
+   ->Args({8<<10, 512});
+
+// The preceding code is quite repetitive, and can be replaced with
+// the following short-hand.  The following macro will pick a few
+// appropriate arguments in the product of the two specified ranges
+// and will generate a microbenchmark for each such pair.
+BENCHMARK(BM_SetInsert)->Ranges({{1<<10, 8<<10}, {128, 512}});
+
+// For more complex patterns of inputs, passing a custom function
+// to Apply allows programmatic specification of an
+// arbitrary set of arguments to run the microbenchmark on.
+// The following example enumerates a dense range on
+// one parameter, and a sparse range on the second.
+static void CustomArguments(benchmark::internal::Benchmark* b) {
+  for (int i = 0; i <= 10; ++i)
+    for (int j = 32; j <= 1024*1024; j *= 8)
+      b->Args({i, j});
+}
+BENCHMARK(BM_SetInsert)->Apply(CustomArguments);
+
+// Templated microbenchmarks work the same way:
+// Produce then consume 'size' messages 'iters' times
+// Measures throughput in the absence of multiprogramming.
+template <class Q> void BM_Sequential(benchmark::State& state) {
+  Q q;
+  typename Q::value_type v;
+  for (auto _ : state) {
+    for (int i = state.range(0); i--; )
+      q.push(v);
+    for (int e = state.range(0); e--; )
+      q.Wait(&v);
+  }
+  // actually messages, not bytes:
+  state.SetBytesProcessed(state.iterations() * state.range(0));
+}
+BENCHMARK_TEMPLATE(BM_Sequential, WaitQueue<int>)->Range(1<<0, 1<<10);
+
+Use `Benchmark::MinTime(double t)` to set the minimum time used to run the
+benchmark. This option overrides the `benchmark_min_time` flag.
+
+void BM_test(benchmark::State& state) {
+ ... body ...
+}
+BENCHMARK(BM_test)->MinTime(2.0); // Run for at least 2 seconds.
+
+In a multithreaded test, it is guaranteed that none of the threads will start
+until all have reached the loop start, and all will have finished before any
+thread exits the loop body. As such, any global setup or teardown you want to
+do can be wrapped in a check against the thread index:
+
+static void BM_MultiThreaded(benchmark::State& state) {
+  if (state.thread_index == 0) {
+    // Setup code here.
+  }
+  for (auto _ : state) {
+    // Run the test as normal.
+  }
+  if (state.thread_index == 0) {
+    // Teardown code here.
+  }
+}
+BENCHMARK(BM_MultiThreaded)->Threads(4);
+
+
+If a benchmark runs a few milliseconds it may be hard to visually compare the
+measured times, since the output data is given in nanoseconds by default. In
+order to change the time unit, you can specify it manually:
+
+BENCHMARK(BM_test)->Unit(benchmark::kMillisecond);
+*/
+
 #ifndef BENCHMARK_BENCHMARK_H_
 #define BENCHMARK_BENCHMARK_H_
 
-#include "macros.h"
-#include "benchmark_api.h"
-#include "reporter.h"
+// The _MSVC_LANG check should detect Visual Studio 2015 Update 3 and newer.
+#if __cplusplus >= 201103L || (defined(_MSVC_LANG) && _MSVC_LANG >= 201103L)
+#define BENCHMARK_HAS_CXX11
+#endif
+
+#include <stdint.h>
+
+#include <algorithm>
+#include <cassert>
+#include <cstddef>
+#include <iosfwd>
+#include <map>
+#include <set>
+#include <string>
+#include <utility>
+#include <vector>
+
+#if defined(BENCHMARK_HAS_CXX11)
+#include <initializer_list>
+#include <type_traits>
+#include <utility>
+#endif
+
+#if defined(_MSC_VER)
+#include <intrin.h>  // for _ReadWriteBarrier
+#endif
+
+#ifndef BENCHMARK_HAS_CXX11
+#define BENCHMARK_DISALLOW_COPY_AND_ASSIGN(TypeName) \
+  TypeName(const TypeName&);                         \
+  TypeName& operator=(const TypeName&)
+#else
+#define BENCHMARK_DISALLOW_COPY_AND_ASSIGN(TypeName) \
+  TypeName(const TypeName&) = delete;                \
+  TypeName& operator=(const TypeName&) = delete
+#endif
+
+#if defined(__GNUC__)
+#define BENCHMARK_UNUSED __attribute__((unused))
+#define BENCHMARK_ALWAYS_INLINE __attribute__((always_inline))
+#define BENCHMARK_NOEXCEPT noexcept
+#define BENCHMARK_NOEXCEPT_OP(x) noexcept(x)
+#elif defined(_MSC_VER) && !defined(__clang__)
+#define BENCHMARK_UNUSED
+#define BENCHMARK_ALWAYS_INLINE __forceinline
+#if _MSC_VER >= 1900
+#define BENCHMARK_NOEXCEPT noexcept
+#define BENCHMARK_NOEXCEPT_OP(x) noexcept(x)
+#else
+#define BENCHMARK_NOEXCEPT
+#define BENCHMARK_NOEXCEPT_OP(x)
+#endif
+#define __func__ __FUNCTION__
+#else
+#define BENCHMARK_UNUSED
+#define BENCHMARK_ALWAYS_INLINE
+#define BENCHMARK_NOEXCEPT
+#define BENCHMARK_NOEXCEPT_OP(x)
+#endif
+
+#define BENCHMARK_INTERNAL_TOSTRING2(x) #x
+#define BENCHMARK_INTERNAL_TOSTRING(x) BENCHMARK_INTERNAL_TOSTRING2(x)
+
+#if defined(__GNUC__) || defined(__clang__)
+#define BENCHMARK_BUILTIN_EXPECT(x, y) __builtin_expect(x, y)
+#define BENCHMARK_DEPRECATED_MSG(msg) __attribute__((deprecated(msg)))
+#else
+#define BENCHMARK_BUILTIN_EXPECT(x, y) x
+#define BENCHMARK_DEPRECATED_MSG(msg)
+#define BENCHMARK_WARNING_MSG(msg)                           \
+  __pragma(message(__FILE__ "(" BENCHMARK_INTERNAL_TOSTRING( \
+      __LINE__) ") : warning note: " msg))
+#endif
+
+#if defined(__GNUC__) && !defined(__clang__)
+#define BENCHMARK_GCC_VERSION (__GNUC__ * 100 + __GNUC_MINOR__)
+#endif
+
+#ifndef __has_builtin
+#define __has_builtin(x) 0
+#endif
+
+#if defined(__GNUC__) || __has_builtin(__builtin_unreachable)
+#define BENCHMARK_UNREACHABLE() __builtin_unreachable()
+#elif defined(_MSC_VER)
+#define BENCHMARK_UNREACHABLE() __assume(false)
+#else
+#define BENCHMARK_UNREACHABLE() ((void)0)
+#endif
+
+namespace benchmark {
+class BenchmarkReporter;
+class MemoryManager;
+
+void Initialize(int* argc, char** argv);
+
+// Report to stdout all arguments in 'argv' as unrecognized except the first.
+// Returns true if there is at least one unrecognized argument (i.e. 'argc' > 1).
+bool ReportUnrecognizedArguments(int argc, char** argv);
+
+// Generate a list of benchmarks matching the specified --benchmark_filter flag
+// and if --benchmark_list_tests is specified return after printing the name
+// of each matching benchmark. Otherwise run each matching benchmark and
+// report the results.
+//
+// The second and third overloads use the specified 'display_reporter' and
+//  'file_reporter' respectively. 'file_reporter' will write to the file
+//  specified
+//   by '--benchmark_output'. If '--benchmark_output' is not given the
+//  'file_reporter' is ignored.
+//
+// RETURNS: The number of matching benchmarks.
+size_t RunSpecifiedBenchmarks();
+size_t RunSpecifiedBenchmarks(BenchmarkReporter* display_reporter);
+size_t RunSpecifiedBenchmarks(BenchmarkReporter* display_reporter,
+                              BenchmarkReporter* file_reporter);
+
+// Register a MemoryManager instance that will be used to collect and report
+// allocation measurements for benchmark runs.
+void RegisterMemoryManager(MemoryManager* memory_manager);
+
+namespace internal {
+class Benchmark;
+class BenchmarkImp;
+class BenchmarkFamilies;
+
+void UseCharPointer(char const volatile*);
+
+// Take ownership of the pointer and register the benchmark. Return the
+// registered benchmark.
+Benchmark* RegisterBenchmarkInternal(Benchmark*);
+
+// Ensure that the standard streams are properly initialized in every TU.
+int InitializeStreams();
+BENCHMARK_UNUSED static int stream_init_anchor = InitializeStreams();
+
+}  // namespace internal
+
+#if (!defined(__GNUC__) && !defined(__clang__)) || defined(__pnacl__) || \
+    defined(__EMSCRIPTEN__)
+#define BENCHMARK_HAS_NO_INLINE_ASSEMBLY
+#endif
+
+// The DoNotOptimize(...) function can be used to prevent a value or
+// expression from being optimized away by the compiler. This function is
+// intended to add little to no overhead.
+// See: https://youtu.be/nXaxk27zwlk?t=2441
+#ifndef BENCHMARK_HAS_NO_INLINE_ASSEMBLY
+template <class Tp>
+inline BENCHMARK_ALWAYS_INLINE void DoNotOptimize(Tp const& value) {
+  asm volatile("" : : "r,m"(value) : "memory");
+}
+
+template <class Tp>
+inline BENCHMARK_ALWAYS_INLINE void DoNotOptimize(Tp& value) {
+#if defined(__clang__)
+  asm volatile("" : "+r,m"(value) : : "memory");
+#else
+  asm volatile("" : "+m,r"(value) : : "memory");
+#endif
+}
+
+// Force the compiler to flush pending writes to global memory. Acts as an
+// effective read/write barrier
+inline BENCHMARK_ALWAYS_INLINE void ClobberMemory() {
+  asm volatile("" : : : "memory");
+}
+#elif defined(_MSC_VER)
+template <class Tp>
+inline BENCHMARK_ALWAYS_INLINE void DoNotOptimize(Tp const& value) {
+  internal::UseCharPointer(&reinterpret_cast<char const volatile&>(value));
+  _ReadWriteBarrier();
+}
+
+inline BENCHMARK_ALWAYS_INLINE void ClobberMemory() { _ReadWriteBarrier(); }
+#else
+template <class Tp>
+inline BENCHMARK_ALWAYS_INLINE void DoNotOptimize(Tp const& value) {
+  internal::UseCharPointer(&reinterpret_cast<char const volatile&>(value));
+}
+// FIXME Add ClobberMemory() for non-gnu and non-msvc compilers
+#endif
+
+// This class is used for user-defined counters.
+class Counter {
+ public:
+  enum Flags {
+    kDefaults = 0,
+    // Mark the counter as a rate. It will be presented divided
+    // by the duration of the benchmark.
+    kIsRate = 1U << 0U,
+    // Mark the counter as a thread-average quantity. It will be
+    // presented divided by the number of threads.
+    kAvgThreads = 1U << 1U,
+    // Mark the counter as a thread-average rate. See above.
+    kAvgThreadsRate = kIsRate | kAvgThreads,
+    // Mark the counter as a constant value, valid/same for *every* iteration.
+    // When reporting, it will be *multiplied* by the iteration count.
+    kIsIterationInvariant = 1U << 2U,
+    // Mark the counter as a constant rate.
+    // When reporting, it will be *multiplied* by the iteration count
+    // and then divided by the duration of the benchmark.
+    kIsIterationInvariantRate = kIsRate | kIsIterationInvariant,
+    // Mark the counter as an iteration-average quantity.
+    // It will be presented divided by the number of iterations.
+    kAvgIterations = 1U << 3U,
+    // Mark the counter as an iteration-average rate. See above.
+    kAvgIterationsRate = kIsRate | kAvgIterations,
+
+    // In the end, invert the result. This is always done last!
+    kInvert = 1U << 31U
+  };
+
+  enum OneK {
+    // 1'000 items per 1k
+    kIs1000 = 1000,
+    // 1'024 items per 1k
+    kIs1024 = 1024
+  };
+
+  double value;
+  Flags flags;
+  OneK oneK;
+
+  BENCHMARK_ALWAYS_INLINE
+  Counter(double v = 0., Flags f = kDefaults, OneK k = kIs1000)
+      : value(v), flags(f), oneK(k) {}
+
+  BENCHMARK_ALWAYS_INLINE operator double const&() const { return value; }
+  BENCHMARK_ALWAYS_INLINE operator double&() { return value; }
+};
+
+// A helper for user code to create unforeseen combinations of Flags, without
+// having to do this cast manually each time, or providing this operator.
+Counter::Flags inline operator|(const Counter::Flags& LHS,
+                                const Counter::Flags& RHS) {
+  return static_cast<Counter::Flags>(static_cast<int>(LHS) |
+                                     static_cast<int>(RHS));
+}
+
+// This is the container for the user-defined counters.
+typedef std::map<std::string, Counter> UserCounters;
+
+// TimeUnit is passed to a benchmark in order to specify the order of magnitude
+// for the measured time.
+enum TimeUnit { kNanosecond, kMicrosecond, kMillisecond, kSecond };
+
+// BigO is passed to a benchmark in order to specify the asymptotic
+// computational
+// complexity for the benchmark. In case oAuto is selected, complexity will be
+// calculated automatically to the best fit.
+enum BigO { oNone, o1, oN, oNSquared, oNCubed, oLogN, oNLogN, oAuto, oLambda };
+
+typedef uint64_t IterationCount;
+
+// BigOFunc is passed to a benchmark in order to specify the asymptotic
+// computational complexity for the benchmark.
+typedef double(BigOFunc)(IterationCount);
+
+// StatisticsFunc is passed to a benchmark in order to compute some descriptive
+// statistics over all the measurements of some type
+typedef double(StatisticsFunc)(const std::vector<double>&);
+
+namespace internal {
+struct Statistics {
+  std::string name_;
+  StatisticsFunc* compute_;
+
+  Statistics(const std::string& name, StatisticsFunc* compute)
+      : name_(name), compute_(compute) {}
+};
+
+struct BenchmarkInstance;
+class ThreadTimer;
+class ThreadManager;
+
+enum AggregationReportMode
+#if defined(BENCHMARK_HAS_CXX11)
+    : unsigned
+#else
+#endif
+{
+  // The mode has not been manually specified
+  ARM_Unspecified = 0,
+  // The mode is user-specified.
+  // This may or may not be set when the following bit-flags are set.
+  ARM_Default = 1U << 0U,
+  // File reporter should only output aggregates.
+  ARM_FileReportAggregatesOnly = 1U << 1U,
+  // Display reporter should only output aggregates
+  ARM_DisplayReportAggregatesOnly = 1U << 2U,
+  // Both reporters should only display aggregates.
+  ARM_ReportAggregatesOnly =
+      ARM_FileReportAggregatesOnly | ARM_DisplayReportAggregatesOnly
+};
+
+}  // namespace internal
+
+// State is passed to a running Benchmark and contains state for the
+// benchmark to use.
+class State {
+ public:
+  struct StateIterator;
+  friend struct StateIterator;
+
+  // Returns iterators used to run each iteration of a benchmark using a
+  // C++11 range-based for loop. These functions should not be called directly.
+  //
+  // REQUIRES: The benchmark has not started running yet. Neither begin nor end
+  // have been called previously.
+  //
+  // NOTE: KeepRunning may not be used after calling either of these functions.
+  BENCHMARK_ALWAYS_INLINE StateIterator begin();
+  BENCHMARK_ALWAYS_INLINE StateIterator end();
+
+  // Returns true if the benchmark should continue through another iteration.
+  // NOTE: A benchmark may not return from the test until KeepRunning() has
+  // returned false.
+  bool KeepRunning();
+
+  // Returns true iff the benchmark should run n more iterations.
+  // REQUIRES: 'n' > 0.
+  // NOTE: A benchmark must not return from the test until KeepRunningBatch()
+  // has returned false.
+  // NOTE: KeepRunningBatch() may overshoot by up to 'n' iterations.
+  //
+  // Intended usage:
+  //   while (state.KeepRunningBatch(1000)) {
+  //     // process 1000 elements
+  //   }
+  bool KeepRunningBatch(IterationCount n);
+
+  // REQUIRES: timer is running and 'SkipWithError(...)' has not been called
+  //           by the current thread.
+  // Stop the benchmark timer.  If not called, the timer will be
+  // automatically stopped after the last iteration of the benchmark loop.
+  //
+  // For threaded benchmarks the PauseTiming() function only pauses the timing
+  // for the current thread.
+  //
+  // NOTE: The "real time" measurement is per-thread. If different threads
+  // report different measurements the largest one is reported.
+  //
+  // NOTE: PauseTiming()/ResumeTiming() are relatively
+  // heavyweight, and so their use should generally be avoided
+  // within each benchmark iteration, if possible.
+  void PauseTiming();
+
+  // REQUIRES: timer is not running and 'SkipWithError(...)' has not been called
+  //           by the current thread.
+  // Start the benchmark timer.  The timer is NOT running on entrance to the
+  // benchmark function. It begins running after control flow enters the
+  // benchmark loop.
+  //
+  // NOTE: PauseTiming()/ResumeTiming() are relatively
+  // heavyweight, and so their use should generally be avoided
+  // within each benchmark iteration, if possible.
+  void ResumeTiming();
+
+  // REQUIRES: 'SkipWithError(...)' has not been called previously by the
+  //            current thread.
+  // Report the benchmark as resulting in an error with the specified 'msg'.
+  // After this call the user may explicitly 'return' from the benchmark.
+  //
+  // If the ranged-for style of benchmark loop is used, the user must explicitly
+  // break from the loop, otherwise all future iterations will be run.
+  // If the 'KeepRunning()' loop is used the current thread will automatically
+  // exit the loop at the end of the current iteration.
+  //
+  // For threaded benchmarks only the current thread stops executing and future
+  // calls to `KeepRunning()` will block until all threads have completed
+  // the `KeepRunning()` loop. If multiple threads report an error only the
+  // first error message is used.
+  //
+  // NOTE: Calling 'SkipWithError(...)' does not cause the benchmark to exit
+  // the current scope immediately. If the function is called from within
+  // the 'KeepRunning()' loop the current iteration will finish. It is the
+  // user's responsibility to exit the scope as needed.
+  void SkipWithError(const char* msg);
+
+  // Returns true if an error has been reported with 'SkipWithError(...)'.
+  bool error_occurred() const { return error_occurred_; }
+
+  // REQUIRES: called exactly once per iteration of the benchmarking loop.
+  // Set the manually measured time for this benchmark iteration, which
+  // is used instead of automatically measured time if UseManualTime() was
+  // specified.
+  //
+  // For threaded benchmarks the final value will be set to the largest
+  // reported values.
+  void SetIterationTime(double seconds);
+
+  // Set the number of bytes processed by the current benchmark
+  // execution.  This routine is typically called once at the end of a
+  // throughput oriented benchmark.
+  //
+  // REQUIRES: a benchmark has exited its benchmarking loop.
+  BENCHMARK_ALWAYS_INLINE
+  void SetBytesProcessed(int64_t bytes) {
+    counters["bytes_per_second"] =
+        Counter(static_cast<double>(bytes), Counter::kIsRate, Counter::kIs1024);
+  }
+
+  BENCHMARK_ALWAYS_INLINE
+  int64_t bytes_processed() const {
+    if (counters.find("bytes_per_second") != counters.end())
+      return static_cast<int64_t>(counters.at("bytes_per_second"));
+    return 0;
+  }
+
+  // If this routine is called with complexity_n > 0 and complexity report is
+  // requested for the
+  // family benchmark, then current benchmark will be part of the computation
+  // and complexity_n will
+  // represent the length of N.
+  BENCHMARK_ALWAYS_INLINE
+  void SetComplexityN(int64_t complexity_n) { complexity_n_ = complexity_n; }
+
+  BENCHMARK_ALWAYS_INLINE
+  int64_t complexity_length_n() const { return complexity_n_; }
+
+  // If this routine is called with items > 0, then an items/s
+  // label is printed on the benchmark report line for the currently
+  // executing benchmark. It is typically called at the end of a processing
+  // benchmark where a processing items/second output is desired.
+  //
+  // REQUIRES: a benchmark has exited its benchmarking loop.
+  BENCHMARK_ALWAYS_INLINE
+  void SetItemsProcessed(int64_t items) {
+    counters["items_per_second"] =
+        Counter(static_cast<double>(items), benchmark::Counter::kIsRate);
+  }
+
+  BENCHMARK_ALWAYS_INLINE
+  int64_t items_processed() const {
+    if (counters.find("items_per_second") != counters.end())
+      return static_cast<int64_t>(counters.at("items_per_second"));
+    return 0;
+  }
+
+  // If this routine is called, the specified label is printed at the
+  // end of the benchmark report line for the currently executing
+  // benchmark.  Example:
+  //  static void BM_Compress(benchmark::State& state) {
+  //    ...
+  //    double compression = input_size / output_size;
+  //    state.SetLabel(StrFormat("compress:%.1f%%", 100.0*compression));
+  //  }
+  // Produces output that looks like:
+  //  BM_Compress   50         50   14115038  compress:27.3%
+  //
+  // REQUIRES: a benchmark has exited its benchmarking loop.
+  void SetLabel(const char* label);
+
+  void BENCHMARK_ALWAYS_INLINE SetLabel(const std::string& str) {
+    this->SetLabel(str.c_str());
+  }
+
+  // Range arguments for this run. CHECKs if the argument has been set.
+  BENCHMARK_ALWAYS_INLINE
+  int64_t range(std::size_t pos = 0) const {
+    assert(range_.size() > pos);
+    return range_[pos];
+  }
+
+  BENCHMARK_DEPRECATED_MSG("use 'range(0)' instead")
+  int64_t range_x() const { return range(0); }
+
+  BENCHMARK_DEPRECATED_MSG("use 'range(1)' instead")
+  int64_t range_y() const { return range(1); }
+
+  BENCHMARK_ALWAYS_INLINE
+  IterationCount iterations() const {
+    if (BENCHMARK_BUILTIN_EXPECT(!started_, false)) {
+      return 0;
+    }
+    return max_iterations - total_iterations_ + batch_leftover_;
+  }
+
+ private
+     :  // items we expect on the first cache line (ie 64 bytes of the struct)
+  // When total_iterations_ is 0, KeepRunning() and friends will return false.
+  // May be larger than max_iterations.
+  IterationCount total_iterations_;
+
+  // When using KeepRunningBatch(), batch_leftover_ holds the number of
+  // iterations beyond max_iters that were run. Used to track
+  // completed_iterations_ accurately.
+  IterationCount batch_leftover_;
+
+ public:
+  const IterationCount max_iterations;
+
+ private:
+  bool started_;
+  bool finished_;
+  bool error_occurred_;
+
+ private:  // items we don't need on the first cache line
+  std::vector<int64_t> range_;
+
+  int64_t complexity_n_;
+
+ public:
+  // Container for user-defined counters.
+  UserCounters counters;
+  // Index of the executing thread. Values from [0, threads).
+  const int thread_index;
+  // Number of threads concurrently executing the benchmark.
+  const int threads;
+
+ private:
+  State(IterationCount max_iters, const std::vector<int64_t>& ranges,
+        int thread_i, int n_threads, internal::ThreadTimer* timer,
+        internal::ThreadManager* manager);
+
+  void StartKeepRunning();
+  // Implementation of KeepRunning() and KeepRunningBatch().
+  // is_batch must be true unless n is 1.
+  bool KeepRunningInternal(IterationCount n, bool is_batch);
+  void FinishKeepRunning();
+  internal::ThreadTimer* timer_;
+  internal::ThreadManager* manager_;
+
+  friend struct internal::BenchmarkInstance;
+};
+
+inline BENCHMARK_ALWAYS_INLINE bool State::KeepRunning() {
+  return KeepRunningInternal(1, /*is_batch=*/false);
+}
+
+inline BENCHMARK_ALWAYS_INLINE bool State::KeepRunningBatch(IterationCount n) {
+  return KeepRunningInternal(n, /*is_batch=*/true);
+}
+
+inline BENCHMARK_ALWAYS_INLINE bool State::KeepRunningInternal(IterationCount n,
+                                                               bool is_batch) {
+  // total_iterations_ is set to 0 by the constructor, and always set to a
+  // nonzero value by StartKeepRunning().
+  assert(n > 0);
+  // n must be 1 unless is_batch is true.
+  assert(is_batch || n == 1);
+  if (BENCHMARK_BUILTIN_EXPECT(total_iterations_ >= n, true)) {
+    total_iterations_ -= n;
+    return true;
+  }
+  if (!started_) {
+    StartKeepRunning();
+    if (!error_occurred_ && total_iterations_ >= n) {
+      total_iterations_ -= n;
+      return true;
+    }
+  }
+  // For non-batch runs, total_iterations_ must be 0 by now.
+  if (is_batch && total_iterations_ != 0) {
+    batch_leftover_ = n - total_iterations_;
+    total_iterations_ = 0;
+    return true;
+  }
+  FinishKeepRunning();
+  return false;
+}
+
+struct State::StateIterator {
+  struct BENCHMARK_UNUSED Value {};
+  typedef std::forward_iterator_tag iterator_category;
+  typedef Value value_type;
+  typedef Value reference;
+  typedef Value pointer;
+  typedef std::ptrdiff_t difference_type;
+
+ private:
+  friend class State;
+  BENCHMARK_ALWAYS_INLINE
+  StateIterator() : cached_(0), parent_() {}
+
+  BENCHMARK_ALWAYS_INLINE
+  explicit StateIterator(State* st)
+      : cached_(st->error_occurred_ ? 0 : st->max_iterations), parent_(st) {}
+
+ public:
+  BENCHMARK_ALWAYS_INLINE
+  Value operator*() const { return Value(); }
+
+  BENCHMARK_ALWAYS_INLINE
+  StateIterator& operator++() {
+    assert(cached_ > 0);
+    --cached_;
+    return *this;
+  }
+
+  BENCHMARK_ALWAYS_INLINE
+  bool operator!=(StateIterator const&) const {
+    if (BENCHMARK_BUILTIN_EXPECT(cached_ != 0, true)) return true;
+    parent_->FinishKeepRunning();
+    return false;
+  }
+
+ private:
+  IterationCount cached_;
+  State* const parent_;
+};
+
+inline BENCHMARK_ALWAYS_INLINE State::StateIterator State::begin() {
+  return StateIterator(this);
+}
+inline BENCHMARK_ALWAYS_INLINE State::StateIterator State::end() {
+  StartKeepRunning();
+  return StateIterator();
+}
+
+namespace internal {
+
+typedef void(Function)(State&);
+
+// ------------------------------------------------------
+// Benchmark registration object.  The BENCHMARK() macro expands
+// into an internal::Benchmark* object.  Various methods can
+// be called on this object to change the properties of the benchmark.
+// Each method returns "this" so that multiple method calls can
+// be chained into one expression.
+class Benchmark {
+ public:
+  virtual ~Benchmark();
+
+  // Note: the following methods all return "this" so that multiple
+  // method calls can be chained together in one expression.
+
+  // Run this benchmark once with "x" as the extra argument passed
+  // to the function.
+  // REQUIRES: The function passed to the constructor must accept an arg1.
+  Benchmark* Arg(int64_t x);
+
+  // Run this benchmark with the given time unit for the generated output report
+  Benchmark* Unit(TimeUnit unit);
+
+  // Run this benchmark once for a number of values picked from the
+  // range [start..limit].  (start and limit are always picked.)
+  // REQUIRES: The function passed to the constructor must accept an arg1.
+  Benchmark* Range(int64_t start, int64_t limit);
+
+  // Run this benchmark once for all values in the range [start..limit] with
+  // specific step
+  // REQUIRES: The function passed to the constructor must accept an arg1.
+  Benchmark* DenseRange(int64_t start, int64_t limit, int step = 1);
+
+  // Run this benchmark once with "args" as the extra arguments passed
+  // to the function.
+  // REQUIRES: The function passed to the constructor must accept arg1, arg2 ...
+  Benchmark* Args(const std::vector<int64_t>& args);
+
+  // Equivalent to Args({x, y})
+  // NOTE: This is a legacy C++03 interface provided for compatibility only.
+  //   New code should use 'Args'.
+  Benchmark* ArgPair(int64_t x, int64_t y) {
+    std::vector<int64_t> args;
+    args.push_back(x);
+    args.push_back(y);
+    return Args(args);
+  }
+
+  // Run this benchmark once for a number of values picked from the
+  // ranges [start..limit].  (starts and limits are always picked.)
+  // REQUIRES: The function passed to the constructor must accept arg1, arg2 ...
+  Benchmark* Ranges(const std::vector<std::pair<int64_t, int64_t> >& ranges);
+
+  // Run this benchmark once for each combination of values in the (cartesian)
+  // product of the supplied argument lists.
+  // REQUIRES: The function passed to the constructor must accept arg1, arg2 ...
+  Benchmark* ArgsProduct(const std::vector<std::vector<int64_t> >& arglists);
+
+  // Equivalent to ArgNames({name})
+  Benchmark* ArgName(const std::string& name);
+
+  // Set the argument names to display in the benchmark name. If not called,
+  // only argument values will be shown.
+  Benchmark* ArgNames(const std::vector<std::string>& names);
+
+  // Equivalent to Ranges({{lo1, hi1}, {lo2, hi2}}).
+  // NOTE: This is a legacy C++03 interface provided for compatibility only.
+  //   New code should use 'Ranges'.
+  Benchmark* RangePair(int64_t lo1, int64_t hi1, int64_t lo2, int64_t hi2) {
+    std::vector<std::pair<int64_t, int64_t> > ranges;
+    ranges.push_back(std::make_pair(lo1, hi1));
+    ranges.push_back(std::make_pair(lo2, hi2));
+    return Ranges(ranges);
+  }
+
+  // Pass this benchmark object to *func, which can customize
+  // the benchmark by calling various methods like Arg, Args,
+  // Threads, etc.
+  Benchmark* Apply(void (*func)(Benchmark* benchmark));
+
+  // Set the range multiplier for non-dense range. If not called, the range
+  // multiplier kRangeMultiplier will be used.
+  Benchmark* RangeMultiplier(int multiplier);
+
+  // Set the minimum amount of time to use when running this benchmark. This
+  // option overrides the `benchmark_min_time` flag.
+  // REQUIRES: `t > 0` and `Iterations` has not been called on this benchmark.
+  Benchmark* MinTime(double t);
+
+  // Specify the amount of iterations that should be run by this benchmark.
+  // REQUIRES: 'n > 0' and `MinTime` has not been called on this benchmark.
+  //
+  // NOTE: This function should only be used when *exact* iteration control is
+  //   needed and never to control or limit how long a benchmark runs, where
+  // `--benchmark_min_time=N` or `MinTime(...)` should be used instead.
+  Benchmark* Iterations(IterationCount n);
+
+  // Specify the amount of times to repeat this benchmark. This option overrides
+  // the `benchmark_repetitions` flag.
+  // REQUIRES: `n > 0`
+  Benchmark* Repetitions(int n);
+
+  // Specify if each repetition of the benchmark should be reported separately
+  // or if only the final statistics should be reported. If the benchmark
+  // is not repeated then the single result is always reported.
+  // Applies to *ALL* reporters (display and file).
+  Benchmark* ReportAggregatesOnly(bool value = true);
+
+  // Same as ReportAggregatesOnly(), but applies to display reporter only.
+  Benchmark* DisplayAggregatesOnly(bool value = true);
+
+  // By default, the CPU time is measured only for the main thread, which may
+  // be unrepresentative if the benchmark uses threads internally. If called,
+  // the total CPU time spent by all the threads will be measured instead.
+  // By default, only the main thread's CPU time will be measured.
+  Benchmark* MeasureProcessCPUTime();
+
+  // If a particular benchmark should use the Wall clock instead of the CPU time
+  // (be it either the CPU time of the main thread only (default), or the
+  // total CPU usage of the benchmark), call this method. If called, the elapsed
+  // (wall) time will be used to control how many iterations are run, and in the
+  // printing of items/second or MB/seconds values.
+  // If not called, the CPU time used by the benchmark will be used.
+  Benchmark* UseRealTime();
+
+  // If a benchmark must measure time manually (e.g. if GPU execution time is
+  // being
+  // measured), call this method. If called, each benchmark iteration should
+  // call
+  // SetIterationTime(seconds) to report the measured time, which will be used
+  // to control how many iterations are run, and in the printing of items/second
+  // or MB/second values.
+  Benchmark* UseManualTime();
+
+  // Set the asymptotic computational complexity for the benchmark. If called
+  // the asymptotic computational complexity will be shown on the output.
+  Benchmark* Complexity(BigO complexity = benchmark::oAuto);
+
+  // Set the asymptotic computational complexity for the benchmark from the
+  // given BigOFunc. If called, the complexity will be shown on the output.
+  Benchmark* Complexity(BigOFunc* complexity);
+
+  // Add this statistic, to be computed over all the values of the benchmark run
+  Benchmark* ComputeStatistics(std::string name, StatisticsFunc* statistics);
+
+  // Support for running multiple copies of the same benchmark concurrently
+  // in multiple threads.  This may be useful when measuring the scaling
+  // of some piece of code.
+
+  // Run one instance of this benchmark concurrently in t threads.
+  Benchmark* Threads(int t);
+
+  // Pick a set of values T from [min_threads,max_threads].
+  // min_threads and max_threads are always included in T.  Run this
+  // benchmark once for each value in T.  The benchmark run for a
+  // particular value t consists of t threads running the benchmark
+  // function concurrently.  For example, consider:
+  //    BENCHMARK(Foo)->ThreadRange(1,16);
+  // This will run the following benchmarks:
+  //    Foo in 1 thread
+  //    Foo in 2 threads
+  //    Foo in 4 threads
+  //    Foo in 8 threads
+  //    Foo in 16 threads
+  Benchmark* ThreadRange(int min_threads, int max_threads);
+
+  // For each value n in the range, run this benchmark once using n threads.
+  // min_threads and max_threads are always included in the range.
+  // stride specifies the increment. E.g. DenseThreadRange(1, 8, 3) starts
+  // a benchmark with 1, 4, 7 and 8 threads.
+  Benchmark* DenseThreadRange(int min_threads, int max_threads, int stride = 1);
+
+  // Equivalent to ThreadRange(NumCPUs(), NumCPUs())
+  Benchmark* ThreadPerCpu();
+
+  virtual void Run(State& state) = 0;
+
+ protected:
+  explicit Benchmark(const char* name);
+  Benchmark(Benchmark const&);
+  void SetName(const char* name);
+
+  int ArgsCnt() const;
+
+ private:
+  friend class BenchmarkFamilies;
+
+  std::string name_;
+  AggregationReportMode aggregation_report_mode_;
+  std::vector<std::string> arg_names_;       // Arg names for all benchmark runs
+  std::vector<std::vector<int64_t> > args_;  // Args for all benchmark runs
+  TimeUnit time_unit_;
+  int range_multiplier_;
+  double min_time_;
+  IterationCount iterations_;
+  int repetitions_;
+  bool measure_process_cpu_time_;
+  bool use_real_time_;
+  bool use_manual_time_;
+  BigO complexity_;
+  BigOFunc* complexity_lambda_;
+  std::vector<Statistics> statistics_;
+  std::vector<int> thread_counts_;
+
+  Benchmark& operator=(Benchmark const&);
+};
+
+}  // namespace internal
+
+// Create and register a benchmark with the specified 'name' that invokes
+// the specified functor 'fn'.
+//
+// RETURNS: A pointer to the registered benchmark.
+internal::Benchmark* RegisterBenchmark(const char* name,
+                                       internal::Function* fn);
+
+#if defined(BENCHMARK_HAS_CXX11)
+template <class Lambda>
+internal::Benchmark* RegisterBenchmark(const char* name, Lambda&& fn);
+#endif
+
+// Remove all registered benchmarks. All pointers to previously registered
+// benchmarks are invalidated.
+void ClearRegisteredBenchmarks();
+
+namespace internal {
+// The class used to hold all Benchmarks created from static functions
+// (i.e. those created using the BENCHMARK(...) macros).
+class FunctionBenchmark : public Benchmark {
+ public:
+  FunctionBenchmark(const char* name, Function* func)
+      : Benchmark(name), func_(func) {}
+
+  virtual void Run(State& st);  // declared here, defined out of line
+
+ private:
+  Function* func_;  // the function pointer passed to the constructor
+};
+
+#ifdef BENCHMARK_HAS_CXX11
+template <class Lambda>
+class LambdaBenchmark : public Benchmark {
+ public:
+  virtual void Run(State& st) { lambda_(st); }  // invokes the stored callable
+
+ private:
+  // Construction is private; this friend is the one function allowed to do it.
+  template <class Lam>
+  friend Benchmark* ::benchmark::RegisterBenchmark(const char*, Lam&&);
+
+  template <class OLambda>
+  LambdaBenchmark(const char* name, OLambda&& lam)
+      : Benchmark(name), lambda_(std::forward<OLambda>(lam)) {}
+
+  LambdaBenchmark(LambdaBenchmark const&) = delete;
+
+  Lambda lambda_;
+};
+#endif
+
+}  // namespace internal
+
+inline internal::Benchmark* RegisterBenchmark(const char* name,
+                                              internal::Function* fn) {
+  typedef internal::FunctionBenchmark Family;  // wraps a plain function pointer
+  return internal::RegisterBenchmarkInternal(::new Family(name, fn));
+}
+
+#ifdef BENCHMARK_HAS_CXX11
+template <class Lambda>
+internal::Benchmark* RegisterBenchmark(const char* name, Lambda&& fn) {
+  typedef typename std::decay<Lambda>::type Callable;  // held by value
+  typedef internal::LambdaBenchmark<Callable> BenchType;
+  return internal::RegisterBenchmarkInternal(
+      ::new BenchType(name, std::forward<Lambda>(fn)));
+}
+#endif
+
+#if defined(BENCHMARK_HAS_CXX11) && \
+    (!defined(BENCHMARK_GCC_VERSION) || BENCHMARK_GCC_VERSION >= 409)
+template <class Lambda, class... Args>
+internal::Benchmark* RegisterBenchmark(const char* name, Lambda&& fn,
+                                       Args&&... args) {
+  return benchmark::RegisterBenchmark(
+      name, [=](benchmark::State& st) { fn(st, args...); });  // copies fn, args
+}
+#else  // pre-C++11, or BENCHMARK_GCC_VERSION < 409
+#define BENCHMARK_HAS_NO_VARIADIC_REGISTER_BENCHMARK
+#endif  // variadic RegisterBenchmark
+
+// Base class for fixture benchmarks: Run() = SetUp, BenchmarkCase, TearDown.
+class Fixture : public internal::Benchmark {
+ public:
+  Fixture() : internal::Benchmark("") {}
+
+  virtual void Run(State& st) {
+    SetUp(st);
+    BenchmarkCase(st);
+    TearDown(st);
+  }
+
+  // Const-State hooks, slated for deprecation; they do nothing by default.
+  virtual void SetUp(const State&) {}
+  virtual void TearDown(const State&) {}
+  // Preferred hooks; by default they forward to the const-State overloads.
+  virtual void SetUp(State& st) { const State& ro = st; SetUp(ro); }
+  virtual void TearDown(State& st) { const State& ro = st; TearDown(ro); }
+
+ protected:
+  virtual void BenchmarkCase(State&) = 0;
+};
+
+}  // namespace benchmark
+
+// ------------------------------------------------------
+// Macro to register benchmarks
+
+// Check that __COUNTER__ is defined and that __COUNTER__ increases by 1
+// every time it is expanded. X + 1 == X + 0 is used in case X is defined to be
+// empty. If X is empty the expression becomes (+1 == +0).
+#if defined(__COUNTER__) && (__COUNTER__ + 1 == __COUNTER__ + 0)
+#define BENCHMARK_PRIVATE_UNIQUE_ID __COUNTER__
+#else
+#define BENCHMARK_PRIVATE_UNIQUE_ID __LINE__
+#endif
+
+// Helpers for generating unique variable names
+#define BENCHMARK_PRIVATE_NAME(n) \
+  BENCHMARK_PRIVATE_CONCAT(_benchmark_, BENCHMARK_PRIVATE_UNIQUE_ID, n)
+#define BENCHMARK_PRIVATE_CONCAT(a, b, c) BENCHMARK_PRIVATE_CONCAT2(a, b, c)
+#define BENCHMARK_PRIVATE_CONCAT2(a, b, c) a##b##c
+// Helper for concatenation with macro name expansion
+#define BENCHMARK_PRIVATE_CONCAT_NAME(BaseClass, Method) \
+    BaseClass##_##Method##_Benchmark
+
+#define BENCHMARK_PRIVATE_DECLARE(n)                                 \
+  static ::benchmark::internal::Benchmark* BENCHMARK_PRIVATE_NAME(n) \
+      BENCHMARK_UNUSED
+
+#define BENCHMARK(n)                                     \
+  BENCHMARK_PRIVATE_DECLARE(n) =                         \
+      (::benchmark::internal::RegisterBenchmarkInternal( \
+          new ::benchmark::internal::FunctionBenchmark(#n, n)))
+
+// Old-style macros (BENCHMARK_RANGE2 forwards to the 4-argument RangePair).
+#define BENCHMARK_WITH_ARG(n, a) BENCHMARK(n)->Arg((a))
+#define BENCHMARK_WITH_ARG2(n, a1, a2) BENCHMARK(n)->Args({(a1), (a2)})
+#define BENCHMARK_WITH_UNIT(n, t) BENCHMARK(n)->Unit((t))
+#define BENCHMARK_RANGE(n, lo, hi) BENCHMARK(n)->Range((lo), (hi))
+#define BENCHMARK_RANGE2(n, l1, h1, l2, h2) \
+  BENCHMARK(n)->RangePair((l1), (h1), (l2), (h2))
+
+#ifdef BENCHMARK_HAS_CXX11
+
+// Register a benchmark which invokes the function specified by `func`
+// with the additional arguments specified by `...`.
+//
+// For example:
+//
+// template <class ...ExtraArgs>
+// void BM_takes_args(benchmark::State& state, ExtraArgs&&... extra_args) {
+//  [...]
+// }
+// /* Registers a benchmark named "BM_takes_args/int_string_test" */
+// BENCHMARK_CAPTURE(BM_takes_args, int_string_test, 42, std::string("abc"));
+#define BENCHMARK_CAPTURE(func, test_case_name, ...)     \
+  BENCHMARK_PRIVATE_DECLARE(func) =                      \
+      (::benchmark::internal::RegisterBenchmarkInternal( \
+          new ::benchmark::internal::FunctionBenchmark(  \
+              #func "/" #test_case_name,                 \
+              [](::benchmark::State& st) { func(st, __VA_ARGS__); })))
+
+#endif  // BENCHMARK_HAS_CXX11
+
+// This will register a benchmark for a templatized function.  For example:
+//
+// template<int arg>
+// void BM_Foo(benchmark::State& state);
+//
+// BENCHMARK_TEMPLATE(BM_Foo, 1);
+//
+// will register BM_Foo<1> as a benchmark.
+#define BENCHMARK_TEMPLATE1(n, a)                        \
+  BENCHMARK_PRIVATE_DECLARE(n) =                         \
+      (::benchmark::internal::RegisterBenchmarkInternal( \
+          new ::benchmark::internal::FunctionBenchmark(#n "<" #a ">", n<a>)))
+
+#define BENCHMARK_TEMPLATE2(n, a, b)                                         \
+  BENCHMARK_PRIVATE_DECLARE(n) =                                             \
+      (::benchmark::internal::RegisterBenchmarkInternal(                     \
+          new ::benchmark::internal::FunctionBenchmark(#n "<" #a "," #b ">", \
+                                                       n<a, b>)))
+
+#ifdef BENCHMARK_HAS_CXX11
+#define BENCHMARK_TEMPLATE(n, ...)                       \
+  BENCHMARK_PRIVATE_DECLARE(n) =                         \
+      (::benchmark::internal::RegisterBenchmarkInternal( \
+          new ::benchmark::internal::FunctionBenchmark(  \
+              #n "<" #__VA_ARGS__ ">", n<__VA_ARGS__>)))
+#else
+#define BENCHMARK_TEMPLATE(n, a) BENCHMARK_TEMPLATE1(n, a)
+#endif
+
+#define BENCHMARK_PRIVATE_DECLARE_F(BaseClass, Method)        \
+  class BaseClass##_##Method##_Benchmark : public BaseClass { \
+   public:                                                    \
+    BaseClass##_##Method##_Benchmark() : BaseClass() {        \
+      this->SetName(#BaseClass "/" #Method);                  \
+    }                                                         \
+                                                              \
+   protected:                                                 \
+    virtual void BenchmarkCase(::benchmark::State&);          \
+  };
+
+#define BENCHMARK_TEMPLATE1_PRIVATE_DECLARE_F(BaseClass, Method, a) \
+  class BaseClass##_##Method##_Benchmark : public BaseClass<a> {    \
+   public:                                                          \
+    BaseClass##_##Method##_Benchmark() : BaseClass<a>() {           \
+      this->SetName(#BaseClass "<" #a ">/" #Method);                \
+    }                                                               \
+                                                                    \
+   protected:                                                       \
+    virtual void BenchmarkCase(::benchmark::State&);                \
+  };
+
+#define BENCHMARK_TEMPLATE2_PRIVATE_DECLARE_F(BaseClass, Method, a, b) \
+  class BaseClass##_##Method##_Benchmark : public BaseClass<a, b> {    \
+   public:                                                             \
+    BaseClass##_##Method##_Benchmark() : BaseClass<a, b>() {           \
+      this->SetName(#BaseClass "<" #a "," #b ">/" #Method);            \
+    }                                                                  \
+                                                                       \
+   protected:                                                          \
+    virtual void BenchmarkCase(::benchmark::State&);                   \
+  };
+
+#ifdef BENCHMARK_HAS_CXX11
+#define BENCHMARK_TEMPLATE_PRIVATE_DECLARE_F(BaseClass, Method, ...)       \
+  class BaseClass##_##Method##_Benchmark : public BaseClass<__VA_ARGS__> { \
+   public:                                                                 \
+    BaseClass##_##Method##_Benchmark() : BaseClass<__VA_ARGS__>() {        \
+      this->SetName(#BaseClass "<" #__VA_ARGS__ ">/" #Method);             \
+    }                                                                      \
+                                                                           \
+   protected:                                                              \
+    virtual void BenchmarkCase(::benchmark::State&);                       \
+  };
+#else  // fallback: exactly one template argument, same arity as TEMPLATE1
+#define BENCHMARK_TEMPLATE_PRIVATE_DECLARE_F(BaseClass, Method, a) \
+  BENCHMARK_TEMPLATE1_PRIVATE_DECLARE_F(BaseClass, Method, a)
+#endif  // BENCHMARK_HAS_CXX11
+
+#define BENCHMARK_DEFINE_F(BaseClass, Method)    \
+  BENCHMARK_PRIVATE_DECLARE_F(BaseClass, Method) \
+  void BENCHMARK_PRIVATE_CONCAT_NAME(BaseClass, Method)::BenchmarkCase
+
+#define BENCHMARK_TEMPLATE1_DEFINE_F(BaseClass, Method, a)    \
+  BENCHMARK_TEMPLATE1_PRIVATE_DECLARE_F(BaseClass, Method, a) \
+  void BENCHMARK_PRIVATE_CONCAT_NAME(BaseClass, Method)::BenchmarkCase
+
+#define BENCHMARK_TEMPLATE2_DEFINE_F(BaseClass, Method, a, b)    \
+  BENCHMARK_TEMPLATE2_PRIVATE_DECLARE_F(BaseClass, Method, a, b) \
+  void BENCHMARK_PRIVATE_CONCAT_NAME(BaseClass, Method)::BenchmarkCase
+
+#ifdef BENCHMARK_HAS_CXX11
+#define BENCHMARK_TEMPLATE_DEFINE_F(BaseClass, Method, ...)            \
+  BENCHMARK_TEMPLATE_PRIVATE_DECLARE_F(BaseClass, Method, __VA_ARGS__) \
+  void BENCHMARK_PRIVATE_CONCAT_NAME(BaseClass, Method)::BenchmarkCase
+#else
+#define BENCHMARK_TEMPLATE_DEFINE_F(BaseClass, Method, a) \
+  BENCHMARK_TEMPLATE1_DEFINE_F(BaseClass, Method, a)
+#endif
+
+#define BENCHMARK_REGISTER_F(BaseClass, Method) \
+  BENCHMARK_PRIVATE_REGISTER_F(BENCHMARK_PRIVATE_CONCAT_NAME(BaseClass, Method))
+
+#define BENCHMARK_PRIVATE_REGISTER_F(TestName) \
+  BENCHMARK_PRIVATE_DECLARE(TestName) =        \
+      (::benchmark::internal::RegisterBenchmarkInternal(new TestName()))
+
+// This macro will define and register a benchmark within a fixture class.
+#define BENCHMARK_F(BaseClass, Method)           \
+  BENCHMARK_PRIVATE_DECLARE_F(BaseClass, Method) \
+  BENCHMARK_REGISTER_F(BaseClass, Method);       \
+  void BENCHMARK_PRIVATE_CONCAT_NAME(BaseClass, Method)::BenchmarkCase
+
+#define BENCHMARK_TEMPLATE1_F(BaseClass, Method, a)           \
+  BENCHMARK_TEMPLATE1_PRIVATE_DECLARE_F(BaseClass, Method, a) \
+  BENCHMARK_REGISTER_F(BaseClass, Method);                    \
+  void BENCHMARK_PRIVATE_CONCAT_NAME(BaseClass, Method)::BenchmarkCase
+
+#define BENCHMARK_TEMPLATE2_F(BaseClass, Method, a, b)           \
+  BENCHMARK_TEMPLATE2_PRIVATE_DECLARE_F(BaseClass, Method, a, b) \
+  BENCHMARK_REGISTER_F(BaseClass, Method);                       \
+  void BENCHMARK_PRIVATE_CONCAT_NAME(BaseClass, Method)::BenchmarkCase
+
+#ifdef BENCHMARK_HAS_CXX11
+#define BENCHMARK_TEMPLATE_F(BaseClass, Method, ...)                   \
+  BENCHMARK_TEMPLATE_PRIVATE_DECLARE_F(BaseClass, Method, __VA_ARGS__) \
+  BENCHMARK_REGISTER_F(BaseClass, Method);                             \
+  void BENCHMARK_PRIVATE_CONCAT_NAME(BaseClass, Method)::BenchmarkCase
+#else
+#define BENCHMARK_TEMPLATE_F(BaseClass, Method, a) \
+  BENCHMARK_TEMPLATE1_F(BaseClass, Method, a)
+#endif
+
+// Helper macro that defines main(): initialize, reject unknown flags, then run.
+#define BENCHMARK_MAIN()                                                \
+  int main(int argc, char** argv) {                                     \
+    ::benchmark::Initialize(&argc, argv);                               \
+    if (::benchmark::ReportUnrecognizedArguments(argc, argv)) return 1; \
+    ::benchmark::RunSpecifiedBenchmarks();                              \
+  }                                                                     \
+  int main(int, char**)  // re-declaration, so `BENCHMARK_MAIN();` is valid
+
+// ------------------------------------------------------
+// Benchmark Reporters
+
+namespace benchmark {
+
+struct CPUInfo {
+  struct CacheInfo {
+    std::string type;
+    int level;
+    int size;
+    int num_sharing;
+  };
+
+  enum Scaling {
+    UNKNOWN,
+    ENABLED,
+    DISABLED
+  };
+
+  int num_cpus;
+  double cycles_per_second;
+  std::vector<CacheInfo> caches;
+  Scaling scaling;
+  std::vector<double> load_avg;
+
+  static const CPUInfo& Get();  // accessor; note the constructor is private
+
+ private:
+  CPUInfo();
+  BENCHMARK_DISALLOW_COPY_AND_ASSIGN(CPUInfo);
+};
+
+// System information; the constructor is private, so use Get() for access.
+struct SystemInfo {
+  std::string name;
+  static const SystemInfo& Get();
+
+ private:
+  SystemInfo();
+  BENCHMARK_DISALLOW_COPY_AND_ASSIGN(SystemInfo);
+};
+
+// BenchmarkName contains the components of the Benchmark's name
+// which allows individual fields to be modified or cleared before
+// building the final name using 'str()'.
+struct BenchmarkName {
+  std::string function_name;
+  std::string args;
+  std::string min_time;
+  std::string iterations;
+  std::string repetitions;
+  std::string time_type;
+  std::string threads;
+
+  // Return the full name of the benchmark with each non-empty
+  // field separated by a '/'
+  std::string str() const;
+};
+
+// Interface for custom benchmark result printers.
+// By default, benchmark reports are printed to stdout. However an application
+// can control the destination of the reports by calling
+// RunSpecifiedBenchmarks and passing it a custom reporter object.
+// The reporter object must implement the following interface.
+class BenchmarkReporter {
+ public:
+  struct Context {
+    CPUInfo const& cpu_info;
+    SystemInfo const& sys_info;
+    // The number of chars in the longest benchmark name.
+    size_t name_field_width;
+    static const char* executable_name;
+    Context();
+  };
+
+  struct Run {
+    static const int64_t no_repetition_index = -1;
+    enum RunType { RT_Iteration, RT_Aggregate };
+
+    // Gives every scalar member, pointers included, a defined initial value.
+    Run()
+        : run_type(RT_Iteration),
+          error_occurred(false),
+          iterations(1),
+          threads(1),
+          repetition_index(no_repetition_index),
+          repetitions(0),
+          time_unit(kNanosecond),
+          real_accumulated_time(0),
+          cpu_accumulated_time(0),
+          max_heapbytes_used(0),
+          complexity(oNone),
+          complexity_lambda(),
+          complexity_n(0),
+          statistics(),
+          report_big_o(false),
+          report_rms(false),
+          counters(),
+          has_memory_result(false),
+          allocs_per_iter(0.0),
+          max_bytes_used(0) {}
+
+    std::string benchmark_name() const;
+    BenchmarkName run_name;
+    RunType run_type;
+    std::string aggregate_name;
+    std::string report_label;  // Empty if not set by benchmark.
+    bool error_occurred;
+    std::string error_message;
+
+    IterationCount iterations;
+    int64_t threads;
+    int64_t repetition_index;
+    int64_t repetitions;
+    TimeUnit time_unit;
+    double real_accumulated_time;
+    double cpu_accumulated_time;
+
+    // Return the real time per iteration in the unit given by 'time_unit'.
+    // NOTE: If 'iterations' is zero the accumulated time is returned instead.
+    double GetAdjustedRealTime() const;
+
+    // Return the cpu time per iteration in the unit given by 'time_unit'.
+    // NOTE: If 'iterations' is zero the accumulated time is returned instead.
+    double GetAdjustedCPUTime() const;
+
+    // This is set to 0.0 if memory tracing is not enabled.
+    double max_heapbytes_used;
+
+    // Keep track of arguments to compute asymptotic complexity
+    BigO complexity;
+    BigOFunc* complexity_lambda;
+    int64_t complexity_n;
+
+    // what statistics to compute from the measurements
+    const std::vector<internal::Statistics>* statistics;
+
+    // Inform print function whether the current run is a complexity report
+    bool report_big_o;
+    bool report_rms;
+
+    UserCounters counters;
+
+    // Memory metrics.
+    bool has_memory_result;
+    double allocs_per_iter;
+    int64_t max_bytes_used;
+  };
+
+  // Construct a BenchmarkReporter with the output stream set to 'std::cout'
+  // and the error stream set to 'std::cerr'
+  BenchmarkReporter();
+
+  // Called once for every suite of benchmarks run.
+  // The parameter "context" contains information that the
+  // reporter may wish to use when generating its report, for example the
+  // platform under which the benchmarks are running. The benchmark run is
+  // never started if this function returns false, allowing the reporter
+  // to skip runs based on the context information.
+  virtual bool ReportContext(const Context& context) = 0;
+
+  // Called once for each group of benchmark runs, gives information about
+  // cpu-time and heap memory usage during the benchmark run. If the group
+  // of runs contained more than two entries then 'report' contains additional
+  // elements representing the mean and standard deviation of those runs.
+  // Additionally if this group of runs was the last in a family of benchmarks
+  // 'report' contains additional entries representing the asymptotic
+  // complexity and RMS of that benchmark family.
+  virtual void ReportRuns(const std::vector<Run>& report) = 0;
+
+  // Called once and only once after every group of benchmarks is run and
+  // reported.
+  virtual void Finalize() {}
+
+  // REQUIRES: The object referenced by 'out' is valid for the lifetime
+  // of the reporter.
+  void SetOutputStream(std::ostream* out) {
+    assert(out);
+    output_stream_ = out;
+  }
+
+  // REQUIRES: The object referenced by 'err' is valid for the lifetime
+  // of the reporter.
+  void SetErrorStream(std::ostream* err) {
+    assert(err);
+    error_stream_ = err;
+  }
+
+  std::ostream& GetOutputStream() const { return *output_stream_; }
+
+  std::ostream& GetErrorStream() const { return *error_stream_; }
+
+  virtual ~BenchmarkReporter();
+
+  // Write a human readable string to 'out' representing the specified
+  // 'context'.
+  // REQUIRES: 'out' is non-null.
+  static void PrintBasicContext(std::ostream* out, Context const& context);
+
+ private:
+  std::ostream* output_stream_;
+  std::ostream* error_stream_;
+};
+
+// Simple reporter that outputs benchmark data to the console. This is the
+// default reporter used by RunSpecifiedBenchmarks().
+class ConsoleReporter : public BenchmarkReporter {
+ public:
+  enum OutputOptions {
+    OO_None = 0,
+    OO_Color = 1,    // single-bit flag
+    OO_Tabular = 2,  // single-bit flag
+    OO_ColorTabular = OO_Color | OO_Tabular,
+    OO_Defaults = OO_ColorTabular
+  };
+  explicit ConsoleReporter(OutputOptions opts_ = OO_Defaults)
+      : output_options_(opts_),
+        name_field_width_(0),
+        prev_counters_(),
+        printed_header_(false) {}
+
+  virtual bool ReportContext(const Context& context);
+  virtual void ReportRuns(const std::vector<Run>& reports);
+
+ protected:
+  virtual void PrintRunData(const Run& report);
+  virtual void PrintHeader(const Run& report);
+
+  OutputOptions output_options_;
+  size_t name_field_width_;
+  UserCounters prev_counters_;
+  bool printed_header_;
+};
+
+class JSONReporter : public BenchmarkReporter {
+ public:
+  JSONReporter() : first_report_(true) {}
+  virtual bool ReportContext(const Context& context);
+  virtual void ReportRuns(const std::vector<Run>& reports);
+  virtual void Finalize();
+
+ private:
+  void PrintRunData(const Run& report);
+
+  bool first_report_;
+};
+
+class BENCHMARK_DEPRECATED_MSG(
+    "The CSV Reporter will be removed in a future release") CSVReporter
+    : public BenchmarkReporter {
+ public:
+  CSVReporter() : printed_header_(false) {}
+  virtual bool ReportContext(const Context& context);
+  virtual void ReportRuns(const std::vector<Run>& reports);
+
+ private:
+  void PrintRunData(const Run& report);
+
+  bool printed_header_;
+  std::set<std::string> user_counter_names_;
+};
+
+// If a MemoryManager is registered, it can be used to collect and report
+// allocation metrics for a run of the benchmark.
+class MemoryManager {
+ public:
+  struct Result {
+    Result() : num_allocs(0), max_bytes_used(0) {}
+
+    // The number of allocations made in total between Start and Stop.
+    int64_t num_allocs;
+
+    // The peak memory use between Start and Stop.
+    int64_t max_bytes_used;
+  };
+
+  virtual ~MemoryManager() {}
+
+  // Implement this to start recording allocation information.
+  virtual void Start() = 0;
+
+  // Implement this to stop recording and fill out the given Result structure.
+  virtual void Stop(Result* result) = 0;
+};
+
+// Returns the short suffix for 'unit': "s", "ms", "us" or "ns".
+inline const char* GetTimeUnitString(TimeUnit unit) {
+  switch (unit) {
+    case kSecond:      return "s";
+    case kMillisecond: return "ms";
+    case kMicrosecond: return "us";
+    case kNanosecond:  return "ns";
+  }
+  // Reached only when 'unit' matches none of the enumerators handled above;
+  // every listed enumerator returns from inside the switch.
+  // The switch intentionally carries no default label here.
+  BENCHMARK_UNREACHABLE();
+}
+
+// Returns the factor paired with 'unit': 1, 1e3, 1e6 or 1e9.
+inline double GetTimeUnitMultiplier(TimeUnit unit) {
+  switch (unit) {
+    case kSecond:      return 1;
+    case kMillisecond: return 1e3;
+    case kMicrosecond: return 1e6;
+    case kNanosecond:  return 1e9;
+  }
+  // Reached only when 'unit' matches none of the enumerators handled above;
+  // every listed enumerator returns from inside the switch.
+  // The switch intentionally carries no default label here.
+  BENCHMARK_UNREACHABLE();
+}
+
+}  // namespace benchmark
 
-#endif // BENCHMARK_BENCHMARK_H_
+#endif  // BENCHMARK_BENCHMARK_H_
diff --git a/include/benchmark/benchmark_api.h b/include/benchmark/benchmark_api.h
deleted file mode 100644
index 664ca2a..0000000
--- a/include/benchmark/benchmark_api.h
+++ /dev/null
@@ -1,747 +0,0 @@
-// Support for registering benchmarks for functions.
-
-/* Example usage:
-// Define a function that executes the code to be measured a
-// specified number of times:
-static void BM_StringCreation(benchmark::State& state) {
-  while (state.KeepRunning())
-    std::string empty_string;
-}
-
-// Register the function as a benchmark
-BENCHMARK(BM_StringCreation);
-
-// Define another benchmark
-static void BM_StringCopy(benchmark::State& state) {
-  std::string x = "hello";
-  while (state.KeepRunning())
-    std::string copy(x);
-}
-BENCHMARK(BM_StringCopy);
-
-// Augment the main() program to invoke benchmarks if specified
-// via the --benchmarks command line flag.  E.g.,
-//       my_unittest --benchmark_filter=all
-//       my_unittest --benchmark_filter=BM_StringCreation
-//       my_unittest --benchmark_filter=String
-//       my_unittest --benchmark_filter='Copy|Creation'
-int main(int argc, char** argv) {
-  benchmark::Initialize(&argc, argv);
-  benchmark::RunSpecifiedBenchmarks();
-  return 0;
-}
-
-// Sometimes a family of microbenchmarks can be implemented with
-// just one routine that takes an extra argument to specify which
-// one of the family of benchmarks to run.  For example, the following
-// code defines a family of microbenchmarks for measuring the speed
-// of memcpy() calls of different lengths:
-
-static void BM_memcpy(benchmark::State& state) {
-  char* src = new char[state.range_x()]; char* dst = new char[state.range_x()];
-  memset(src, 'x', state.range_x());
-  while (state.KeepRunning())
-    memcpy(dst, src, state.range_x());
-  state.SetBytesProcessed(int64_t(state.iterations()) *
-                          int64_t(state.range_x()));
-  delete[] src; delete[] dst;
-}
-BENCHMARK(BM_memcpy)->Arg(8)->Arg(64)->Arg(512)->Arg(1<<10)->Arg(8<<10);
-
-// The preceding code is quite repetitive, and can be replaced with the
-// following short-hand.  The following invocation will pick a few
-// appropriate arguments in the specified range and will generate a
-// microbenchmark for each such argument.
-BENCHMARK(BM_memcpy)->Range(8, 8<<10);
-
-// You might have a microbenchmark that depends on two inputs.  For
-// example, the following code defines a family of microbenchmarks for
-// measuring the speed of set insertion.
-static void BM_SetInsert(benchmark::State& state) {
-  while (state.KeepRunning()) {
-    state.PauseTiming();
-    set<int> data = ConstructRandomSet(state.range_x());
-    state.ResumeTiming();
-    for (int j = 0; j < state.range_y(); ++j)
-      data.insert(RandomNumber());
-  }
-}
-BENCHMARK(BM_SetInsert)
-   ->ArgPair(1<<10, 1)
-   ->ArgPair(1<<10, 8)
-   ->ArgPair(1<<10, 64)
-   ->ArgPair(1<<10, 512)
-   ->ArgPair(8<<10, 1)
-   ->ArgPair(8<<10, 8)
-   ->ArgPair(8<<10, 64)
-   ->ArgPair(8<<10, 512);
-
-// The preceding code is quite repetitive, and can be replaced with
-// the following short-hand.  The following macro will pick a few
-// appropriate arguments in the product of the two specified ranges
-// and will generate a microbenchmark for each such pair.
-BENCHMARK(BM_SetInsert)->RangePair(1<<10, 8<<10, 1, 512);
-
-// For more complex patterns of inputs, passing a custom function
-// to Apply allows programmatic specification of an
-// arbitrary set of arguments to run the microbenchmark on.
-// The following example enumerates a dense range on
-// one parameter, and a sparse range on the second.
-static void CustomArguments(benchmark::internal::Benchmark* b) {
-  for (int i = 0; i <= 10; ++i)
-    for (int j = 32; j <= 1024*1024; j *= 8)
-      b->ArgPair(i, j);
-}
-BENCHMARK(BM_SetInsert)->Apply(CustomArguments);
-
-// Templated microbenchmarks work the same way:
-// Produce then consume 'size' messages 'iters' times
-// Measures throughput in the absence of multiprogramming.
-template <class Q> int BM_Sequential(benchmark::State& state) {
-  Q q;
-  typename Q::value_type v;
-  while (state.KeepRunning()) {
-    for (int i = state.range_x(); i--; )
-      q.push(v);
-    for (int e = state.range_x(); e--; )
-      q.Wait(&v);
-  }
-  // actually messages, not bytes:
-  state.SetBytesProcessed(
-      static_cast<int64_t>(state.iterations())*state.range_x());
-}
-BENCHMARK_TEMPLATE(BM_Sequential, WaitQueue<int>)->Range(1<<0, 1<<10);
-
-Use `Benchmark::MinTime(double t)` to set the minimum time used to run the
-benchmark. This option overrides the `benchmark_min_time` flag.
-
-void BM_test(benchmark::State& state) {
- ... body ...
-}
-BENCHMARK(BM_test)->MinTime(2.0); // Run for at least 2 seconds.
-
-In a multithreaded test, it is guaranteed that none of the threads will start
-until all have called KeepRunning, and all will have finished before KeepRunning
-returns false. As such, any global setup or teardown you want to do can be
-wrapped in a check against the thread index:
-
-static void BM_MultiThreaded(benchmark::State& state) {
-  if (state.thread_index == 0) {
-    // Setup code here.
-  }
-  while (state.KeepRunning()) {
-    // Run the test as normal.
-  }
-  if (state.thread_index == 0) {
-    // Teardown code here.
-  }
-}
-BENCHMARK(BM_MultiThreaded)->Threads(4);
-
-
-If a benchmark runs a few milliseconds it may be hard to visually compare the
-measured times, since the output data is given in nanoseconds per default. In
-order to manually set the time unit, you can specify it manually:
-
-BENCHMARK(BM_test)->Unit(benchmark::kMillisecond);
-*/
-
-#ifndef BENCHMARK_BENCHMARK_API_H_
-#define BENCHMARK_BENCHMARK_API_H_
-
-#include <assert.h>
-#include <stddef.h>
-#include <stdint.h>
-
-#include "macros.h"
-
-namespace benchmark {
-class BenchmarkReporter;
-
-void Initialize(int* argc, char** argv);
-
-// Generate a list of benchmarks matching the specified --benchmark_filter flag
-// and if --benchmark_list_tests is specified return after printing the name
-// of each matching benchmark. Otherwise run each matching benchmark and
-// report the results.
-//
-// The second overload reports the results using the specified 'reporter'.
-//
-// RETURNS: The number of matching benchmarks.
-size_t RunSpecifiedBenchmarks();
-size_t RunSpecifiedBenchmarks(BenchmarkReporter* reporter);
-
-
-// If this routine is called, peak memory allocation past this point in the
-// benchmark is reported at the end of the benchmark report line. (It is
-// computed by running the benchmark once with a single iteration and a memory
-// tracer.)
-// TODO(dominic)
-// void MemoryUsage();
-
-namespace internal {
-class Benchmark;
-class BenchmarkImp;
-class BenchmarkFamilies;
-
-template <class T> struct Voider {
-    typedef void type;
-};
-
-template <class T, class = void>
-struct EnableIfString {};
-
-template <class T>
-struct EnableIfString<T, typename Voider<typename T::basic_string>::type> {
-    typedef int type;
-};
-
-void UseCharPointer(char const volatile*);
-
-// Take ownership of the pointer and register the benchmark. Return the
-// registered benchmark.
-Benchmark* RegisterBenchmarkInternal(Benchmark*);
-
-} // end namespace internal
-
-
-// The DoNotOptimize(...) function can be used to prevent a value or
-// expression from being optimized away by the compiler. This function is
-// intended to add little to no overhead.
-// See: https://youtu.be/nXaxk27zwlk?t=2441
-#if defined(__GNUC__)
-template <class Tp>
-inline BENCHMARK_ALWAYS_INLINE void DoNotOptimize(Tp const& value) {
-    asm volatile("" : : "g"(value) : "memory");
-}
-// Force the compiler to flush pending writes to global memory. Acts as an
-// effective read/write barrier
-inline BENCHMARK_ALWAYS_INLINE void ClobberMemory() {
-    asm volatile("" : : : "memory");
-}
-#else
-template <class Tp>
-inline BENCHMARK_ALWAYS_INLINE void DoNotOptimize(Tp const& value) {
-    internal::UseCharPointer(&reinterpret_cast<char const volatile&>(value));
-}
-// FIXME Add ClobberMemory() for non-gnu compilers
-#endif
-
-// TimeUnit is passed to a benchmark in order to specify the order of magnitude
-// for the measured time.
-enum TimeUnit {
-  kNanosecond,
-  kMicrosecond,
-  kMillisecond
-};
-
-// BigO is passed to a benchmark in order to specify the asymptotic computational 
-// complexity for the benchmark. In case oAuto is selected, complexity will be 
-// calculated automatically to the best fit.
-enum BigO {
-  oNone,
-  o1,
-  oN,
-  oNSquared,
-  oNCubed,
-  oLogN,
-  oNLogN,
-  oAuto,
-  oLambda
-};
-
-// BigOFunc is passed to a benchmark in order to specify the asymptotic 
-// computational complexity for the benchmark.
-typedef double(BigOFunc)(int);
-
-// State is passed to a running Benchmark and contains state for the
-// benchmark to use.
-class State {
-public:
-  State(size_t max_iters, bool has_x, int x, bool has_y, int y,
-        int thread_i, int n_threads);
-
-  // Returns true if the benchmark should continue through another iteration.
-  // NOTE: A benchmark may not return from the test until KeepRunning() has
-  // returned false.
-  bool KeepRunning() {
-    if (BENCHMARK_BUILTIN_EXPECT(!started_, false)) {
-      assert(!finished_);
-      started_ = true;
-      ResumeTiming();
-    }
-    bool const res = total_iterations_++ < max_iterations;
-    if (BENCHMARK_BUILTIN_EXPECT(!res, false)) {
-      assert(started_ && (!finished_ || error_occurred_));
-      if (!error_occurred_) {
-        PauseTiming();
-      }
-      // Total iterations now is one greater than max iterations. Fix this.
-      total_iterations_ = max_iterations;
-      finished_ = true;
-    }
-    return res;
-  }
-
-  // REQUIRES: timer is running and 'SkipWithError(...)' has not been called
-  //           by the current thread.
-  // Stop the benchmark timer.  If not called, the timer will be
-  // automatically stopped after KeepRunning() returns false for the first time.
-  //
-  // For threaded benchmarks the PauseTiming() function acts
-  // like a barrier.  I.e., the ith call by a particular thread to this
-  // function will block until all active threads have made their ith call.
-  // The timer will stop when the last thread has called this function.
-  //
-  // NOTE: PauseTiming()/ResumeTiming() are relatively
-  // heavyweight, and so their use should generally be avoided
-  // within each benchmark iteration, if possible.
-  void PauseTiming();
-
-  // REQUIRES: timer is not running and 'SkipWithError(...)' has not been called
-  //           by the current thread.
-  // Start the benchmark timer.  The timer is NOT running on entrance to the
-  // benchmark function. It begins running after the first call to KeepRunning()
-  //
-  // For threaded benchmarks the ResumeTiming() function acts
-  // like a barrier.  I.e., the ith call by a particular thread to this
-  // function will block until all active threads have made their ith call.
-  // The timer will start when the last thread has called this function.
-  //
-  // NOTE: PauseTiming()/ResumeTiming() are relatively
-  // heavyweight, and so their use should generally be avoided
-  // within each benchmark iteration, if possible.
-  void ResumeTiming();
-
-  // REQUIRES: 'SkipWithError(...)' has not been called previously by the
-  //            current thread.
-  // Skip any future iterations of the 'KeepRunning()' loop in the current
-  // thread and report an error with the specified 'msg'. After this call
-  // the user may explicitly 'return' from the benchmark.
-  //
-  // For threaded benchmarks only the current thread stops executing. If
-  // multiple threads report an error only the first error message is used.
-  // The current thread is no longer considered 'active' by
-  // 'PauseTiming()' and 'ResumingTiming()'.
-  //
-  // NOTE: Calling 'SkipWithError(...)' does not cause the benchmark to exit
-  // the current scope immediately. If the function is called from within
-  // the 'KeepRunning()' loop the current iteration will finish. It is the users
-  // responsibility to exit the scope as needed.
-  void SkipWithError(const char* msg);
-
-  // REQUIRES: called exactly once per iteration of the KeepRunning loop.
-  // Set the manually measured time for this benchmark iteration, which
-  // is used instead of automatically measured time if UseManualTime() was
-  // specified.
-  //
-  // For threaded benchmarks the SetIterationTime() function acts
-  // like a barrier.  I.e., the ith call by a particular thread to this
-  // function will block until all threads have made their ith call.
-  // The time will be set by the last thread to call this function.
-  void SetIterationTime(double seconds);
-
-  // Set the number of bytes processed by the current benchmark
-  // execution.  This routine is typically called once at the end of a
-  // throughput oriented benchmark.  If this routine is called with a
-  // value > 0, the report is printed in MB/sec instead of nanoseconds
-  // per iteration.
-  //
-  // REQUIRES: a benchmark has exited its KeepRunning loop.
-  BENCHMARK_ALWAYS_INLINE
-  void SetBytesProcessed(size_t bytes) {
-    bytes_processed_ = bytes;
-  }
-
-  BENCHMARK_ALWAYS_INLINE
-  size_t bytes_processed() const {
-    return bytes_processed_;
-  }
-
-  // If this routine is called with complexity_n > 0 and complexity report is requested for the 
-  // family benchmark, then current benchmark will be part of the computation and complexity_n will
-  // represent the length of N.
-  BENCHMARK_ALWAYS_INLINE
-  void SetComplexityN(int complexity_n) {
-    complexity_n_ = complexity_n;
-  }
-
-  BENCHMARK_ALWAYS_INLINE
-  size_t complexity_length_n() {
-    return complexity_n_;
-  }
-
-  // If this routine is called with items > 0, then an items/s
-  // label is printed on the benchmark report line for the currently
-  // executing benchmark. It is typically called at the end of a processing
-  // benchmark where a processing items/second output is desired.
-  //
-  // REQUIRES: a benchmark has exited its KeepRunning loop.
-  BENCHMARK_ALWAYS_INLINE
-  void SetItemsProcessed(size_t items) {
-    items_processed_ = items;
-  }
-
-  BENCHMARK_ALWAYS_INLINE
-  size_t items_processed() const {
-    return items_processed_;
-  }
-
-  // If this routine is called, the specified label is printed at the
-  // end of the benchmark report line for the currently executing
-  // benchmark.  Example:
-  //  static void BM_Compress(benchmark::State& state) {
-  //    ...
-  //    double compress = input_size / output_size;
-  //    state.SetLabel(StringPrintf("compress:%.1f%%", 100.0*compression));
-  //  }
-  // Produces output that looks like:
-  //  BM_Compress   50         50   14115038  compress:27.3%
-  //
-  // REQUIRES: a benchmark has exited its KeepRunning loop.
-  void SetLabel(const char* label);
-
-  // Allow the use of std::string without actually including <string>.
-  // This function does not participate in overload resolution unless StringType
-  // has the nested typename `basic_string`. This typename should be provided
-  // as an injected class name in the case of std::string.
-  template <class StringType>
-  void SetLabel(StringType const & str,
-                typename internal::EnableIfString<StringType>::type = 1) {
-    this->SetLabel(str.c_str());
-  }
-
-  // Range arguments for this run. CHECKs if the argument has been set.
-  BENCHMARK_ALWAYS_INLINE
-  int range_x() const {
-    assert(has_range_x_);
-    ((void)has_range_x_); // Prevent unused warning.
-    return range_x_;
-  }
-
-  BENCHMARK_ALWAYS_INLINE
-  int range_y() const {
-    assert(has_range_y_);
-    ((void)has_range_y_); // Prevent unused warning.
-    return range_y_;
-  }
-
-  BENCHMARK_ALWAYS_INLINE
-  size_t iterations() const { return total_iterations_; }
-
-private:
-  bool started_;
-  bool finished_;
-  size_t total_iterations_;
-
-  bool has_range_x_;
-  int range_x_;
-
-  bool has_range_y_;
-  int range_y_;
-
-  size_t bytes_processed_;
-  size_t items_processed_;
-
-  int complexity_n_;
-
-public:
-  // FIXME: Make this private somehow.
-  bool error_occurred_;
-public:
-  // Index of the executing thread. Values from [0, threads).
-  const int thread_index;
-  // Number of threads concurrently executing the benchmark.
-  const int threads;
-  const size_t max_iterations;
-
-private:
-  BENCHMARK_DISALLOW_COPY_AND_ASSIGN(State);
-};
-
-namespace internal {
-
-typedef void(Function)(State&);
-
-// ------------------------------------------------------
-// Benchmark registration object.  The BENCHMARK() macro expands
-// into an internal::Benchmark* object.  Various methods can
-// be called on this object to change the properties of the benchmark.
-// Each method returns "this" so that multiple method calls can
-// chained into one expression.
-class Benchmark {
-public:
-  virtual ~Benchmark();
-
-  // Note: the following methods all return "this" so that multiple
-  // method calls can be chained together in one expression.
-
-  // Run this benchmark once with "x" as the extra argument passed
-  // to the function.
-  // REQUIRES: The function passed to the constructor must accept an arg1.
-  Benchmark* Arg(int x);
-
-  // Run this benchmark with the given time unit for the generated output report
-  Benchmark* Unit(TimeUnit unit);
-
-  // Run this benchmark once for a number of values picked from the
-  // range [start..limit].  (start and limit are always picked.)
-  // REQUIRES: The function passed to the constructor must accept an arg1.
-  Benchmark* Range(int start, int limit);
-
-  // Run this benchmark once for every value in the range [start..limit]
-  // REQUIRES: The function passed to the constructor must accept an arg1.
-  Benchmark* DenseRange(int start, int limit);
-
-  // Run this benchmark once with "x,y" as the extra arguments passed
-  // to the function.
-  // REQUIRES: The function passed to the constructor must accept arg1,arg2.
-  Benchmark* ArgPair(int x, int y);
-
-  // Pick a set of values A from the range [lo1..hi1] and a set
-  // of values B from the range [lo2..hi2].  Run the benchmark for
-  // every pair of values in the cartesian product of A and B
-  // (i.e., for all combinations of the values in A and B).
-  // REQUIRES: The function passed to the constructor must accept arg1,arg2.
-  Benchmark* RangePair(int lo1, int hi1, int lo2, int hi2);
-
-  // Pass this benchmark object to *func, which can customize
-  // the benchmark by calling various methods like Arg, ArgPair,
-  // Threads, etc.
-  Benchmark* Apply(void (*func)(Benchmark* benchmark));
-
-  // Set the range multiplier for non-dense range. If not called, the range multiplier 
-  // kRangeMultiplier will be used.
-  Benchmark* RangeMultiplier(int multiplier);
-
-  // Set the minimum amount of time to use when running this benchmark. This
-  // option overrides the `benchmark_min_time` flag.
-  // REQUIRES: `t > 0`
-  Benchmark* MinTime(double t);
-
-  // Specify the amount of times to repeat this benchmark. This option overrides
-  // the `benchmark_repetitions` flag.
-  // REQUIRES: `n > 0`
-  Benchmark* Repetitions(int n);
-
-  // If a particular benchmark is I/O bound, runs multiple threads internally or
-  // if for some reason CPU timings are not representative, call this method. If
-  // called, the elapsed time will be used to control how many iterations are
-  // run, and in the printing of items/second or MB/seconds values.  If not
-  // called, the cpu time used by the benchmark will be used.
-  Benchmark* UseRealTime();
-
-  // If a benchmark must measure time manually (e.g. if GPU execution time is being
-  // measured), call this method. If called, each benchmark iteration should call
-  // SetIterationTime(seconds) to report the measured time, which will be used
-  // to control how many iterations are run, and in the printing of items/second
-  // or MB/second values.
-  Benchmark* UseManualTime();
-
-  // Set the asymptotic computational complexity for the benchmark. If called
-  // the asymptotic computational complexity will be shown on the output. 
-  Benchmark* Complexity(BigO complexity = benchmark::oAuto);
-
-  // Set the asymptotic computational complexity for the benchmark. If called
-  // the asymptotic computational complexity will be shown on the output.
-  Benchmark* Complexity(BigOFunc* complexity);
-
-  // Support for running multiple copies of the same benchmark concurrently
-  // in multiple threads.  This may be useful when measuring the scaling
-  // of some piece of code.
-
-  // Run one instance of this benchmark concurrently in t threads.
-  Benchmark* Threads(int t);
-
-  // Pick a set of values T from [min_threads,max_threads].
-  // min_threads and max_threads are always included in T.  Run this
-  // benchmark once for each value in T.  The benchmark run for a
-  // particular value t consists of t threads running the benchmark
-  // function concurrently.  For example, consider:
-  //    BENCHMARK(Foo)->ThreadRange(1,16);
-  // This will run the following benchmarks:
-  //    Foo in 1 thread
-  //    Foo in 2 threads
-  //    Foo in 4 threads
-  //    Foo in 8 threads
-  //    Foo in 16 threads
-  Benchmark* ThreadRange(int min_threads, int max_threads);
-
-  // Equivalent to ThreadRange(NumCPUs(), NumCPUs())
-  Benchmark* ThreadPerCpu();
-
-  virtual void Run(State& state) = 0;
-
-  // Used inside the benchmark implementation
-  struct Instance;
-
-protected:
-  explicit Benchmark(const char* name);
-  Benchmark(Benchmark const&);
-  void SetName(const char* name);
-
-private:
-  friend class BenchmarkFamilies;
-  BenchmarkImp* imp_;
-
-  Benchmark& operator=(Benchmark const&);
-};
-
-// The class used to hold all Benchmarks created from static function.
-// (ie those created using the BENCHMARK(...) macros.
-class FunctionBenchmark : public Benchmark {
-public:
-    FunctionBenchmark(const char* name, Function* func)
-        : Benchmark(name), func_(func)
-    {}
-
-    virtual void Run(State& st);
-private:
-    Function* func_;
-};
-
-}  // end namespace internal
-
-// The base class for all fixture tests.
-class Fixture: public internal::Benchmark {
-public:
-    Fixture() : internal::Benchmark("") {}
-
-    virtual void Run(State& st) {
-      this->SetUp(st);
-      this->BenchmarkCase(st);
-      this->TearDown(st);
-    }
-
-    virtual void SetUp(const State&) {}
-    virtual void TearDown(const State&) {}
-
-protected:
-    virtual void BenchmarkCase(State&) = 0;
-};
-
-}  // end namespace benchmark
-
-
-// ------------------------------------------------------
-// Macro to register benchmarks
-
-// Check that __COUNTER__ is defined and that __COUNTER__ increases by 1
-// every time it is expanded. X + 1 == X + 0 is used in case X is defined to be
-// empty. If X is empty the expression becomes (+1 == +0).
-#if defined(__COUNTER__) && (__COUNTER__ + 1 == __COUNTER__ + 0)
-#define BENCHMARK_PRIVATE_UNIQUE_ID __COUNTER__
-#else
-#define BENCHMARK_PRIVATE_UNIQUE_ID __LINE__
-#endif
-
-// Helpers for generating unique variable names
-#define BENCHMARK_PRIVATE_NAME(n) \
-    BENCHMARK_PRIVATE_CONCAT(_benchmark_, BENCHMARK_PRIVATE_UNIQUE_ID, n)
-#define BENCHMARK_PRIVATE_CONCAT(a, b, c) BENCHMARK_PRIVATE_CONCAT2(a, b, c)
-#define BENCHMARK_PRIVATE_CONCAT2(a, b, c) a##b##c
-
-#define BENCHMARK_PRIVATE_DECLARE(n)       \
-  static ::benchmark::internal::Benchmark* \
-  BENCHMARK_PRIVATE_NAME(n) BENCHMARK_UNUSED
-
-#define BENCHMARK(n) \
-    BENCHMARK_PRIVATE_DECLARE(n) =                               \
-        (::benchmark::internal::RegisterBenchmarkInternal(       \
-            new ::benchmark::internal::FunctionBenchmark(#n, n)))
-
-// Old-style macros
-#define BENCHMARK_WITH_ARG(n, a) BENCHMARK(n)->Arg((a))
-#define BENCHMARK_WITH_ARG2(n, a1, a2) BENCHMARK(n)->ArgPair((a1), (a2))
-#define BENCHMARK_WITH_UNIT(n, t) BENCHMARK(n)->Unit((t))
-#define BENCHMARK_RANGE(n, lo, hi) BENCHMARK(n)->Range((lo), (hi))
-#define BENCHMARK_RANGE2(n, l1, h1, l2, h2) \
-  BENCHMARK(n)->RangePair((l1), (h1), (l2), (h2))
-
-#if __cplusplus >= 201103L
-
-// Register a benchmark which invokes the function specified by `func`
-// with the additional arguments specified by `...`.
-//
-// For example:
-//
-// template <class ...ExtraArgs>`
-// void BM_takes_args(benchmark::State& state, ExtraArgs&&... extra_args) {
-//  [...]
-//}
-// /* Registers a benchmark named "BM_takes_args/int_string_test` */
-// BENCHMARK_CAPTURE(BM_takes_args, int_string_test, 42, std::string("abc"));
-#define BENCHMARK_CAPTURE(func, test_case_name, ...)                       \
-    BENCHMARK_PRIVATE_DECLARE(func) =                                      \
-        (::benchmark::internal::RegisterBenchmarkInternal(                 \
-            new ::benchmark::internal::FunctionBenchmark(                  \
-                    #func "/" #test_case_name,                             \
-                    [](::benchmark::State& st) { func(st, __VA_ARGS__); })))
-
-#endif // __cplusplus >= 11
-
-// This will register a benchmark for a templatized function.  For example:
-//
-// template<int arg>
-// void BM_Foo(int iters);
-//
-// BENCHMARK_TEMPLATE(BM_Foo, 1);
-//
-// will register BM_Foo<1> as a benchmark.
-#define BENCHMARK_TEMPLATE1(n, a) \
-  BENCHMARK_PRIVATE_DECLARE(n) =  \
-      (::benchmark::internal::RegisterBenchmarkInternal( \
-        new ::benchmark::internal::FunctionBenchmark(#n "<" #a ">", n<a>)))
-
-#define BENCHMARK_TEMPLATE2(n, a, b)                     \
-  BENCHMARK_PRIVATE_DECLARE(n) =                         \
-      (::benchmark::internal::RegisterBenchmarkInternal( \
-        new ::benchmark::internal::FunctionBenchmark(    \
-            #n "<" #a "," #b ">", n<a, b>)))
-
-#if __cplusplus >= 201103L
-#define BENCHMARK_TEMPLATE(n, ...)           \
-  BENCHMARK_PRIVATE_DECLARE(n) =             \
-      (::benchmark::internal::RegisterBenchmarkInternal( \
-        new ::benchmark::internal::FunctionBenchmark( \
-        #n "<" #__VA_ARGS__ ">", n<__VA_ARGS__>)))
-#else
-#define BENCHMARK_TEMPLATE(n, a) BENCHMARK_TEMPLATE1(n, a)
-#endif
-
-
-#define BENCHMARK_PRIVATE_DECLARE_F(BaseClass, Method)      \
-class BaseClass##_##Method##_Benchmark : public BaseClass { \
-public:                                                     \
-    BaseClass##_##Method##_Benchmark() : BaseClass() {      \
-        this->SetName(#BaseClass "/" #Method);}             \
-protected:                                                  \
-    virtual void BenchmarkCase(::benchmark::State&);        \
-};
-
-#define BENCHMARK_DEFINE_F(BaseClass, Method) \
-    BENCHMARK_PRIVATE_DECLARE_F(BaseClass, Method) \
-    void BaseClass##_##Method##_Benchmark::BenchmarkCase
-
-#define BENCHMARK_REGISTER_F(BaseClass, Method) \
-    BENCHMARK_PRIVATE_REGISTER_F(BaseClass##_##Method##_Benchmark)
-
-#define BENCHMARK_PRIVATE_REGISTER_F(TestName) \
-    BENCHMARK_PRIVATE_DECLARE(TestName) = \
-        (::benchmark::internal::RegisterBenchmarkInternal(new TestName()))
-
-// This macro will define and register a benchmark within a fixture class.
-#define BENCHMARK_F(BaseClass, Method) \
-    BENCHMARK_PRIVATE_DECLARE_F(BaseClass, Method) \
-    BENCHMARK_REGISTER_F(BaseClass, Method); \
-    void BaseClass##_##Method##_Benchmark::BenchmarkCase
-
-
-// Helper macro to create a main routine in a test that runs the benchmarks
-#define BENCHMARK_MAIN()                   \
-  int main(int argc, char** argv) {        \
-    ::benchmark::Initialize(&argc, argv);  \
-    ::benchmark::RunSpecifiedBenchmarks(); \
-  }
-
-#endif  // BENCHMARK_BENCHMARK_API_H_
diff --git a/include/benchmark/macros.h b/include/benchmark/macros.h
deleted file mode 100644
index 09d13c1..0000000
--- a/include/benchmark/macros.h
+++ /dev/null
@@ -1,56 +0,0 @@
-// Copyright 2015 Google Inc. All rights reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-#ifndef BENCHMARK_MACROS_H_
-#define BENCHMARK_MACROS_H_
-
-#if __cplusplus < 201103L
-# define BENCHMARK_DISALLOW_COPY_AND_ASSIGN(TypeName)  \
-    TypeName(const TypeName&);                         \
-    TypeName& operator=(const TypeName&)
-#else
-# define BENCHMARK_DISALLOW_COPY_AND_ASSIGN(TypeName)  \
-    TypeName(const TypeName&) = delete;                \
-    TypeName& operator=(const TypeName&) = delete
-#endif
-
-#if defined(__GNUC__)
-# define BENCHMARK_UNUSED __attribute__((unused))
-# define BENCHMARK_ALWAYS_INLINE __attribute__((always_inline))
-# define BENCHMARK_NOEXCEPT noexcept
-# define BENCHMARK_NOEXCEPT_OP(x) noexcept(x)
-#elif defined(_MSC_VER) && !defined(__clang__)
-# define BENCHMARK_UNUSED
-# define BENCHMARK_ALWAYS_INLINE __forceinline
-# if _MSC_VER >= 1900
-#  define BENCHMARK_NOEXCEPT noexcept
-#  define BENCHMARK_NOEXCEPT_OP(x) noexcept(x)
-# else
-#  define BENCHMARK_NOEXCEPT
-#  define BENCHMARK_NOEXCEPT_OP(x)
-# endif
-# define __func__ __FUNCTION__
-#else
-# define BENCHMARK_UNUSED
-# define BENCHMARK_ALWAYS_INLINE
-# define BENCHMARK_NOEXCEPT
-# define BENCHMARK_NOEXCEPT_OP(x)
-#endif
-
-#if defined(__GNUC__)
-# define BENCHMARK_BUILTIN_EXPECT(x, y) __builtin_expect(x, y)
-#else
-# define BENCHMARK_BUILTIN_EXPECT(x, y) x
-#endif
-
-#endif  // BENCHMARK_MACROS_H_
diff --git a/include/benchmark/reporter.h b/include/benchmark/reporter.h
deleted file mode 100644
index 22c97a0..0000000
--- a/include/benchmark/reporter.h
+++ /dev/null
@@ -1,216 +0,0 @@
-// Copyright 2015 Google Inc. All rights reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-#ifndef BENCHMARK_REPORTER_H_
-#define BENCHMARK_REPORTER_H_
-
-#include <cassert>
-#include <iosfwd>
-#include <string>
-#include <utility>
-#include <vector>
-
-#include "benchmark_api.h"  // For forward declaration of BenchmarkReporter
-
-namespace benchmark {
-
-// Interface for custom benchmark result printers.
-// By default, benchmark reports are printed to stdout. However an application
-// can control the destination of the reports by calling
-// RunSpecifiedBenchmarks and passing it a custom reporter object.
-// The reporter object must implement the following interface.
-class BenchmarkReporter {
- public:
-  struct Context {
-    int num_cpus;
-    double mhz_per_cpu;
-    bool cpu_scaling_enabled;
-
-    // The number of chars in the longest benchmark name.
-    size_t name_field_width;
-  };
-
-  struct Run {
-    Run() :
-      error_occurred(false),
-      iterations(1),
-      time_unit(kNanosecond),
-      real_accumulated_time(0),
-      cpu_accumulated_time(0),
-      bytes_per_second(0),
-      items_per_second(0),
-      max_heapbytes_used(0),
-      complexity(oNone),
-      complexity_n(0),
-      report_big_o(false),
-      report_rms(false) {}
-
-    std::string benchmark_name;
-    std::string report_label;  // Empty if not set by benchmark.
-    bool error_occurred;
-    std::string error_message;
-
-    int64_t iterations;
-    TimeUnit time_unit;
-    double real_accumulated_time;
-    double cpu_accumulated_time;
-
-    // Return a value representing the real time per iteration in the unit
-    // specified by 'time_unit'.
-    // NOTE: If 'iterations' is zero the returned value represents the
-    // accumulated time.
-    double GetAdjustedRealTime() const;
-
-    // Return a value representing the cpu time per iteration in the unit
-    // specified by 'time_unit'.
-    // NOTE: If 'iterations' is zero the returned value represents the
-    // accumulated time.
-    double GetAdjustedCPUTime() const;
-
-    // Zero if not set by benchmark.
-    double bytes_per_second;
-    double items_per_second;
-
-    // This is set to 0.0 if memory tracing is not enabled.
-    double max_heapbytes_used;
-
-    // Keep track of arguments to compute asymptotic complexity
-    BigO complexity;
-    BigOFunc* complexity_lambda;
-    int complexity_n;
-
-    // Inform print function whether the current run is a complexity report
-    bool report_big_o;
-    bool report_rms;
-  };
-
-  // Construct a BenchmarkReporter with the output stream set to 'std::cout'
-  // and the error stream set to 'std::cerr'
-  BenchmarkReporter();
-
-  // Called once for every suite of benchmarks run.
-  // The parameter "context" contains information that the
-  // reporter may wish to use when generating its report, for example the
-  // platform under which the benchmarks are running. The benchmark run is
-  // never started if this function returns false, allowing the reporter
-  // to skip runs based on the context information.
-  virtual bool ReportContext(const Context& context) = 0;
-
-  // Called once for each group of benchmark runs, gives information about
-  // cpu-time and heap memory usage during the benchmark run. If the group
-  // of runs contained more than two entries then 'report' contains additional
-  // elements representing the mean and standard deviation of those runs.
-  // Additionally if this group of runs was the last in a family of benchmarks
-  // 'reports' contains additional entries representing the asymptotic
-  // complexity and RMS of that benchmark family.
-  virtual void ReportRuns(const std::vector<Run>& report) = 0;
-
-  // Called once and only once after ever group of benchmarks is run and
-  // reported.
-  virtual void Finalize() {}
-
-  // REQUIRES: The object referenced by 'out' is valid for the lifetime
-  // of the reporter.
-  void SetOutputStream(std::ostream* out) {
-    assert(out);
-    output_stream_ = out;
-  }
-
-  // REQUIRES: The object referenced by 'err' is valid for the lifetime
-  // of the reporter.
-  void SetErrorStream(std::ostream* err) {
-    assert(err);
-    error_stream_ = err;
-  }
-
-  std::ostream& GetOutputStream() const {
-    return *output_stream_;
-  }
-
-  std::ostream& GetErrorStream() const {
-    return *error_stream_;
-  }
-
-  virtual ~BenchmarkReporter();
-
-  // Write a human readable string to 'out' representing the specified
-  // 'context'.
-  // REQUIRES: 'out' is non-null.
-  static void PrintBasicContext(std::ostream* out, Context const& context);
-
- private:
-  std::ostream* output_stream_;
-  std::ostream* error_stream_;
-};
-
-// Simple reporter that outputs benchmark data to the console. This is the
-// default reporter used by RunSpecifiedBenchmarks().
-class ConsoleReporter : public BenchmarkReporter {
- public:
-  virtual bool ReportContext(const Context& context);
-  virtual void ReportRuns(const std::vector<Run>& reports);
-
- protected:
-  virtual void PrintRunData(const Run& report);
-
-  size_t name_field_width_;
-};
-
-class JSONReporter : public BenchmarkReporter {
- public:
-  JSONReporter() : first_report_(true) {}
-  virtual bool ReportContext(const Context& context);
-  virtual void ReportRuns(const std::vector<Run>& reports);
-  virtual void Finalize();
-
- private:
-  void PrintRunData(const Run& report);
-
-  bool first_report_;
-};
-
-class CSVReporter : public BenchmarkReporter {
- public:
-  virtual bool ReportContext(const Context& context);
-  virtual void ReportRuns(const std::vector<Run>& reports);
-
- private:
-  void PrintRunData(const Run& report);
-};
-
-inline const char* GetTimeUnitString(TimeUnit unit) {
-  switch (unit) {
-    case kMillisecond:
-      return "ms";
-    case kMicrosecond:
-      return "us";
-    case kNanosecond:
-    default:
-      return "ns";
-  }
-}
-
-inline double GetTimeUnitMultiplier(TimeUnit unit) {
-  switch (unit) {
-    case kMillisecond:
-      return 1e3;
-    case kMicrosecond:
-      return 1e6;
-    case kNanosecond:
-    default:
-      return 1e9;
-  }
-}
-
-}  // end namespace benchmark
-#endif  // BENCHMARK_REPORTER_H_
diff --git a/mingw.py b/mingw.py
deleted file mode 100644
index 706ad55..0000000
--- a/mingw.py
+++ /dev/null
@@ -1,320 +0,0 @@
-#! /usr/bin/env python
-# encoding: utf-8
-
-import argparse
-import errno
-import logging
-import os
-import platform
-import re
-import sys
-import subprocess
-import tempfile
-
-try:
-    import winreg
-except ImportError:
-    import _winreg as winreg
-try:
-    import urllib.request as request
-except ImportError:
-    import urllib as request
-try:
-    import urllib.parse as parse
-except ImportError:
-    import urlparse as parse
-
-class EmptyLogger(object):
-    '''
-    Provides an implementation that performs no logging
-    '''
-    def debug(self, *k, **kw):
-        pass
-    def info(self, *k, **kw):
-        pass
-    def warn(self, *k, **kw):
-        pass
-    def error(self, *k, **kw):
-        pass
-    def critical(self, *k, **kw):
-        pass
-    def setLevel(self, *k, **kw):
-        pass
-
-urls = (
-    'http://downloads.sourceforge.net/project/mingw-w64/Toolchains%20'
-        'targetting%20Win32/Personal%20Builds/mingw-builds/installer/'
-        'repository.txt',
-    'http://downloads.sourceforge.net/project/mingwbuilds/host-windows/'
-        'repository.txt'
-)
-'''
-A list of mingw-build repositories
-'''
-
-def repository(urls = urls, log = EmptyLogger()):
-    '''
-    Downloads and parse mingw-build repository files and parses them
-    '''
-    log.info('getting mingw-builds repository')
-    versions = {}
-    re_sourceforge = re.compile(r'http://sourceforge.net/projects/([^/]+)/files')
-    re_sub = r'http://downloads.sourceforge.net/project/\1'
-    for url in urls:
-        log.debug(' - requesting: %s', url)
-        socket = request.urlopen(url)
-        repo = socket.read()
-        if not isinstance(repo, str):
-            repo = repo.decode();
-        socket.close()
-        for entry in repo.split('\n')[:-1]:
-            value = entry.split('|')
-            version = tuple([int(n) for n in value[0].strip().split('.')])
-            version = versions.setdefault(version, {})
-            arch = value[1].strip()
-            if arch == 'x32':
-                arch = 'i686'
-            elif arch == 'x64':
-                arch = 'x86_64'
-            arch = version.setdefault(arch, {})
-            threading = arch.setdefault(value[2].strip(), {})
-            exceptions = threading.setdefault(value[3].strip(), {})
-            revision = exceptions.setdefault(int(value[4].strip()[3:]),
-                re_sourceforge.sub(re_sub, value[5].strip()))
-    return versions
-
-def find_in_path(file, path=None):
-    '''
-    Attempts to find an executable in the path
-    '''
-    if platform.system() == 'Windows':
-        file += '.exe'
-    if path is None:
-        path = os.environ.get('PATH', '')
-    if type(path) is type(''):
-        path = path.split(os.pathsep)
-    return list(filter(os.path.exists,
-        map(lambda dir, file=file: os.path.join(dir, file), path)))
-
-def find_7zip(log = EmptyLogger()):
-    '''
-    Attempts to find 7zip for unpacking the mingw-build archives
-    '''
-    log.info('finding 7zip')
-    path = find_in_path('7z')
-    if not path:
-        key = winreg.OpenKey(winreg.HKEY_LOCAL_MACHINE, r'SOFTWARE\7-Zip')
-        path, _ = winreg.QueryValueEx(key, 'Path')
-        path = [os.path.join(path, '7z.exe')]
-    log.debug('found \'%s\'', path[0])
-    return path[0]
-
-find_7zip()
-
-def unpack(archive, location, log = EmptyLogger()):
-    '''
-    Unpacks a mingw-builds archive
-    '''
-    sevenzip = find_7zip(log)
-    log.info('unpacking %s', os.path.basename(archive))
-    cmd = [sevenzip, 'x', archive, '-o' + location, '-y']
-    log.debug(' - %r', cmd)
-    with open(os.devnull, 'w') as devnull:
-        subprocess.check_call(cmd, stdout = devnull)
-
-def download(url, location, log = EmptyLogger()):
-    '''
-    Downloads and unpacks a mingw-builds archive
-    '''
-    log.info('downloading MinGW')
-    log.debug(' - url: %s', url)
-    log.debug(' - location: %s', location)
-
-    re_content = re.compile(r'attachment;[ \t]*filename=(")?([^"]*)(")?[\r\n]*')
-
-    stream = request.urlopen(url)
-    try:
-        content = stream.getheader('Content-Disposition') or ''
-    except AttributeError:
-        content = stream.headers.getheader('Content-Disposition') or ''
-    matches = re_content.match(content)
-    if matches:
-        filename = matches.group(2)
-    else:
-        parsed = parse.urlparse(stream.geturl())
-        filename = os.path.basename(parsed.path)
-
-    try:
-        os.makedirs(location)
-    except OSError as e:
-        if e.errno == errno.EEXIST and os.path.isdir(location):
-            pass
-        else:
-            raise
-
-    archive = os.path.join(location, filename)
-    with open(archive, 'wb') as out:
-        while True:
-            buf = stream.read(1024)
-            if not buf:
-                break
-            out.write(buf)
-    unpack(archive, location, log = log)
-    os.remove(archive)
-
-    possible = os.path.join(location, 'mingw64')
-    if not os.path.exists(possible):
-        possible = os.path.join(location, 'mingw32')
-        if not os.path.exists(possible):
-            raise ValueError('Failed to find unpacked MinGW: ' + possible)
-    return possible
-
-def root(location = None, arch = None, version = None, threading = None,
-        exceptions = None, revision = None, log = EmptyLogger()):
-    '''
-    Returns the root folder of a specific version of the mingw-builds variant
-    of gcc. Will download the compiler if needed
-    '''
-
-    # Get the repository if we don't have all the information
-    if not (arch and version and threading and exceptions and revision):
-        versions = repository(log = log)
-
-    # Determine some defaults
-    version = version or max(versions.keys())
-    if not arch:
-        arch = platform.machine().lower()
-        if arch == 'x86':
-            arch = 'i686'
-        elif arch == 'amd64':
-            arch = 'x86_64'
-    if not threading:
-        keys = versions[version][arch].keys()
-        if 'posix' in keys:
-            threading = 'posix'
-        elif 'win32' in keys:
-            threading = 'win32'
-        else:
-            threading = keys[0]
-    if not exceptions:
-        keys = versions[version][arch][threading].keys()
-        if 'seh' in keys:
-            exceptions = 'seh'
-        elif 'sjlj' in keys:
-            exceptions = 'sjlj'
-        else:
-            exceptions = keys[0]
-    if revision == None:
-        revision = max(versions[version][arch][threading][exceptions].keys())
-    if not location:
-        location = os.path.join(tempfile.gettempdir(), 'mingw-builds')
-
-    # Get the download url
-    url = versions[version][arch][threading][exceptions][revision]
-
-    # Tell the user whatzzup
-    log.info('finding MinGW %s', '.'.join(str(v) for v in version))
-    log.debug(' - arch: %s', arch)
-    log.debug(' - threading: %s', threading)
-    log.debug(' - exceptions: %s', exceptions)
-    log.debug(' - revision: %s', revision)
-    log.debug(' - url: %s', url)
-
-    # Store each specific revision differently
-    slug = '{version}-{arch}-{threading}-{exceptions}-rev{revision}'
-    slug = slug.format(
-        version = '.'.join(str(v) for v in version),
-        arch = arch,
-        threading = threading,
-        exceptions = exceptions,
-        revision = revision
-    )
-    if arch == 'x86_64':
-        root_dir = os.path.join(location, slug, 'mingw64')
-    elif arch == 'i686':
-        root_dir = os.path.join(location, slug, 'mingw32')
-    else:
-        raise ValueError('Unknown MinGW arch: ' + arch)
-
-    # Download if needed
-    if not os.path.exists(root_dir):
-        downloaded = download(url, os.path.join(location, slug), log = log)
-        if downloaded != root_dir:
-            raise ValueError('The location of mingw did not match\n%s\n%s'
-                % (downloaded, root_dir))
-
-    return root_dir
-
-def str2ver(string):
-    '''
-    Converts a version string into a tuple
-    '''
-    try:
-        version = tuple(int(v) for v in string.split('.'))
-        if len(version) is not 3:
-            raise ValueError()
-    except ValueError:
-        raise argparse.ArgumentTypeError(
-            'please provide a three digit version string')
-    return version
-
-def main():
-    '''
-    Invoked when the script is run directly by the python interpreter
-    '''
-    parser = argparse.ArgumentParser(
-        description = 'Downloads a specific version of MinGW',
-        formatter_class = argparse.ArgumentDefaultsHelpFormatter
-    )
-    parser.add_argument('--location',
-        help = 'the location to download the compiler to',
-        default = os.path.join(tempfile.gettempdir(), 'mingw-builds'))
-    parser.add_argument('--arch', required = True, choices = ['i686', 'x86_64'],
-        help = 'the target MinGW architecture string')
-    parser.add_argument('--version', type = str2ver,
-        help = 'the version of GCC to download')
-    parser.add_argument('--threading', choices = ['posix', 'win32'],
-        help = 'the threading type of the compiler')
-    parser.add_argument('--exceptions', choices = ['sjlj', 'seh', 'dwarf'],
-        help = 'the method to throw exceptions')
-    parser.add_argument('--revision', type=int,
-        help = 'the revision of the MinGW release')
-    group = parser.add_mutually_exclusive_group()
-    group.add_argument('-v', '--verbose', action='store_true',
-        help='increase the script output verbosity')
-    group.add_argument('-q', '--quiet', action='store_true',
-        help='only print errors and warning')
-    args = parser.parse_args()
-
-    # Create the logger
-    logger = logging.getLogger('mingw')
-    handler = logging.StreamHandler()
-    formatter = logging.Formatter('%(message)s')
-    handler.setFormatter(formatter)
-    logger.addHandler(handler)
-    logger.setLevel(logging.INFO)
-    if args.quiet:
-        logger.setLevel(logging.WARN)
-    if args.verbose:
-        logger.setLevel(logging.DEBUG)
-
-    # Get MinGW
-    root_dir = root(location = args.location, arch = args.arch,
-        version = args.version, threading = args.threading,
-        exceptions = args.exceptions, revision = args.revision,
-        log = logger)
-
-    sys.stdout.write('%s\n' % os.path.join(root_dir, 'bin'))
-
-if __name__ == '__main__':
-    try:
-        main()
-    except IOError as e:
-        sys.stderr.write('IO error: %s\n' % e)
-        sys.exit(1)
-    except OSError as e:
-        sys.stderr.write('OS error: %s\n' % e)
-        sys.exit(1)
-    except KeyboardInterrupt as e:
-        sys.stderr.write('Killed\n')
-        sys.exit(1)
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..85e8986
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,2 @@
+numpy == 1.19.4
+scipy == 1.5.4
diff --git a/setup.py b/setup.py
new file mode 100644
index 0000000..5cdab10
--- /dev/null
+++ b/setup.py
@@ -0,0 +1,140 @@
+import os
+import posixpath
+import re
+import shutil
+import sys
+
+from distutils import sysconfig
+import setuptools
+from setuptools.command import build_ext
+
+
+HERE = os.path.dirname(os.path.abspath(__file__))
+
+
+IS_WINDOWS = sys.platform.startswith("win")
+
+
+def _get_version():
+    """Return the `__version__` value assigned in google_benchmark/__init__.py."""
+    init_path = os.path.join(
+        HERE, "bindings", "python", "google_benchmark", "__init__.py"
+    )
+    with open(init_path) as init_file:
+        # Only the first line starting with `__version__` is considered.
+        for candidate in init_file:
+            if candidate.startswith("__version__"):
+                # Execute just that one assignment in a scratch namespace.
+                scope = {}
+                exec(candidate, scope)  # pylint: disable=exec-used
+                return scope["__version__"]
+    # No matching line was found anywhere in the file.
+    raise ValueError("__version__ not defined in __init__.py")
+
+
+def _parse_requirements(path):
+    """Return the requirement lines of the file `path`, resolved against HERE."""
+    with open(os.path.join(HERE, path)) as requirements:
+        # Strip both ends so an indented "#" comment is recognised as one.
+        stripped = (line.strip() for line in requirements)
+        # Blank lines and comment lines carry no requirement; skip them.
+        return [line for line in stripped if line and not line.startswith("#")]
+
+
+class BazelExtension(setuptools.Extension):
+    """Extension whose binary comes from a Bazel BUILD target, not from sources."""
+
+    def __init__(self, name, bazel_target):
+        self.bazel_target = bazel_target
+        # A label "//pkg/path:name" yields relpath "pkg/path" and target "name".
+        package_path, target_name = posixpath.relpath(bazel_target, "//").split(":")
+        self.relpath, self.target_name = package_path, target_name
+        setuptools.Extension.__init__(self, name, sources=[])
+
+
+class BuildBazelExtension(build_ext.build_ext):
+    """A `build_ext` command that builds each extension by invoking Bazel."""
+
+    def run(self):
+        """Build every extension with Bazel, then run the stock build_ext."""
+        for ext in self.extensions:
+            self.bazel_build(ext)
+        build_ext.build_ext.run(self)
+
+    def bazel_build(self, ext):
+        """Build `ext.bazel_target` with Bazel and copy the output into place.
+
+        Rewrites ./WORKSPACE so the path tagged "May be overwritten by
+        setup.py." holds this interpreter's include directory.
+        """
+        with open("WORKSPACE", "r") as workspace:
+            workspace_contents = workspace.read()
+
+        with open("WORKSPACE", "w") as workspace:
+            workspace.write(
+                re.sub(
+                    r'(?<=path = ").*(?=",  # May be overwritten by setup\.py\.)',
+                    sysconfig.get_python_inc().replace(os.path.sep, posixpath.sep),
+                    workspace_contents,
+                )
+            )
+
+        # exist_ok replaces a racy "check, then create" sequence.
+        os.makedirs(self.build_temp, exist_ok=True)
+
+        bazel_argv = [
+            "bazel",
+            "build",
+            ext.bazel_target,
+            "--symlink_prefix=" + os.path.join(self.build_temp, "bazel-"),
+            "--compilation_mode=" + ("dbg" if self.debug else "opt"),
+        ]
+        if IS_WINDOWS:
+            # Link with python*.lib.
+            bazel_argv += ["--linkopt=/LIBPATH:" + d for d in self.library_dirs]
+        self.spawn(bazel_argv)
+
+        # Read the artifact via the bazel-bin symlink placed under build_temp.
+        shared_lib_suffix = '.dll' if IS_WINDOWS else '.so'
+        ext_bazel_bin_path = os.path.join(
+            self.build_temp, 'bazel-bin',
+            ext.relpath, ext.target_name + shared_lib_suffix)
+        ext_dest_path = self.get_ext_fullpath(ext.name)
+        os.makedirs(os.path.dirname(ext_dest_path), exist_ok=True)
+        shutil.copyfile(ext_bazel_bin_path, ext_dest_path)
+
+
+setuptools.setup(
+    name="google_benchmark",
+    version=_get_version(),
+    url="https://github.com/google/benchmark",
+    description="A library to benchmark code snippets.",
+    author="Google",
+    author_email="benchmark-py@google.com",
+    # Python packages are discovered under bindings/python.
+    package_dir={"": "bindings/python"},
+    packages=setuptools.find_packages("bindings/python"),
+    install_requires=_parse_requirements("bindings/python/requirements.txt"),
+    cmdclass=dict(build_ext=BuildBazelExtension),
+    ext_modules=[
+        BazelExtension(
+            "google_benchmark._benchmark",
+            "//bindings/python/google_benchmark:_benchmark",
+        )
+    ],
+    zip_safe=False,
+    # PyPI package information (trove classifiers, license, keywords).
+    classifiers=[
+        "Development Status :: 4 - Beta",
+        "Intended Audience :: Developers",
+        "Intended Audience :: Science/Research",
+        "License :: OSI Approved :: Apache Software License",
+        "Programming Language :: Python :: 3.6",
+        "Programming Language :: Python :: 3.7",
+        "Programming Language :: Python :: 3.8",
+        "Topic :: Software Development :: Testing",
+        "Topic :: System :: Benchmark",
+    ],
+    license="Apache 2.0",
+    keywords="benchmark",
+)
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index 6dab64b..35d559e 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -1,51 +1,114 @@
 # Allow the source files to find headers in src/
+include(GNUInstallDirs)
 include_directories(${PROJECT_SOURCE_DIR}/src)
 
-# Define the source files
-set(SOURCE_FILES "benchmark.cc" "colorprint.cc" "commandlineflags.cc"
-                 "console_reporter.cc" "csv_reporter.cc" "json_reporter.cc"
-                 "log.cc" "reporter.cc" "sleep.cc" "string_util.cc"
-                 "sysinfo.cc" "walltime.cc" "complexity.cc")
-# Determine the correct regular expression engine to use
-if(HAVE_STD_REGEX)
-  set(RE_FILES "re_std.cc")
-elseif(HAVE_GNU_POSIX_REGEX)
-  set(RE_FILES "re_posix.cc")
-elseif(HAVE_POSIX_REGEX)
-  set(RE_FILES "re_posix.cc")
-else()
-  message(FATAL_ERROR "Failed to determine the source files for the regular expression backend")
+if (DEFINED BENCHMARK_CXX_LINKER_FLAGS)
+  list(APPEND CMAKE_SHARED_LINKER_FLAGS ${BENCHMARK_CXX_LINKER_FLAGS})
+  list(APPEND CMAKE_MODULE_LINKER_FLAGS ${BENCHMARK_CXX_LINKER_FLAGS})
 endif()
 
-add_library(benchmark ${SOURCE_FILES} ${RE_FILES})
-
+file(GLOB
+  SOURCE_FILES
+    *.cc
+    ${PROJECT_SOURCE_DIR}/include/benchmark/*.h
+    ${CMAKE_CURRENT_SOURCE_DIR}/*.h)
+file(GLOB BENCHMARK_MAIN "benchmark_main.cc")
+foreach(item ${BENCHMARK_MAIN})
+  list(REMOVE_ITEM SOURCE_FILES "${item}")
+endforeach()
 
+add_library(benchmark ${SOURCE_FILES})
+add_library(benchmark::benchmark ALIAS benchmark)
 set_target_properties(benchmark PROPERTIES
   OUTPUT_NAME "benchmark"
   VERSION ${GENERIC_LIB_VERSION}
   SOVERSION ${GENERIC_LIB_SOVERSION}
 )
+target_include_directories(benchmark PUBLIC
+    $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/../include>
+    )
 
 # Link threads.
-target_link_libraries(benchmark ${CMAKE_THREAD_LIBS_INIT})
+target_link_libraries(benchmark  ${BENCHMARK_CXX_LIBRARIES} ${CMAKE_THREAD_LIBS_INIT})
+find_library(LIBRT rt)
+if(LIBRT)
+  target_link_libraries(benchmark ${LIBRT})
+endif()
+
+if(CMAKE_BUILD_TYPE)
+  string(TOUPPER ${CMAKE_BUILD_TYPE} CMAKE_BUILD_TYPE_UPPER)
+endif()
+if(NOT CMAKE_THREAD_LIBS_INIT AND "${CMAKE_CXX_FLAGS} ${CMAKE_CXX_FLAGS_${CMAKE_BUILD_TYPE_UPPER}}" MATCHES ".*-fsanitize=[^ ]*address.*")
+  message(WARNING "CMake's FindThreads.cmake did not fail, but CMAKE_THREAD_LIBS_INIT ended up being empty. This was fixed in https://github.com/Kitware/CMake/commit/d53317130e84898c5328c237186dbd995aaf1c12 Let's guess that -pthread is sufficient.")
+  target_link_libraries(benchmark -pthread)
+endif()
 
 # We need extra libraries on Windows
 if(${CMAKE_SYSTEM_NAME} MATCHES "Windows")
-  target_link_libraries(benchmark Shlwapi)
+  target_link_libraries(benchmark shlwapi)
+endif()
+
+# We need extra libraries on Solaris
+if(${CMAKE_SYSTEM_NAME} MATCHES "SunOS")
+  target_link_libraries(benchmark kstat)
 endif()
 
-# Expose public API
-target_include_directories(benchmark PUBLIC ${PROJECT_SOURCE_DIR}/include)
-
-# Install target (will install the library to specified CMAKE_INSTALL_PREFIX variable)
-install(
-  TARGETS benchmark
-  ARCHIVE DESTINATION lib
-  LIBRARY DESTINATION lib
-  RUNTIME DESTINATION bin
-  COMPONENT library)
-
-install(
-  DIRECTORY "${PROJECT_SOURCE_DIR}/include/benchmark"
-  DESTINATION include
-  FILES_MATCHING PATTERN "*.*h")
+# Benchmark main library (built from benchmark_main.cc, links against benchmark)
+add_library(benchmark_main "benchmark_main.cc")
+add_library(benchmark::benchmark_main ALIAS benchmark_main)
+set_target_properties(benchmark_main PROPERTIES
+  OUTPUT_NAME "benchmark_main"
+  VERSION ${GENERIC_LIB_VERSION}
+  SOVERSION ${GENERIC_LIB_SOVERSION}
+)
+target_include_directories(benchmark_main PUBLIC
+    $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/../include>
+    )
+target_link_libraries(benchmark_main benchmark::benchmark)
+
+
+set(generated_dir "${CMAKE_CURRENT_BINARY_DIR}/generated")
+
+set(version_config "${generated_dir}/${PROJECT_NAME}ConfigVersion.cmake")
+set(project_config "${generated_dir}/${PROJECT_NAME}Config.cmake")
+set(pkg_config "${generated_dir}/${PROJECT_NAME}.pc")
+set(targets_export_name "${PROJECT_NAME}Targets")
+
+set(namespace "${PROJECT_NAME}::")
+
+include(CMakePackageConfigHelpers)
+write_basic_package_version_file(
+  "${version_config}" VERSION ${GENERIC_LIB_VERSION} COMPATIBILITY SameMajorVersion
+)
+
+configure_file("${PROJECT_SOURCE_DIR}/cmake/Config.cmake.in" "${project_config}" @ONLY)
+configure_file("${PROJECT_SOURCE_DIR}/cmake/benchmark.pc.in" "${pkg_config}" @ONLY)
+
+if (BENCHMARK_ENABLE_INSTALL)
+  # Install target (will install the library to specified CMAKE_INSTALL_PREFIX variable)
+  install(
+    TARGETS benchmark benchmark_main
+    EXPORT ${targets_export_name}
+    ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR}
+    LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR}
+    RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR}
+    INCLUDES DESTINATION ${CMAKE_INSTALL_INCLUDEDIR})
+
+  install(
+    DIRECTORY "${PROJECT_SOURCE_DIR}/include/benchmark"
+    DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}
+    FILES_MATCHING PATTERN "*.*h")
+
+  install(
+      FILES "${project_config}" "${version_config}"
+      DESTINATION "${CMAKE_INSTALL_LIBDIR}/cmake/${PROJECT_NAME}")
+
+  install(
+      FILES "${pkg_config}"
+      DESTINATION "${CMAKE_INSTALL_LIBDIR}/pkgconfig")
+
+  install(
+      EXPORT "${targets_export_name}"
+      NAMESPACE "${namespace}"
+      DESTINATION "${CMAKE_INSTALL_LIBDIR}/cmake/${PROJECT_NAME}")
+endif()
diff --git a/src/arraysize.h b/src/arraysize.h
index 638a52a..51a50f2 100644
--- a/src/arraysize.h
+++ b/src/arraysize.h
@@ -11,7 +11,6 @@ namespace internal {
 // a pointer by mistake, you will get a compile-time error.
 //
 
-
 // This template function declaration is used in defining arraysize.
 // Note that the function doesn't need an implementation, as we only
 // use its type.
@@ -28,7 +27,7 @@ char (&ArraySizeHelper(const T (&array)[N]))[N];
 
 #define arraysize(array) (sizeof(::benchmark::internal::ArraySizeHelper(array)))
 
-} // end namespace internal
-} // end namespace benchmark
+}  // end namespace internal
+}  // end namespace benchmark
 
-#endif // BENCHMARK_ARRAYSIZE_H_
+#endif  // BENCHMARK_ARRAYSIZE_H_
diff --git a/src/benchmark.cc b/src/benchmark.cc
index cb8e132..1c049f2 100644
--- a/src/benchmark.cc
+++ b/src/benchmark.cc
@@ -13,1045 +13,404 @@
 // limitations under the License.
 
 #include "benchmark/benchmark.h"
+#include "benchmark_api_internal.h"
+#include "benchmark_runner.h"
 #include "internal_macros.h"
 
 #ifndef BENCHMARK_OS_WINDOWS
-#include <sys/time.h>
+#ifndef BENCHMARK_OS_FUCHSIA
 #include <sys/resource.h>
+#endif
+#include <sys/time.h>
 #include <unistd.h>
 #endif
 
-#include <cstdlib>
-#include <cstring>
-#include <cstdio>
 #include <algorithm>
 #include <atomic>
 #include <condition_variable>
+#include <cstdio>
+#include <cstdlib>
+#include <fstream>
 #include <iostream>
 #include <memory>
+#include <string>
 #include <thread>
+#include <utility>
 
 #include "check.h"
+#include "colorprint.h"
 #include "commandlineflags.h"
 #include "complexity.h"
+#include "counter.h"
+#include "internal_macros.h"
 #include "log.h"
 #include "mutex.h"
 #include "re.h"
-#include "stat.h"
+#include "statistics.h"
 #include "string_util.h"
-#include "sysinfo.h"
-#include "walltime.h"
-
-DEFINE_bool(benchmark_list_tests, false,
-            "Print a list of benchmarks. This option overrides all other "
-            "options.");
-
-DEFINE_string(benchmark_filter, ".",
-              "A regular expression that specifies the set of benchmarks "
-              "to execute.  If this flag is empty, no benchmarks are run.  "
-              "If this flag is the string \"all\", all benchmarks linked "
-              "into the process are run.");
-
-DEFINE_double(benchmark_min_time, 0.5,
-              "Minimum number of seconds we should run benchmark before "
-              "results are considered significant.  For cpu-time based "
-              "tests, this is the lower bound on the total cpu time "
-              "used by all threads that make up the test.  For real-time "
-              "based tests, this is the lower bound on the elapsed time "
-              "of the benchmark execution, regardless of number of "
-              "threads.");
-
-DEFINE_int32(benchmark_repetitions, 1,
-             "The number of runs of each benchmark. If greater than 1, the "
-             "mean and standard deviation of the runs will be reported.");
-
-DEFINE_string(benchmark_format, "console",
-              "The format to use for console output. Valid values are "
-              "'console', 'json', or 'csv'.");
-
-DEFINE_bool(color_print, true, "Enables colorized logging.");
-
-DEFINE_int32(v, 0, "The level of verbose logging to output");
-
+#include "thread_manager.h"
+#include "thread_timer.h"
+
+// Print a list of benchmarks. This option overrides all other options.
+DEFINE_bool(benchmark_list_tests, false);
+
+// A regular expression that specifies the set of benchmarks to execute.  If
+// this flag is empty, or if this flag is the string "all", all benchmarks
+// linked into the binary are run.
+DEFINE_string(benchmark_filter, ".");
+
+// Minimum number of seconds we should run benchmark before results are
+// considered significant.  For cpu-time based tests, this is the lower bound
+// on the total cpu time used by all threads that make up the test.  For
+// real-time based tests, this is the lower bound on the elapsed time of the
+// benchmark execution, regardless of number of threads.
+DEFINE_double(benchmark_min_time, 0.5);
+
+// The number of runs of each benchmark. If greater than 1, the mean and
+// standard deviation of the runs will be reported.
+DEFINE_int32(benchmark_repetitions, 1);
+
+// Report the result of each benchmark repetition. When 'true' is specified,
+// only the mean, standard deviation, and other statistics are reported for
+// repeated benchmarks. Affects all reporters.
+DEFINE_bool(benchmark_report_aggregates_only, false);
+
+// Display the result of each benchmark repetition. When 'true' is specified,
+// only the mean, standard deviation, and other statistics are displayed for
+// repeated benchmarks. Unlike benchmark_report_aggregates_only, only affects
+// the display reporter, but *NOT* the file reporter, which will still contain
+// all the output.
+DEFINE_bool(benchmark_display_aggregates_only, false);
+
+// The format to use for console output.
+// Valid values are 'console', 'json', or 'csv'.
+DEFINE_string(benchmark_format, "console");
+
+// The format to use for file output.
+// Valid values are 'console', 'json', or 'csv'.
+DEFINE_string(benchmark_out_format, "json");
+
+// The file to write additional output to.
+DEFINE_string(benchmark_out, "");
+
+// Whether to use colors in the output.  Valid values:
+// 'true'/'yes'/1, 'false'/'no'/0, and 'auto'. 'auto' means to use colors if
+// the output is being sent to a terminal and the TERM environment variable is
+// set to a terminal type that supports colors.
+DEFINE_string(benchmark_color, "auto");
+
+// Whether to use tabular format when printing user counters to the console.
+// Valid values: 'true'/'yes'/1, 'false'/'no'/0.  Defaults to false.
+DEFINE_bool(benchmark_counters_tabular, false);
+
+// The level of verbose logging to output
+DEFINE_int32(v, 0);
 
 namespace benchmark {
 
 namespace internal {
 
+// FIXME: wouldn't LTO mess this up?
 void UseCharPointer(char const volatile*) {}
 
-// NOTE: This is a dummy "mutex" type used to denote the actual mutex
-// returned by GetBenchmarkLock(). This is only used to placate the thread
-// safety warnings by giving the return of GetBenchmarkLock() a name.
-struct CAPABILITY("mutex") BenchmarkLockType {};
-BenchmarkLockType BenchmarkLockVar;
-
-} // end namespace internal
-
-inline Mutex& RETURN_CAPABILITY(::benchmark::internal::BenchmarkLockVar)
-GetBenchmarkLock()
-{
-  static Mutex lock;
-  return lock;
-}
-
-namespace {
-
-bool IsZero(double n) {
-    return std::abs(n) < std::numeric_limits<double>::epsilon();
-}
-
-// For non-dense Range, intermediate values are powers of kRangeMultiplier.
-static const int kRangeMultiplier = 8;
-static const size_t kMaxIterations = 1000000000;
-
-bool running_benchmark = false;
-
-// Global variable so that a benchmark can cause a little extra printing
-std::string* GetReportLabel() {
-    static std::string label GUARDED_BY(GetBenchmarkLock());
-    return &label;
-}
-
-// Global variable so that a benchmark can report an error as a human readable
-// string. If error_message is null no error occurred.
-#if defined(_MSC_VER) && _MSC_VER <= 1800
-typedef char* error_message_type;
-#else
-typedef const char* error_message_type;
-#endif
-
-static std::atomic<error_message_type> error_message = ATOMIC_VAR_INIT(nullptr);
-
-// TODO(ericwf): support MallocCounter.
-//static benchmark::MallocCounter *benchmark_mc;
-
-struct ThreadStats {
-    ThreadStats() : bytes_processed(0), items_processed(0), complexity_n(0) {}
-    int64_t bytes_processed;
-    int64_t items_processed;
-    int  complexity_n;
-};
-
-// Timer management class
-class TimerManager {
- public:
-  TimerManager(int num_threads, Notification* done)
-      : num_threads_(num_threads),
-        running_threads_(num_threads),
-        done_(done),
-        running_(false),
-        real_time_used_(0),
-        cpu_time_used_(0),
-        manual_time_used_(0),
-        num_finalized_(0),
-        phase_number_(0),
-        entered_(0)
-  {
-  }
-
-  // Called by each thread
-  void StartTimer() EXCLUDES(lock_) {
-    bool last_thread = false;
-    {
-      MutexLock ml(lock_);
-      last_thread = Barrier(ml);
-      if (last_thread) {
-        CHECK(!running_) << "Called StartTimer when timer is already running";
-        running_ = true;
-        start_real_time_ = walltime::Now();
-        start_cpu_time_ = MyCPUUsage() + ChildrenCPUUsage();
-       }
-     }
-     if (last_thread) {
-       phase_condition_.notify_all();
-     }
-  }
-
-  // Called by each thread
-  void StopTimer() EXCLUDES(lock_) {
-    bool last_thread = false;
-    {
-      MutexLock ml(lock_);
-      last_thread = Barrier(ml);
-      if (last_thread) {
-        CHECK(running_) << "Called StopTimer when timer is already stopped";
-        InternalStop();
-      }
-    }
-    if (last_thread) {
-      phase_condition_.notify_all();
-    }
-  }
-
-  // Called by each thread
-  void SetIterationTime(double seconds) EXCLUDES(lock_) {
-    bool last_thread = false;
-    {
-      MutexLock ml(lock_);
-      last_thread = Barrier(ml);
-      if (last_thread) {
-        manual_time_used_ += seconds;
-      }
-    }
-    if (last_thread) {
-      phase_condition_.notify_all();
-    }
-  }
-
-  // Called by each thread
-  void Finalize() EXCLUDES(lock_) {
-    MutexLock l(lock_);
-    num_finalized_++;
-    if (num_finalized_ == num_threads_) {
-      CHECK(!running_) <<
-        "The timer should be stopped before the timer is finalized";
-      done_->Notify();
-    }
-  }
-
-  void RemoveErroredThread() EXCLUDES(lock_) {
-    MutexLock ml(lock_);
-    int last_thread = --running_threads_ == 0;
-    if (last_thread && running_)
-      InternalStop();
-    else if (!last_thread)
-      phase_condition_.notify_all();
-  }
-
-  // REQUIRES: timer is not running
-  double real_time_used() EXCLUDES(lock_) {
-    MutexLock l(lock_);
-    CHECK(!running_);
-    return real_time_used_;
-  }
-
-  // REQUIRES: timer is not running
-  double cpu_time_used() EXCLUDES(lock_) {
-    MutexLock l(lock_);
-    CHECK(!running_);
-    return cpu_time_used_;
-  }
-
-  // REQUIRES: timer is not running
-  double manual_time_used() EXCLUDES(lock_) {
-    MutexLock l(lock_);
-    CHECK(!running_);
-    return manual_time_used_;
-  }
-
- private:
-  Mutex lock_;
-  Condition phase_condition_;
-  int num_threads_;
-  int running_threads_;
-  Notification* done_;
-
-  bool running_;                // Is the timer running
-  double start_real_time_;      // If running_
-  double start_cpu_time_;       // If running_
-
-  // Accumulated time so far (does not contain current slice if running_)
-  double real_time_used_;
-  double cpu_time_used_;
-  // Manually set iteration time. User sets this with SetIterationTime(seconds).
-  double manual_time_used_;
-
-  // How many threads have called Finalize()
-  int num_finalized_;
-
-  // State for barrier management
-  int phase_number_;
-  int entered_;         // Number of threads that have entered this barrier
-
-  void InternalStop() REQUIRES(lock_) {
-    CHECK(running_);
-    running_ = false;
-    real_time_used_ += walltime::Now() - start_real_time_;
-    cpu_time_used_ += ((MyCPUUsage() + ChildrenCPUUsage())
-                       - start_cpu_time_);
-  }
-
-  // Enter the barrier and wait until all other threads have also
-  // entered the barrier.  Returns iff this is the last thread to
-  // enter the barrier.
-  bool Barrier(MutexLock& ml) REQUIRES(lock_) {
-    CHECK_LT(entered_, running_threads_);
-    entered_++;
-    if (entered_ < running_threads_) {
-      // Wait for all threads to enter
-      int phase_number_cp = phase_number_;
-      auto cb = [this, phase_number_cp]() {
-        return this->phase_number_ > phase_number_cp ||
-               entered_ == running_threads_; // A thread has aborted in error
-      };
-      phase_condition_.wait(ml.native_handle(), cb);
-      if (phase_number_ > phase_number_cp)
-        return false;
-      // else (running_threads_ == entered_) and we are the last thread.
-    }
-    // Last thread has reached the barrier
-    phase_number_++;
-    entered_ = 0;
-    return true;
-  }
-};
-
-// TimerManager for current run.
-static std::unique_ptr<TimerManager> timer_manager = nullptr;
-
-} // end namespace
-
-namespace internal {
-
-// Information kept per benchmark we may want to run
-struct Benchmark::Instance {
-  std::string    name;
-  Benchmark*     benchmark;
-  bool           has_arg1;
-  int            arg1;
-  bool           has_arg2;
-  int            arg2;
-  TimeUnit       time_unit;
-  int            range_multiplier;
-  bool           use_real_time;
-  bool           use_manual_time;
-  BigO           complexity;
-  BigOFunc*      complexity_lambda;
-  bool           last_benchmark_instance;
-  int            repetitions;
-  double         min_time;
-  int            threads;    // Number of concurrent threads to use
-  bool           multithreaded;  // Is benchmark multi-threaded?
-};
-
-// Class for managing registered benchmarks.  Note that each registered
-// benchmark identifies a family of related benchmarks to run.
-class BenchmarkFamilies {
- public:
-  static BenchmarkFamilies* GetInstance();
-
-  // Registers a benchmark family and returns the index assigned to it.
-  size_t AddBenchmark(std::unique_ptr<Benchmark> family);
-
-  // Extract the list of benchmark instances that match the specified
-  // regular expression.
-  bool FindBenchmarks(const std::string& re,
-                      std::vector<Benchmark::Instance>* benchmarks);
- private:
-  BenchmarkFamilies() {}
-
-  std::vector<std::unique_ptr<Benchmark>> families_;
-  Mutex mutex_;
-};
-
-
-class BenchmarkImp {
-public:
-  explicit BenchmarkImp(const char* name);
-  ~BenchmarkImp();
-
-  void Arg(int x);
-  void Unit(TimeUnit unit);
-  void Range(int start, int limit);
-  void DenseRange(int start, int limit);
-  void ArgPair(int start, int limit);
-  void RangePair(int lo1, int hi1, int lo2, int hi2);
-  void RangeMultiplier(int multiplier);
-  void MinTime(double n);
-  void Repetitions(int n);
-  void UseRealTime();
-  void UseManualTime();
-  void Complexity(BigO complexity);
-  void ComplexityLambda(BigOFunc* complexity);
-  void Threads(int t);
-  void ThreadRange(int min_threads, int max_threads);
-  void ThreadPerCpu();
-  void SetName(const char* name);
-
-  static void AddRange(std::vector<int>* dst, int lo, int hi, int mult);
-
-private:
-  friend class BenchmarkFamilies;
-
-  std::string name_;
-  int arg_count_;
-  std::vector< std::pair<int, int> > args_;  // Args for all benchmark runs
-  TimeUnit time_unit_;
-  int range_multiplier_;
-  double min_time_;
-  int repetitions_;
-  bool use_real_time_;
-  bool use_manual_time_;
-  BigO complexity_;
-  BigOFunc* complexity_lambda_;
-  std::vector<int> thread_counts_;
-
-  BenchmarkImp& operator=(BenchmarkImp const&);
-};
-
-BenchmarkFamilies* BenchmarkFamilies::GetInstance() {
-  static BenchmarkFamilies instance;
-  return &instance;
-}
-
-
-size_t BenchmarkFamilies::AddBenchmark(std::unique_ptr<Benchmark> family) {
-  MutexLock l(mutex_);
-  size_t index = families_.size();
-  families_.push_back(std::move(family));
-  return index;
-}
-
-bool BenchmarkFamilies::FindBenchmarks(
-    const std::string& spec,
-    std::vector<Benchmark::Instance>* benchmarks) {
-  // Make regular expression out of command-line flag
-  std::string error_msg;
-  Regex re;
-  if (!re.Init(spec, &error_msg)) {
-    std::cerr << "Could not compile benchmark re: " << error_msg << std::endl;
-    return false;
-  }
-
-  // Special list of thread counts to use when none are specified
-  std::vector<int> one_thread;
-  one_thread.push_back(1);
-
-  MutexLock l(mutex_);
-  for (std::unique_ptr<Benchmark>& bench_family : families_) {
-    // Family was deleted or benchmark doesn't match
-    if (!bench_family) continue;
-    BenchmarkImp* family = bench_family->imp_;
-
-    if (family->arg_count_ == -1) {
-      family->arg_count_ = 0;
-      family->args_.emplace_back(-1, -1);
-    }
-    for (auto const& args : family->args_) {
-      const std::vector<int>* thread_counts =
-        (family->thread_counts_.empty()
-         ? &one_thread
-         : &family->thread_counts_);
-      for (int num_threads : *thread_counts) {
-
-        Benchmark::Instance instance;
-        instance.name = family->name_;
-        instance.benchmark = bench_family.get();
-        instance.has_arg1 = family->arg_count_ >= 1;
-        instance.arg1 = args.first;
-        instance.has_arg2 = family->arg_count_ == 2;
-        instance.arg2 = args.second;
-        instance.time_unit = family->time_unit_;
-        instance.range_multiplier = family->range_multiplier_;
-        instance.min_time = family->min_time_;
-        instance.repetitions = family->repetitions_;
-        instance.use_real_time = family->use_real_time_;
-        instance.use_manual_time = family->use_manual_time_;
-        instance.complexity = family->complexity_;
-        instance.complexity_lambda = family->complexity_lambda_;
-        instance.threads = num_threads;
-        instance.multithreaded = !(family->thread_counts_.empty());
-
-        // Add arguments to instance name
-        if (family->arg_count_ >= 1) {
-          AppendHumanReadable(instance.arg1, &instance.name);
-        }
-        if (family->arg_count_ >= 2) {
-          AppendHumanReadable(instance.arg2, &instance.name);
-        }
-        if (!IsZero(family->min_time_)) {
-          instance.name +=  StringPrintF("/min_time:%0.3f",  family->min_time_);
-        }
-        if (family->repetitions_ != 0) {
-          instance.name +=  StringPrintF("/repeats:%d",  family->repetitions_);
-        }
-        if (family->use_manual_time_) {
-          instance.name +=  "/manual_time";
-        } else if (family->use_real_time_) {
-          instance.name +=  "/real_time";
-        }
-
-        // Add the number of threads used to the name
-        if (!family->thread_counts_.empty()) {
-          instance.name += StringPrintF("/threads:%d", instance.threads);
-        }
-
-        if (re.Match(instance.name)) {
-          instance.last_benchmark_instance = (args == family->args_.back());
-          benchmarks->push_back(instance);
-        }
-      }
-    }
-  }
-  return true;
-}
-
-BenchmarkImp::BenchmarkImp(const char* name)
-    : name_(name), arg_count_(-1), time_unit_(kNanosecond),
-      range_multiplier_(kRangeMultiplier), min_time_(0.0), repetitions_(0),
-      use_real_time_(false), use_manual_time_(false),
-      complexity_(oNone) {
-}
-
-BenchmarkImp::~BenchmarkImp() {
-}
-
-void BenchmarkImp::Arg(int x) {
-  CHECK(arg_count_ == -1 || arg_count_ == 1);
-  arg_count_ = 1;
-  args_.emplace_back(x, -1);
-}
-
-void BenchmarkImp::Unit(TimeUnit unit) {
-  time_unit_ = unit;
-}
-
-void BenchmarkImp::Range(int start, int limit) {
-  CHECK(arg_count_ == -1 || arg_count_ == 1);
-  arg_count_ = 1;
-  std::vector<int> arglist;
-  AddRange(&arglist, start, limit, range_multiplier_);
-
-  for (int i : arglist) {
-    args_.emplace_back(i, -1);
-  }
-}
-
-void BenchmarkImp::DenseRange(int start, int limit) {
-  CHECK(arg_count_ == -1 || arg_count_ == 1);
-  arg_count_ = 1;
-  CHECK_GE(start, 0);
-  CHECK_LE(start, limit);
-  for (int arg = start; arg <= limit; arg++) {
-    args_.emplace_back(arg, -1);
-  }
-}
-
-void BenchmarkImp::ArgPair(int x, int y) {
-  CHECK(arg_count_ == -1 || arg_count_ == 2);
-  arg_count_ = 2;
-  args_.emplace_back(x, y);
-}
-
-void BenchmarkImp::RangePair(int lo1, int hi1, int lo2, int hi2) {
-  CHECK(arg_count_ == -1 || arg_count_ == 2);
-  arg_count_ = 2;
-  std::vector<int> arglist1, arglist2;
-  AddRange(&arglist1, lo1, hi1, range_multiplier_);
-  AddRange(&arglist2, lo2, hi2, range_multiplier_);
-
-  for (int i : arglist1) {
-    for (int j : arglist2) {
-      args_.emplace_back(i, j);
-    }
-  }
-}
-
-void BenchmarkImp::RangeMultiplier(int multiplier) {
-  CHECK(multiplier > 1);
-  range_multiplier_ = multiplier;
-}
-
-void BenchmarkImp::MinTime(double t) {
-  CHECK(t > 0.0);
-  min_time_ = t;
-}
-
-
-void BenchmarkImp::Repetitions(int n) {
-  CHECK(n > 0);
-  repetitions_ = n;
-}
-
-void BenchmarkImp::UseRealTime() {
-  CHECK(!use_manual_time_) << "Cannot set UseRealTime and UseManualTime simultaneously.";
-  use_real_time_ = true;
-}
-
-void BenchmarkImp::UseManualTime() {
-  CHECK(!use_real_time_) << "Cannot set UseRealTime and UseManualTime simultaneously.";
-  use_manual_time_ = true;
-}
-
-void BenchmarkImp::Complexity(BigO complexity){
-  complexity_ = complexity;
-}
-
-void BenchmarkImp::ComplexityLambda(BigOFunc* complexity) {
-  complexity_lambda_ = complexity;
-}
-
-void BenchmarkImp::Threads(int t) {
-  CHECK_GT(t, 0);
-  thread_counts_.push_back(t);
-}
-
-void BenchmarkImp::ThreadRange(int min_threads, int max_threads) {
-  CHECK_GT(min_threads, 0);
-  CHECK_GE(max_threads, min_threads);
-
-  AddRange(&thread_counts_, min_threads, max_threads, 2);
-}
-
-void BenchmarkImp::ThreadPerCpu() {
-  static int num_cpus = NumCPUs();
-  thread_counts_.push_back(num_cpus);
-}
-
-void BenchmarkImp::SetName(const char* name) {
-  name_ = name;
-}
-
-void BenchmarkImp::AddRange(std::vector<int>* dst, int lo, int hi, int mult) {
-  CHECK_GE(lo, 0);
-  CHECK_GE(hi, lo);
-  CHECK_GE(mult, 2);
+}  // namespace internal
 
-  // Add "lo"
-  dst->push_back(lo);
-
-  static const int kint32max = std::numeric_limits<int32_t>::max();
-
-  // Now space out the benchmarks in multiples of "mult"
-  for (int32_t i = 1; i < kint32max/mult; i *= mult) {
-    if (i >= hi) break;
-    if (i > lo) {
-      dst->push_back(i);
-    }
-  }
-  // Add "hi" (if different from "lo")
-  if (hi != lo) {
-    dst->push_back(hi);
-  }
-}
-
-Benchmark::Benchmark(const char* name)
-    : imp_(new BenchmarkImp(name))
-{
-}
-
-Benchmark::~Benchmark()  {
-  delete imp_;
-}
-
-Benchmark::Benchmark(Benchmark const& other)
-  : imp_(new BenchmarkImp(*other.imp_))
-{
-}
-
-Benchmark* Benchmark::Arg(int x) {
-  imp_->Arg(x);
-  return this;
-}
-
-Benchmark* Benchmark::Unit(TimeUnit unit) {
-  imp_->Unit(unit);
-  return this;
-}
-
-Benchmark* Benchmark::Range(int start, int limit) {
-  imp_->Range(start, limit);
-  return this;
-}
-
-Benchmark* Benchmark::DenseRange(int start, int limit) {
-  imp_->DenseRange(start, limit);
-  return this;
-}
-
-Benchmark* Benchmark::ArgPair(int x, int y) {
-  imp_->ArgPair(x, y);
-  return this;
-}
-
-Benchmark* Benchmark::RangePair(int lo1, int hi1, int lo2, int hi2) {
-  imp_->RangePair(lo1, hi1, lo2, hi2);
-  return this;
-}
-
-Benchmark* Benchmark::Apply(void (*custom_arguments)(Benchmark* benchmark)) {
-  custom_arguments(this);
-  return this;
-}
-
-Benchmark* Benchmark::RangeMultiplier(int multiplier) {
-  imp_->RangeMultiplier(multiplier);
-  return this;
-}
-
-
-Benchmark* Benchmark::Repetitions(int t) {
-  imp_->Repetitions(t);
-  return this;
-}
-
-Benchmark* Benchmark::MinTime(double t) {
-  imp_->MinTime(t);
-  return this;
-}
-
-Benchmark* Benchmark::UseRealTime() {
-  imp_->UseRealTime();
-  return this;
-}
-
-Benchmark* Benchmark::UseManualTime() {
-  imp_->UseManualTime();
-  return this;
-}
-
-Benchmark* Benchmark::Complexity(BigO complexity) {
-  imp_->Complexity(complexity);
-  return this;
-}
-
-Benchmark* Benchmark::Complexity(BigOFunc* complexity) {
-  imp_->Complexity(oLambda);
-  imp_->ComplexityLambda(complexity);
-  return this;
-}
-
-Benchmark* Benchmark::Threads(int t) {
-  imp_->Threads(t);
-  return this;
-}
-
-Benchmark* Benchmark::ThreadRange(int min_threads, int max_threads) {
-  imp_->ThreadRange(min_threads, max_threads);
-  return this;
-}
-
-Benchmark* Benchmark::ThreadPerCpu() {
-  imp_->ThreadPerCpu();
-  return this;
-}
-
-void Benchmark::SetName(const char* name) {
-  imp_->SetName(name);
-}
-
-void FunctionBenchmark::Run(State& st) {
-  func_(st);
-}
-
-} // end namespace internal
-
-namespace {
-
-// Execute one thread of benchmark b for the specified number of iterations.
-// Adds the stats collected for the thread into *total.
-void RunInThread(const benchmark::internal::Benchmark::Instance* b,
-                 size_t iters, int thread_id,
-                 ThreadStats* total) EXCLUDES(GetBenchmarkLock()) {
-  State st(iters, b->has_arg1, b->arg1, b->has_arg2, b->arg2, thread_id, b->threads);
-  b->benchmark->Run(st);
-  CHECK(st.iterations() == st.max_iterations) <<
-    "Benchmark returned before State::KeepRunning() returned false!";
-  {
-    MutexLock l(GetBenchmarkLock());
-    total->bytes_processed += st.bytes_processed();
-    total->items_processed += st.items_processed();
-    total->complexity_n += st.complexity_length_n();
-  }
-
-  timer_manager->Finalize();
-}
-
-void RunBenchmark(const benchmark::internal::Benchmark::Instance& b,
-                  BenchmarkReporter* br,
-                  std::vector<BenchmarkReporter::Run>& complexity_reports)
-  EXCLUDES(GetBenchmarkLock()) {
-  size_t iters = 1;
-
-  std::vector<BenchmarkReporter::Run> reports;
-
-  std::vector<std::thread> pool;
-  if (b.multithreaded)
-    pool.resize(b.threads);
-
-  const int repeats = b.repetitions != 0 ? b.repetitions
-                                         : FLAGS_benchmark_repetitions;
-  for (int i = 0; i < repeats; i++) {
-    std::string mem;
-    for (;;) {
-      // Try benchmark
-      VLOG(2) << "Running " << b.name << " for " << iters << "\n";
-
-      {
-        MutexLock l(GetBenchmarkLock());
-        GetReportLabel()->clear();
-      }
-      error_message = nullptr;
-
-      Notification done;
-      timer_manager = std::unique_ptr<TimerManager>(new TimerManager(b.threads, &done));
-
-      ThreadStats total;
-      running_benchmark = true;
-      if (b.multithreaded) {
-        // If this is out first iteration of the while(true) loop then the
-        // threads haven't been started and can't be joined. Otherwise we need
-        // to join the thread before replacing them.
-        for (std::thread& thread : pool) {
-          if (thread.joinable())
-            thread.join();
-        }
-        for (std::size_t ti = 0; ti < pool.size(); ++ti) {
-            pool[ti] = std::thread(&RunInThread, &b, iters, static_cast<int>(ti), &total);
-        }
-      } else {
-        // Run directly in this thread
-        RunInThread(&b, iters, 0, &total);
-      }
-      done.WaitForNotification();
-      running_benchmark = false;
-
-      const double cpu_accumulated_time = timer_manager->cpu_time_used();
-      const double real_accumulated_time = timer_manager->real_time_used();
-      const double manual_accumulated_time = timer_manager->manual_time_used();
-      timer_manager.reset();
-
-      VLOG(2) << "Ran in " << cpu_accumulated_time << "/"
-              << real_accumulated_time << "\n";
-
-      // Base decisions off of real time if requested by this benchmark.
-      double seconds = cpu_accumulated_time;
-      if (b.use_manual_time) {
-          seconds = manual_accumulated_time;
-      } else if (b.use_real_time) {
-          seconds = real_accumulated_time;
-      }
-
-      std::string label;
-      {
-        MutexLock l(GetBenchmarkLock());
-        label = *GetReportLabel();
-      }
-      error_message_type error_msg = error_message;
-
-      const double min_time = !IsZero(b.min_time) ? b.min_time
-                                                  : FLAGS_benchmark_min_time;
-
-      // If this was the first run, was elapsed time or cpu time large enough?
-      // If this is not the first run, go with the current value of iter.
-      if ((i > 0) || (error_msg != nullptr) ||
-          (iters >= kMaxIterations) ||
-          (seconds >= min_time) ||
-          (real_accumulated_time >= 5*min_time)) {
-
-        // Create report about this benchmark run.
-        BenchmarkReporter::Run report;
-        report.benchmark_name = b.name;
-        report.error_occurred = error_msg != nullptr;
-        report.error_message = error_msg != nullptr ? error_msg : "";
-        report.report_label = label;
-        // Report the total iterations across all threads.
-        report.iterations = static_cast<int64_t>(iters) * b.threads;
-        report.time_unit = b.time_unit;
-
-        if (!report.error_occurred) {
-          double bytes_per_second = 0;
-          if (total.bytes_processed > 0 && seconds > 0.0) {
-            bytes_per_second = (total.bytes_processed / seconds);
-          }
-          double items_per_second = 0;
-          if (total.items_processed > 0 && seconds > 0.0) {
-            items_per_second = (total.items_processed / seconds);
-          }
-
-          if (b.use_manual_time) {
-            report.real_accumulated_time = manual_accumulated_time;
-          } else {
-            report.real_accumulated_time = real_accumulated_time;
-          }
-          report.cpu_accumulated_time = cpu_accumulated_time;
-          report.bytes_per_second = bytes_per_second;
-          report.items_per_second = items_per_second;
-          report.complexity_n = total.complexity_n;
-          report.complexity = b.complexity;
-          report.complexity_lambda = b.complexity_lambda;
-          if(report.complexity != oNone)
-            complexity_reports.push_back(report);
-        }
-
-        reports.push_back(report);
-        break;
-      }
-
-      // See how much iterations should be increased by
-      // Note: Avoid division by zero with max(seconds, 1ns).
-      double multiplier = min_time * 1.4 / std::max(seconds, 1e-9);
-      // If our last run was at least 10% of FLAGS_benchmark_min_time then we
-      // use the multiplier directly. Otherwise we use at most 10 times
-      // expansion.
-      // NOTE: When the last run was at least 10% of the min time the max
-      // expansion should be 14x.
-      bool is_significant = (seconds / min_time) > 0.1;
-      multiplier = is_significant ? multiplier : std::min(10.0, multiplier);
-      if (multiplier <= 1.0) multiplier = 2.0;
-      double next_iters = std::max(multiplier * iters, iters + 1.0);
-      if (next_iters > kMaxIterations) {
-        next_iters = kMaxIterations;
-      }
-      VLOG(3) << "Next iters: " << next_iters << ", " << multiplier << "\n";
-      iters = static_cast<int>(next_iters + 0.5);
-    }
-  }
-  std::vector<BenchmarkReporter::Run> additional_run_stats = ComputeStats(reports);
-  reports.insert(reports.end(), additional_run_stats.begin(),
-                 additional_run_stats.end());
-
-  if((b.complexity != oNone) && b.last_benchmark_instance) {
-    additional_run_stats = ComputeBigO(complexity_reports);
-    reports.insert(reports.end(), additional_run_stats.begin(),
-                   additional_run_stats.end());
-    complexity_reports.clear();
-  }
-
-  br->ReportRuns(reports);
-
-  if (b.multithreaded) {
-    for (std::thread& thread : pool)
-      thread.join();
-  }
-}
-
-}  // namespace
-
-State::State(size_t max_iters, bool has_x, int x, bool has_y, int y,
-             int thread_i, int n_threads)
-    : started_(false), finished_(false), total_iterations_(0),
-      has_range_x_(has_x), range_x_(x),
-      has_range_y_(has_y), range_y_(y),
-      bytes_processed_(0), items_processed_(0),
-      complexity_n_(0),
+State::State(IterationCount max_iters, const std::vector<int64_t>& ranges,
+             int thread_i, int n_threads, internal::ThreadTimer* timer,
+             internal::ThreadManager* manager)
+    : total_iterations_(0),
+      batch_leftover_(0),
+      max_iterations(max_iters),
+      started_(false),
+      finished_(false),
       error_occurred_(false),
+      range_(ranges),
+      complexity_n_(0),
+      counters(),
       thread_index(thread_i),
       threads(n_threads),
-      max_iterations(max_iters)
-{
-    CHECK(max_iterations != 0) << "At least one iteration must be run";
-    CHECK_LT(thread_index, threads) << "thread_index must be less than threads";
+      timer_(timer),
+      manager_(manager) {
+  CHECK(max_iterations != 0) << "At least one iteration must be run";
+  CHECK_LT(thread_index, threads) << "thread_index must be less than threads";
+
+  // Note: The use of offsetof below is technically undefined until C++17
+  // because State is not a standard layout type. However, all compilers
+  // currently provide well-defined behavior as an extension (which is
+  // demonstrated since constexpr evaluation must diagnose all undefined
+  // behavior). However, GCC and Clang also warn about this use of offsetof,
+  // which must be suppressed.
+#if defined(__INTEL_COMPILER)
+#pragma warning push
+#pragma warning(disable : 1875)
+#elif defined(__GNUC__)
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Winvalid-offsetof"
+#endif
+  // Offset tests to ensure commonly accessed data is on the first cache line.
+  const int cache_line_size = 64;
+  static_assert(offsetof(State, error_occurred_) <=
+                    (cache_line_size - sizeof(error_occurred_)),
+                "");
+#if defined(__INTEL_COMPILER)
+#pragma warning pop
+#elif defined(__GNUC__)
+#pragma GCC diagnostic pop
+#endif
 }
 
 void State::PauseTiming() {
   // Add in time accumulated so far
-  CHECK(running_benchmark);
   CHECK(started_ && !finished_ && !error_occurred_);
-  timer_manager->StopTimer();
+  timer_->StopTimer();
 }
 
 void State::ResumeTiming() {
-  CHECK(running_benchmark);
   CHECK(started_ && !finished_ && !error_occurred_);
-  timer_manager->StartTimer();
+  timer_->StartTimer();
 }
 
 void State::SkipWithError(const char* msg) {
   CHECK(msg);
   error_occurred_ = true;
-  error_message_type expected_no_error_msg = nullptr;
-  error_message.compare_exchange_weak(expected_no_error_msg,
-    const_cast<error_message_type>(msg));
-  started_ = finished_ = true;
-  total_iterations_ = max_iterations;
-  timer_manager->RemoveErroredThread();
+  {
+    MutexLock l(manager_->GetBenchmarkMutex());
+    if (manager_->results.has_error_ == false) {
+      manager_->results.error_message_ = msg;
+      manager_->results.has_error_ = true;
+    }
+  }
+  total_iterations_ = 0;
+  if (timer_->running()) timer_->StopTimer();
 }
 
-void State::SetIterationTime(double seconds)
-{
-  CHECK(running_benchmark);
-  timer_manager->SetIterationTime(seconds);
+void State::SetIterationTime(double seconds) {
+  timer_->SetIterationTime(seconds);
 }
 
 void State::SetLabel(const char* label) {
-  CHECK(running_benchmark);
-  MutexLock l(GetBenchmarkLock());
-  *GetReportLabel() = label;
+  MutexLock l(manager_->GetBenchmarkMutex());
+  manager_->results.report_label_ = label;
+}
+
+void State::StartKeepRunning() {
+  CHECK(!started_ && !finished_);
+  started_ = true;
+  total_iterations_ = error_occurred_ ? 0 : max_iterations;
+  manager_->StartStopBarrier();
+  if (!error_occurred_) ResumeTiming();
+}
+
+void State::FinishKeepRunning() {
+  CHECK(started_ && (!finished_ || error_occurred_));
+  if (!error_occurred_) {
+    PauseTiming();
+  }
+  // Total iterations has now wrapped around past 0. Fix this.
+  total_iterations_ = 0;
+  finished_ = true;
+  manager_->StartStopBarrier();
 }
 
 namespace internal {
 namespace {
 
-void RunMatchingBenchmarks(const std::vector<Benchmark::Instance>& benchmarks,
-                           BenchmarkReporter* reporter) {
-  CHECK(reporter != nullptr);
+void RunBenchmarks(const std::vector<BenchmarkInstance>& benchmarks,
+                   BenchmarkReporter* display_reporter,
+                   BenchmarkReporter* file_reporter) {
+  // Note the file_reporter can be null.
+  CHECK(display_reporter != nullptr);
 
   // Determine the width of the name field using a minimum width of 10.
-  bool has_repetitions = FLAGS_benchmark_repetitions > 1;
+  bool might_have_aggregates = FLAGS_benchmark_repetitions > 1;
   size_t name_field_width = 10;
-  for (const Benchmark::Instance& benchmark : benchmarks) {
+  size_t stat_field_width = 0;
+  for (const BenchmarkInstance& benchmark : benchmarks) {
     name_field_width =
-        std::max<size_t>(name_field_width, benchmark.name.size());
-    has_repetitions |= benchmark.repetitions > 1;
+        std::max<size_t>(name_field_width, benchmark.name.str().size());
+    might_have_aggregates |= benchmark.repetitions > 1;
+
+    for (const auto& Stat : *benchmark.statistics)
+      stat_field_width = std::max<size_t>(stat_field_width, Stat.name_.size());
   }
-  if (has_repetitions)
-    name_field_width += std::strlen("_stddev");
+  if (might_have_aggregates) name_field_width += 1 + stat_field_width;
 
   // Print header here
   BenchmarkReporter::Context context;
-  context.num_cpus = NumCPUs();
-  context.mhz_per_cpu = CyclesPerSecond() / 1000000.0f;
-
-  context.cpu_scaling_enabled = CpuScalingEnabled();
   context.name_field_width = name_field_width;
 
-  // Keep track of runing times of all instances of current benchmark
+  // Keep track of running times of all instances of current benchmark
   std::vector<BenchmarkReporter::Run> complexity_reports;
 
-  if (reporter->ReportContext(context)) {
+  // We flush streams after invoking reporter methods that write to them. This
+  // ensures users get timely updates even when streams are not line-buffered.
+  auto flushStreams = [](BenchmarkReporter* reporter) {
+    if (!reporter) return;
+    std::flush(reporter->GetOutputStream());
+    std::flush(reporter->GetErrorStream());
+  };
+
+  if (display_reporter->ReportContext(context) &&
+      (!file_reporter || file_reporter->ReportContext(context))) {
+    flushStreams(display_reporter);
+    flushStreams(file_reporter);
+
     for (const auto& benchmark : benchmarks) {
-      RunBenchmark(benchmark, reporter, complexity_reports);
+      RunResults run_results = RunBenchmark(benchmark, &complexity_reports);
+
+      auto report = [&run_results](BenchmarkReporter* reporter,
+                                   bool report_aggregates_only) {
+        assert(reporter);
+        // If there are no aggregates, do output non-aggregates.
+        report_aggregates_only &= !run_results.aggregates_only.empty();
+        if (!report_aggregates_only)
+          reporter->ReportRuns(run_results.non_aggregates);
+        if (!run_results.aggregates_only.empty())
+          reporter->ReportRuns(run_results.aggregates_only);
+      };
+
+      report(display_reporter, run_results.display_report_aggregates_only);
+      if (file_reporter)
+        report(file_reporter, run_results.file_report_aggregates_only);
+
+      flushStreams(display_reporter);
+      flushStreams(file_reporter);
     }
   }
+  display_reporter->Finalize();
+  if (file_reporter) file_reporter->Finalize();
+  flushStreams(display_reporter);
+  flushStreams(file_reporter);
 }
 
-std::unique_ptr<BenchmarkReporter> GetDefaultReporter() {
+// Disable deprecated warnings temporarily because we need to reference
+// CSVReporter but don't want to trigger -Werror=deprecated-declarations
+#ifdef __GNUC__
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wdeprecated-declarations"
+#endif
+
+std::unique_ptr<BenchmarkReporter> CreateReporter(
+    std::string const& name, ConsoleReporter::OutputOptions output_opts) {
   typedef std::unique_ptr<BenchmarkReporter> PtrType;
-  if (FLAGS_benchmark_format == "console") {
-    return PtrType(new ConsoleReporter);
-  } else if (FLAGS_benchmark_format == "json") {
+  if (name == "console") {
+    return PtrType(new ConsoleReporter(output_opts));
+  } else if (name == "json") {
     return PtrType(new JSONReporter);
-  } else if (FLAGS_benchmark_format == "csv") {
+  } else if (name == "csv") {
     return PtrType(new CSVReporter);
   } else {
-    std::cerr << "Unexpected format: '" << FLAGS_benchmark_format << "'\n";
+    std::cerr << "Unexpected format: '" << name << "'\n";
     std::exit(1);
   }
 }
 
-} // end namespace
-} // end namespace internal
+#ifdef __GNUC__
+#pragma GCC diagnostic pop
+#endif
+
+}  // end namespace
+
+bool IsZero(double n) {
+  return std::abs(n) < std::numeric_limits<double>::epsilon();
+}
+
+ConsoleReporter::OutputOptions GetOutputOptions(bool force_no_color) {
+  int output_opts = ConsoleReporter::OO_Defaults;
+  auto is_benchmark_color = [force_no_color]() -> bool {
+    if (force_no_color) {
+      return false;
+    }
+    if (FLAGS_benchmark_color == "auto") {
+      return IsColorTerminal();
+    }
+    return IsTruthyFlagValue(FLAGS_benchmark_color);
+  };
+  if (is_benchmark_color()) {
+    output_opts |= ConsoleReporter::OO_Color;
+  } else {
+    output_opts &= ~ConsoleReporter::OO_Color;
+  }
+  if (FLAGS_benchmark_counters_tabular) {
+    output_opts |= ConsoleReporter::OO_Tabular;
+  } else {
+    output_opts &= ~ConsoleReporter::OO_Tabular;
+  }
+  return static_cast<ConsoleReporter::OutputOptions>(output_opts);
+}
+
+}  // end namespace internal
 
 size_t RunSpecifiedBenchmarks() {
-  return RunSpecifiedBenchmarks(nullptr);
+  return RunSpecifiedBenchmarks(nullptr, nullptr);
+}
+
+size_t RunSpecifiedBenchmarks(BenchmarkReporter* display_reporter) {
+  return RunSpecifiedBenchmarks(display_reporter, nullptr);
 }
 
-size_t RunSpecifiedBenchmarks(BenchmarkReporter* reporter) {
+size_t RunSpecifiedBenchmarks(BenchmarkReporter* display_reporter,
+                              BenchmarkReporter* file_reporter) {
   std::string spec = FLAGS_benchmark_filter;
   if (spec.empty() || spec == "all")
     spec = ".";  // Regexp that matches all benchmarks
 
-  std::vector<internal::Benchmark::Instance> benchmarks;
-  auto families = internal::BenchmarkFamilies::GetInstance();
-  if (!families->FindBenchmarks(spec, &benchmarks)) return 0;
+  // Setup the reporters
+  std::ofstream output_file;
+  std::unique_ptr<BenchmarkReporter> default_display_reporter;
+  std::unique_ptr<BenchmarkReporter> default_file_reporter;
+  if (!display_reporter) {
+    default_display_reporter = internal::CreateReporter(
+        FLAGS_benchmark_format, internal::GetOutputOptions());
+    display_reporter = default_display_reporter.get();
+  }
+  auto& Out = display_reporter->GetOutputStream();
+  auto& Err = display_reporter->GetErrorStream();
+
+  std::string const& fname = FLAGS_benchmark_out;
+  if (fname.empty() && file_reporter) {
+    Err << "A custom file reporter was provided but "
+           "--benchmark_out=<file> was not specified."
+        << std::endl;
+    std::exit(1);
+  }
+  if (!fname.empty()) {
+    output_file.open(fname);
+    if (!output_file.is_open()) {
+      Err << "invalid file name: '" << fname << std::endl;
+      std::exit(1);
+    }
+    if (!file_reporter) {
+      default_file_reporter = internal::CreateReporter(
+          FLAGS_benchmark_out_format, ConsoleReporter::OO_None);
+      file_reporter = default_file_reporter.get();
+    }
+    file_reporter->SetOutputStream(&output_file);
+    file_reporter->SetErrorStream(&output_file);
+  }
+
+  std::vector<internal::BenchmarkInstance> benchmarks;
+  if (!FindBenchmarksInternal(spec, &benchmarks, &Err)) return 0;
+
+  if (benchmarks.empty()) {
+    Err << "Failed to match any benchmarks against regex: " << spec << "\n";
+    return 0;
+  }
 
   if (FLAGS_benchmark_list_tests) {
     for (auto const& benchmark : benchmarks)
-      std::cout <<  benchmark.name << "\n";
+      Out << benchmark.name.str() << "\n";
   } else {
-    std::unique_ptr<BenchmarkReporter> default_reporter;
-    if (!reporter) {
-      default_reporter = internal::GetDefaultReporter();
-      reporter = default_reporter.get();
-    }
-    internal::RunMatchingBenchmarks(benchmarks, reporter);
-    reporter->Finalize();
+    internal::RunBenchmarks(benchmarks, display_reporter, file_reporter);
   }
+
   return benchmarks.size();
 }
 
+void RegisterMemoryManager(MemoryManager* manager) {
+  internal::memory_manager = manager;
+}
+
 namespace internal {
 
 void PrintUsageAndExit() {
@@ -1061,30 +420,45 @@ void PrintUsageAndExit() {
           "          [--benchmark_filter=<regex>]\n"
           "          [--benchmark_min_time=<min_time>]\n"
           "          [--benchmark_repetitions=<num_repetitions>]\n"
+          "          [--benchmark_report_aggregates_only={true|false}]\n"
+          "          [--benchmark_display_aggregates_only={true|false}]\n"
           "          [--benchmark_format=<console|json|csv>]\n"
-          "          [--color_print={true|false}]\n"
+          "          [--benchmark_out=<filename>]\n"
+          "          [--benchmark_out_format=<json|console|csv>]\n"
+          "          [--benchmark_color={auto|true|false}]\n"
+          "          [--benchmark_counters_tabular={true|false}]\n"
           "          [--v=<verbosity>]\n");
   exit(0);
 }
 
 void ParseCommandLineFlags(int* argc, char** argv) {
   using namespace benchmark;
-  for (int i = 1; i < *argc; ++i) {
-    if (
-        ParseBoolFlag(argv[i], "benchmark_list_tests",
+  BenchmarkReporter::Context::executable_name =
+      (argc && *argc > 0) ? argv[0] : "unknown";
+  for (int i = 1; argc && i < *argc; ++i) {
+    if (ParseBoolFlag(argv[i], "benchmark_list_tests",
                       &FLAGS_benchmark_list_tests) ||
-        ParseStringFlag(argv[i], "benchmark_filter",
-                        &FLAGS_benchmark_filter) ||
+        ParseStringFlag(argv[i], "benchmark_filter", &FLAGS_benchmark_filter) ||
         ParseDoubleFlag(argv[i], "benchmark_min_time",
                         &FLAGS_benchmark_min_time) ||
         ParseInt32Flag(argv[i], "benchmark_repetitions",
                        &FLAGS_benchmark_repetitions) ||
-        ParseStringFlag(argv[i], "benchmark_format",
-                        &FLAGS_benchmark_format) ||
-        ParseBoolFlag(argv[i], "color_print",
-                       &FLAGS_color_print) ||
+        ParseBoolFlag(argv[i], "benchmark_report_aggregates_only",
+                      &FLAGS_benchmark_report_aggregates_only) ||
+        ParseBoolFlag(argv[i], "benchmark_display_aggregates_only",
+                      &FLAGS_benchmark_display_aggregates_only) ||
+        ParseStringFlag(argv[i], "benchmark_format", &FLAGS_benchmark_format) ||
+        ParseStringFlag(argv[i], "benchmark_out", &FLAGS_benchmark_out) ||
+        ParseStringFlag(argv[i], "benchmark_out_format",
+                        &FLAGS_benchmark_out_format) ||
+        ParseStringFlag(argv[i], "benchmark_color", &FLAGS_benchmark_color) ||
+        // "color_print" is the deprecated name for "benchmark_color".
+        // TODO: Remove this.
+        ParseStringFlag(argv[i], "color_print", &FLAGS_benchmark_color) ||
+        ParseBoolFlag(argv[i], "benchmark_counters_tabular",
+                      &FLAGS_benchmark_counters_tabular) ||
         ParseInt32Flag(argv[i], "v", &FLAGS_v)) {
-      for (int j = i; j != *argc; ++j) argv[j] = argv[j + 1];
+      for (int j = i; j != *argc - 1; ++j) argv[j] = argv[j + 1];
 
       --(*argc);
       --i;
@@ -1092,32 +466,34 @@ void ParseCommandLineFlags(int* argc, char** argv) {
       PrintUsageAndExit();
     }
   }
-
-  if (FLAGS_benchmark_format != "console" &&
-      FLAGS_benchmark_format != "json" &&
-      FLAGS_benchmark_format != "csv") {
+  for (auto const* flag :
+       {&FLAGS_benchmark_format, &FLAGS_benchmark_out_format})
+    if (*flag != "console" && *flag != "json" && *flag != "csv") {
+      PrintUsageAndExit();
+    }
+  if (FLAGS_benchmark_color.empty()) {
     PrintUsageAndExit();
   }
 }
 
-Benchmark* RegisterBenchmarkInternal(Benchmark* bench) {
-    std::unique_ptr<Benchmark> bench_ptr(bench);
-    BenchmarkFamilies* families = BenchmarkFamilies::GetInstance();
-    families->AddBenchmark(std::move(bench_ptr));
-    return bench;
+int InitializeStreams() {
+  static std::ios_base::Init init;
+  return 0;
 }
 
-} // end namespace internal
+}  // end namespace internal
 
 void Initialize(int* argc, char** argv) {
   internal::ParseCommandLineFlags(argc, argv);
-  internal::SetLogLevel(FLAGS_v);
-  // TODO remove this. It prints some output the first time it is called.
-  // We don't want to have this ouput printed during benchmarking.
-  MyCPUUsage();
-  // The first call to walltime::Now initialized it. Call it once to
-  // prevent the initialization from happening in a benchmark.
-  walltime::Now();
+  internal::LogLevel() = FLAGS_v;
+}
+
+bool ReportUnrecognizedArguments(int argc, char** argv) {
+  for (int i = 1; i < argc; ++i) {
+    fprintf(stderr, "%s: error: unrecognized command-line flag: %s\n", argv[0],
+            argv[i]);
+  }
+  return argc > 1;
 }
 
-} // end namespace benchmark
+}  // end namespace benchmark
diff --git a/src/benchmark_api_internal.cc b/src/benchmark_api_internal.cc
new file mode 100644
index 0000000..d468a25
--- /dev/null
+++ b/src/benchmark_api_internal.cc
@@ -0,0 +1,15 @@
+#include "benchmark_api_internal.h"
+
+namespace benchmark {
+namespace internal {
+
+State BenchmarkInstance::Run(IterationCount iters, int thread_id,
+                             internal::ThreadTimer* timer,
+                             internal::ThreadManager* manager) const {
+  State st(iters, arg, thread_id, threads, timer, manager);
+  benchmark->Run(st);
+  return st;
+}
+
+}  // internal
+}  // benchmark
diff --git a/src/benchmark_api_internal.h b/src/benchmark_api_internal.h
new file mode 100644
index 0000000..264eff9
--- /dev/null
+++ b/src/benchmark_api_internal.h
@@ -0,0 +1,53 @@
+#ifndef BENCHMARK_API_INTERNAL_H
+#define BENCHMARK_API_INTERNAL_H
+
+#include "benchmark/benchmark.h"
+#include "commandlineflags.h"
+
+#include <cmath>
+#include <iosfwd>
+#include <limits>
+#include <memory>
+#include <string>
+#include <vector>
+
+namespace benchmark {
+namespace internal {
+
+// Information kept per benchmark we may want to run
+struct BenchmarkInstance {
+  BenchmarkName name;
+  Benchmark* benchmark;
+  AggregationReportMode aggregation_report_mode;
+  std::vector<int64_t> arg;
+  TimeUnit time_unit;
+  int range_multiplier;
+  bool measure_process_cpu_time;
+  bool use_real_time;
+  bool use_manual_time;
+  BigO complexity;
+  BigOFunc* complexity_lambda;
+  UserCounters counters;
+  const std::vector<Statistics>* statistics;
+  bool last_benchmark_instance;
+  int repetitions;
+  double min_time;
+  IterationCount iterations;
+  int threads;  // Number of concurrent threads to use
+
+  State Run(IterationCount iters, int thread_id, internal::ThreadTimer* timer,
+            internal::ThreadManager* manager) const;
+};
+
+bool FindBenchmarksInternal(const std::string& re,
+                            std::vector<BenchmarkInstance>* benchmarks,
+                            std::ostream* Err);
+
+bool IsZero(double n);
+
+ConsoleReporter::OutputOptions GetOutputOptions(bool force_no_color = false);
+
+}  // end namespace internal
+}  // end namespace benchmark
+
+#endif  // BENCHMARK_API_INTERNAL_H
diff --git a/src/benchmark_main.cc b/src/benchmark_main.cc
new file mode 100644
index 0000000..b3b2478
--- /dev/null
+++ b/src/benchmark_main.cc
@@ -0,0 +1,17 @@
+// Copyright 2018 Google Inc. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "benchmark/benchmark.h"
+
+BENCHMARK_MAIN();
diff --git a/src/benchmark_name.cc b/src/benchmark_name.cc
new file mode 100644
index 0000000..2a17ebc
--- /dev/null
+++ b/src/benchmark_name.cc
@@ -0,0 +1,58 @@
+// Copyright 2015 Google Inc. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <benchmark/benchmark.h>
+
+namespace benchmark {
+
+namespace {
+
+// Compute the total size of a pack of std::strings
+size_t size_impl() { return 0; }
+
+template <typename Head, typename... Tail>
+size_t size_impl(const Head& head, const Tail&... tail) {
+  return head.size() + size_impl(tail...);
+}
+
+// Join a pack of std::strings using a delimiter
+// TODO: use absl::StrJoin
+void join_impl(std::string&, char) {}
+
+template <typename Head, typename... Tail>
+void join_impl(std::string& s, const char delimiter, const Head& head,
+               const Tail&... tail) {
+  if (!s.empty() && !head.empty()) {
+    s += delimiter;
+  }
+
+  s += head;
+
+  join_impl(s, delimiter, tail...);
+}
+
+template <typename... Ts>
+std::string join(char delimiter, const Ts&... ts) {
+  std::string s;
+  s.reserve(sizeof...(Ts) + size_impl(ts...));
+  join_impl(s, delimiter, ts...);
+  return s;
+}
+}  // namespace
+
+std::string BenchmarkName::str() const {
+  return join('/', function_name, args, min_time, iterations, repetitions,
+              time_type, threads);
+}
+}  // namespace benchmark
diff --git a/src/benchmark_register.cc b/src/benchmark_register.cc
new file mode 100644
index 0000000..65d9944
--- /dev/null
+++ b/src/benchmark_register.cc
@@ -0,0 +1,515 @@
+// Copyright 2015 Google Inc. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "benchmark_register.h"
+
+#ifndef BENCHMARK_OS_WINDOWS
+#ifndef BENCHMARK_OS_FUCHSIA
+#include <sys/resource.h>
+#endif
+#include <sys/time.h>
+#include <unistd.h>
+#endif
+
+#include <algorithm>
+#include <atomic>
+#include <condition_variable>
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include <fstream>
+#include <iostream>
+#include <memory>
+#include <numeric>
+#include <sstream>
+#include <thread>
+
+#ifndef __STDC_FORMAT_MACROS
+#define __STDC_FORMAT_MACROS
+#endif
+#include <inttypes.h>
+
+#include "benchmark/benchmark.h"
+#include "benchmark_api_internal.h"
+#include "check.h"
+#include "commandlineflags.h"
+#include "complexity.h"
+#include "internal_macros.h"
+#include "log.h"
+#include "mutex.h"
+#include "re.h"
+#include "statistics.h"
+#include "string_util.h"
+#include "timers.h"
+
+namespace benchmark {
+
+namespace {
+// For non-dense Range, intermediate values are powers of kRangeMultiplier.
+static const int kRangeMultiplier = 8;
+// The size of a benchmark family is the number of inputs to repeat the
+// benchmark on. If this is "large" then warn the user during configuration.
+static const size_t kMaxFamilySize = 100;
+}  // end namespace
+
+namespace internal {
+
+//=============================================================================//
+//                         BenchmarkFamilies
+//=============================================================================//
+
+// Class for managing registered benchmarks.  Note that each registered
+// benchmark identifies a family of related benchmarks to run.
+class BenchmarkFamilies {
+ public:
+  static BenchmarkFamilies* GetInstance();
+
+  // Registers a benchmark family and returns the index assigned to it.
+  size_t AddBenchmark(std::unique_ptr<Benchmark> family);
+
+  // Clear all registered benchmark families.
+  void ClearBenchmarks();
+
+  // Extract the list of benchmark instances that match the specified
+  // regular expression.
+  bool FindBenchmarks(std::string re,
+                      std::vector<BenchmarkInstance>* benchmarks,
+                      std::ostream* Err);
+
+ private:
+  BenchmarkFamilies() {}
+
+  std::vector<std::unique_ptr<Benchmark>> families_;
+  Mutex mutex_;
+};
+
+BenchmarkFamilies* BenchmarkFamilies::GetInstance() {
+  static BenchmarkFamilies instance;
+  return &instance;
+}
+
+size_t BenchmarkFamilies::AddBenchmark(std::unique_ptr<Benchmark> family) {
+  MutexLock l(mutex_);
+  size_t index = families_.size();
+  families_.push_back(std::move(family));
+  return index;
+}
+
+void BenchmarkFamilies::ClearBenchmarks() {
+  MutexLock l(mutex_);
+  families_.clear();
+  families_.shrink_to_fit();
+}
+
+bool BenchmarkFamilies::FindBenchmarks(
+    std::string spec, std::vector<BenchmarkInstance>* benchmarks,
+    std::ostream* ErrStream) {
+  CHECK(ErrStream);
+  auto& Err = *ErrStream;
+  // Make regular expression out of command-line flag
+  std::string error_msg;
+  Regex re;
+  bool isNegativeFilter = false;
+  if (spec[0] == '-') {
+    spec.replace(0, 1, "");
+    isNegativeFilter = true;
+  }
+  if (!re.Init(spec, &error_msg)) {
+    Err << "Could not compile benchmark re: " << error_msg << std::endl;
+    return false;
+  }
+
+  // Special list of thread counts to use when none are specified
+  const std::vector<int> one_thread = {1};
+
+  MutexLock l(mutex_);
+  for (std::unique_ptr<Benchmark>& family : families_) {
+    // Family was deleted or benchmark doesn't match
+    if (!family) continue;
+
+    if (family->ArgsCnt() == -1) {
+      family->Args({});
+    }
+    const std::vector<int>* thread_counts =
+        (family->thread_counts_.empty()
+             ? &one_thread
+             : &static_cast<const std::vector<int>&>(family->thread_counts_));
+    const size_t family_size = family->args_.size() * thread_counts->size();
+    // The benchmark will be run on at least 'family_size' different inputs.
+    // If 'family_size' is very large warn the user.
+    if (family_size > kMaxFamilySize) {
+      Err << "The number of inputs is very large. " << family->name_
+          << " will be repeated at least " << family_size << " times.\n";
+    }
+    // Reserve up front in the special case of the regex ".", since we know
+    // the final family size.
+    if (spec == ".") benchmarks->reserve(family_size);
+
+    for (auto const& args : family->args_) {
+      for (int num_threads : *thread_counts) {
+        BenchmarkInstance instance;
+        instance.name.function_name = family->name_;
+        instance.benchmark = family.get();
+        instance.aggregation_report_mode = family->aggregation_report_mode_;
+        instance.arg = args;
+        instance.time_unit = family->time_unit_;
+        instance.range_multiplier = family->range_multiplier_;
+        instance.min_time = family->min_time_;
+        instance.iterations = family->iterations_;
+        instance.repetitions = family->repetitions_;
+        instance.measure_process_cpu_time = family->measure_process_cpu_time_;
+        instance.use_real_time = family->use_real_time_;
+        instance.use_manual_time = family->use_manual_time_;
+        instance.complexity = family->complexity_;
+        instance.complexity_lambda = family->complexity_lambda_;
+        instance.statistics = &family->statistics_;
+        instance.threads = num_threads;
+
+        // Add arguments to instance name
+        size_t arg_i = 0;
+        for (auto const& arg : args) {
+          if (!instance.name.args.empty()) {
+            instance.name.args += '/';
+          }
+
+          if (arg_i < family->arg_names_.size()) {
+            const auto& arg_name = family->arg_names_[arg_i];
+            if (!arg_name.empty()) {
+              instance.name.args += StrFormat("%s:", arg_name.c_str());
+            }
+          }
+
+          instance.name.args += StrFormat("%" PRId64, arg);
+          ++arg_i;
+        }
+
+        if (!IsZero(family->min_time_))
+          instance.name.min_time =
+              StrFormat("min_time:%0.3f", family->min_time_);
+        if (family->iterations_ != 0) {
+          instance.name.iterations =
+              StrFormat("iterations:%lu",
+                        static_cast<unsigned long>(family->iterations_));
+        }
+        if (family->repetitions_ != 0)
+          instance.name.repetitions =
+              StrFormat("repeats:%d", family->repetitions_);
+
+        if (family->measure_process_cpu_time_) {
+          instance.name.time_type = "process_time";
+        }
+
+        if (family->use_manual_time_) {
+          if (!instance.name.time_type.empty()) {
+            instance.name.time_type += '/';
+          }
+          instance.name.time_type += "manual_time";
+        } else if (family->use_real_time_) {
+          if (!instance.name.time_type.empty()) {
+            instance.name.time_type += '/';
+          }
+          instance.name.time_type += "real_time";
+        }
+
+        // Add the number of threads used to the name
+        if (!family->thread_counts_.empty()) {
+          instance.name.threads = StrFormat("threads:%d", instance.threads);
+        }
+
+        const auto full_name = instance.name.str();
+        if ((re.Match(full_name) && !isNegativeFilter) ||
+            (!re.Match(full_name) && isNegativeFilter)) {
+          instance.last_benchmark_instance = (&args == &family->args_.back());
+          benchmarks->push_back(std::move(instance));
+        }
+      }
+    }
+  }
+  return true;
+}
+
+Benchmark* RegisterBenchmarkInternal(Benchmark* bench) {
+  std::unique_ptr<Benchmark> bench_ptr(bench);
+  BenchmarkFamilies* families = BenchmarkFamilies::GetInstance();
+  families->AddBenchmark(std::move(bench_ptr));
+  return bench;
+}
+
+// FIXME: This function is a hack so that benchmark.cc can access
+// `BenchmarkFamilies`
+bool FindBenchmarksInternal(const std::string& re,
+                            std::vector<BenchmarkInstance>* benchmarks,
+                            std::ostream* Err) {
+  return BenchmarkFamilies::GetInstance()->FindBenchmarks(re, benchmarks, Err);
+}
+
+//=============================================================================//
+//                               Benchmark
+//=============================================================================//
+
+Benchmark::Benchmark(const char* name)
+    : name_(name),
+      aggregation_report_mode_(ARM_Unspecified),
+      time_unit_(kNanosecond),
+      range_multiplier_(kRangeMultiplier),
+      min_time_(0),
+      iterations_(0),
+      repetitions_(0),
+      measure_process_cpu_time_(false),
+      use_real_time_(false),
+      use_manual_time_(false),
+      complexity_(oNone),
+      complexity_lambda_(nullptr) {
+  ComputeStatistics("mean", StatisticsMean);
+  ComputeStatistics("median", StatisticsMedian);
+  ComputeStatistics("stddev", StatisticsStdDev);
+}
+
+Benchmark::~Benchmark() {}
+
+Benchmark* Benchmark::Arg(int64_t x) {
+  CHECK(ArgsCnt() == -1 || ArgsCnt() == 1);
+  args_.push_back({x});
+  return this;
+}
+
+Benchmark* Benchmark::Unit(TimeUnit unit) {
+  time_unit_ = unit;
+  return this;
+}
+
+Benchmark* Benchmark::Range(int64_t start, int64_t limit) {
+  CHECK(ArgsCnt() == -1 || ArgsCnt() == 1);
+  std::vector<int64_t> arglist;
+  AddRange(&arglist, start, limit, range_multiplier_);
+
+  for (int64_t i : arglist) {
+    args_.push_back({i});
+  }
+  return this;
+}
+
+Benchmark* Benchmark::Ranges(
+    const std::vector<std::pair<int64_t, int64_t>>& ranges) {
+  CHECK(ArgsCnt() == -1 || ArgsCnt() == static_cast<int>(ranges.size()));
+  std::vector<std::vector<int64_t>> arglists(ranges.size());
+  for (std::size_t i = 0; i < ranges.size(); i++) {
+    AddRange(&arglists[i], ranges[i].first, ranges[i].second,
+             range_multiplier_);
+  }
+
+  ArgsProduct(arglists);
+
+  return this;
+}
+
+// Registers the Cartesian product of the given per-argument value lists.
+//
+// `arglists[k]` holds the candidate values for argument position k. One
+// argument tuple is appended to args_ for every combination, with the first
+// list varying fastest. If any list is empty the product is empty and nothing
+// is appended; if `arglists` itself is empty a single empty tuple is appended.
+// Returns `this` so that configuration calls can be chained.
+Benchmark* Benchmark::ArgsProduct(
+    const std::vector<std::vector<int64_t>>& arglists) {
+  CHECK(ArgsCnt() == -1 || ArgsCnt() == static_cast<int>(arglists.size()));
+
+  // indices[k] is the position within arglists[k] used by the current tuple.
+  std::vector<std::size_t> indices(arglists.size());
+  const std::size_t total = std::accumulate(
+      std::begin(arglists), std::end(arglists), std::size_t{1},
+      [](const std::size_t res, const std::vector<int64_t>& arglist) {
+        return res * arglist.size();
+      });
+  std::vector<int64_t> args;
+  args.reserve(arglists.size());
+  for (std::size_t i = 0; i < total; i++) {
+    for (std::size_t arg = 0; arg < arglists.size(); arg++) {
+      args.push_back(arglists[arg][indices[arg]]);
+    }
+    args_.push_back(args);
+    args.clear();
+
+    // Advance the indices like an odometer: bump the first position and carry
+    // into the next one each time a position wraps back to zero. The bounds
+    // test runs before any element access, so an empty `arglists` never
+    // indexes into the (equally empty) `indices` vector.
+    for (std::size_t arg = 0; arg < arglists.size(); arg++) {
+      indices[arg] = (indices[arg] + 1) % arglists[arg].size();
+      if (indices[arg] != 0) break;
+    }
+  }
+
+  return this;
+}
+
+// Replaces the argument names with the single name `name`.
+// Only valid while the benchmark has no arguments yet or is one-argument.
+Benchmark* Benchmark::ArgName(const std::string& name) {
+  CHECK(ArgsCnt() == -1 || ArgsCnt() == 1);
+  arg_names_.assign(1, name);
+  return this;
+}
+
+// Replaces the argument names with `names`, one entry per argument position.
+// The number of names must agree with the argument count already established
+// for this benchmark, if any.
+Benchmark* Benchmark::ArgNames(const std::vector<std::string>& names) {
+  CHECK(ArgsCnt() == -1 || ArgsCnt() == static_cast<int>(names.size()));
+  this->arg_names_ = names;
+  return this;
+}
+
+// Registers one single-argument run for each of start, start + step,
+// start + 2 * step, ... that does not exceed `limit`.
+//
+// Requires start <= limit and step >= 1; a zero or negative step would never
+// move the loop variable past `limit`, so the loop below would not terminate.
+// Returns `this` so that configuration calls can be chained.
+Benchmark* Benchmark::DenseRange(int64_t start, int64_t limit, int step) {
+  CHECK(ArgsCnt() == -1 || ArgsCnt() == 1);
+  CHECK_LE(start, limit);
+  CHECK_GE(step, 1);
+  for (int64_t arg = start; arg <= limit; arg += step) {
+    args_.push_back({arg});
+  }
+  return this;
+}
+
+// Registers one run of the benchmark with the argument tuple `args`.
+// The tuple length must agree with the argument count already established
+// for this benchmark, if any.
+Benchmark* Benchmark::Args(const std::vector<int64_t>& args) {
+  CHECK(ArgsCnt() == -1 || ArgsCnt() == static_cast<int>(args.size()));
+  this->args_.push_back(args);
+  return this;
+}
+
+// Hands this benchmark to the caller-supplied `custom_arguments` function so
+// that it can configure the benchmark programmatically.
+Benchmark* Benchmark::Apply(void (*custom_arguments)(Benchmark* benchmark)) {
+  (*custom_arguments)(this);
+  return this;
+}
+
+// Sets the multiplier that Range() and Ranges() pass on to AddRange().
+// The multiplier must be greater than one.
+Benchmark* Benchmark::RangeMultiplier(int multiplier) {
+  CHECK(multiplier > 1);
+  this->range_multiplier_ = multiplier;
+  return this;
+}
+
+// Sets the minimum running time `t`, which must be positive.
+// Not allowed once an explicit iteration count has been set via Iterations().
+Benchmark* Benchmark::MinTime(double t) {
+  CHECK(t > 0.0);
+  CHECK(iterations_ == 0);
+  this->min_time_ = t;
+  return this;
+}
+
+// Sets an explicit iteration count `n`, which must be positive.
+// Not allowed once a minimum time has been set via MinTime().
+Benchmark* Benchmark::Iterations(IterationCount n) {
+  CHECK(n > 0);
+  CHECK(IsZero(min_time_));
+  this->iterations_ = n;
+  return this;
+}
+
+// Sets the number of repetitions `n`, which must be positive.
+Benchmark* Benchmark::Repetitions(int n) {
+  CHECK(n > 0);
+  this->repetitions_ = n;
+  return this;
+}
+
+// Overwrites the aggregation report mode: ARM_ReportAggregatesOnly when
+// `value` is true, ARM_Default otherwise.
+Benchmark* Benchmark::ReportAggregatesOnly(bool value) {
+  if (value) {
+    aggregation_report_mode_ = ARM_ReportAggregatesOnly;
+  } else {
+    aggregation_report_mode_ = ARM_Default;
+  }
+  return this;
+}
+
+// Sets or clears the ARM_DisplayReportAggregatesOnly bit of the aggregation
+// report mode according to `value`, leaving the other bits as they were.
+Benchmark* Benchmark::DisplayAggregatesOnly(bool value) {
+  // Once this has been called the report mode is no longer 'unspecified',
+  // whichever way the display bit ends up.
+  auto mode_bits = aggregation_report_mode_ | ARM_Default;
+
+  if (value) {
+    mode_bits |= ARM_DisplayReportAggregatesOnly;
+  } else {
+    mode_bits &= ~ARM_DisplayReportAggregatesOnly;
+  }
+
+  aggregation_report_mode_ = static_cast<AggregationReportMode>(mode_bits);
+  return this;
+}
+
+// Requests that CPU time be measured for the whole process.
+// May be combined with UseRealTime() or UseManualTime().
+Benchmark* Benchmark::MeasureProcessCPUTime() {
+  this->measure_process_cpu_time_ = true;
+  return this;
+}
+
+// Selects real time for this benchmark.
+// Mutually exclusive with UseManualTime().
+Benchmark* Benchmark::UseRealTime() {
+  CHECK(!use_manual_time_)
+      << "Cannot set UseRealTime and UseManualTime simultaneously.";
+  this->use_real_time_ = true;
+  return this;
+}
+
+// Selects manually reported time for this benchmark.
+// Mutually exclusive with UseRealTime().
+Benchmark* Benchmark::UseManualTime() {
+  CHECK(!use_real_time_)
+      << "Cannot set UseRealTime and UseManualTime simultaneously.";
+  this->use_manual_time_ = true;
+  return this;
+}
+
+// Records the asymptotic complexity class to use for this benchmark.
+Benchmark* Benchmark::Complexity(BigO complexity) {
+  this->complexity_ = complexity;
+  return this;
+}
+
+// Records a caller-supplied complexity function and switches the complexity
+// class to oLambda so that the function is the one that gets used.
+Benchmark* Benchmark::Complexity(BigOFunc* complexity) {
+  complexity_ = oLambda;
+  complexity_lambda_ = complexity;
+  return this;
+}
+
+// Appends a named statistic, computed by `statistics`, to the list of
+// statistics attached to this benchmark.
+Benchmark* Benchmark::ComputeStatistics(std::string name,
+                                        StatisticsFunc* statistics) {
+  this->statistics_.emplace_back(name, statistics);
+  return this;
+}
+
+// Adds `t` to the list of thread counts; `t` must be positive.
+Benchmark* Benchmark::Threads(int t) {
+  CHECK_GT(t, 0);
+  this->thread_counts_.push_back(t);
+  return this;
+}
+
+// Adds the thread counts that AddRange() produces over
+// [min_threads, max_threads] with a multiplier of two.
+// Requires 0 < min_threads <= max_threads.
+Benchmark* Benchmark::ThreadRange(int min_threads, int max_threads) {
+  CHECK_GT(min_threads, 0);
+  CHECK_GE(max_threads, min_threads);
+
+  // Thread counts are spaced by doubling.
+  const int kThreadMultiplier = 2;
+  AddRange(&thread_counts_, min_threads, max_threads, kThreadMultiplier);
+  return this;
+}
+
+// Adds the thread counts min_threads, min_threads + stride, ... that are
+// strictly below max_threads, followed by max_threads itself.
+// Requires 0 < min_threads <= max_threads and stride >= 1.
+Benchmark* Benchmark::DenseThreadRange(int min_threads, int max_threads,
+                                       int stride) {
+  CHECK_GT(min_threads, 0);
+  CHECK_GE(max_threads, min_threads);
+  CHECK_GE(stride, 1);
+
+  int threads = min_threads;
+  while (threads < max_threads) {
+    thread_counts_.push_back(threads);
+    threads += stride;
+  }
+  // The upper bound is always included, whether or not the stride lands on it.
+  thread_counts_.push_back(max_threads);
+  return this;
+}
+
+// Adds a thread count equal to the CPU count reported by CPUInfo.
+Benchmark* Benchmark::ThreadPerCpu() {
+  const auto& cpu_info = CPUInfo::Get();
+  thread_counts_.push_back(cpu_info.num_cpus);
+  return this;
+}
+
+// Replaces the benchmark's name with `name`.
+void Benchmark::SetName(const char* name) {
+  this->name_ = name;
+}
+
+// Returns the number of arguments each run of this benchmark takes:
+// the size of the first registered argument tuple if there is one, otherwise
+// the number of argument names, or -1 when neither has been provided yet.
+int Benchmark::ArgsCnt() const {
+  if (!args_.empty()) {
+    return static_cast<int>(args_.front().size());
+  }
+  return arg_names_.empty() ? -1 : static_cast<int>(arg_names_.size());
+}
+
+//=============================================================================//
+//                            FunctionBenchmark
+//=============================================================================//
+
+// Runs the benchmark by forwarding the state to the wrapped function.
+void FunctionBenchmark::Run(State& st) {
+  func_(st);
+}
+
+}  // end namespace internal
+
+// Asks the BenchmarkFamilies singleton to clear its registered benchmarks.
+void ClearRegisteredBenchmarks() {
+  auto* families = internal::BenchmarkFamilies::GetInstance();
+  families->ClearBenchmarks();
+}
+
+}  // end namespace benchmark
diff --git a/src/benchmark_register.h b/src/benchmark_register.h
new file mode 100644
index 0000000..c774e6f
--- /dev/null
+++ b/src/benchmark_register.h
@@ -0,0 +1,108 @@
+#ifndef BENCHMARK_REGISTER_H
+#define BENCHMARK_REGISTER_H
+
+#include <limits>
+#include <vector>
+
+#include "check.h"
+
+namespace benchmark {
+namespace internal {
+
+// Appends to *dst, in increasing order, every power of 'mult'
+// (1, mult, mult^2, ...) that lies in the closed interval [lo, hi].
+// Requires 0 <= lo <= hi and mult >= 2.
+// Returns an iterator to the first element that was appended.
+template <typename T>
+typename std::vector<T>::iterator
+AddPowers(std::vector<T>* dst, T lo, T hi, int mult) {
+  CHECK_GE(lo, 0);
+  CHECK_GE(hi, lo);
+  CHECK_GE(mult, 2);
+
+  const size_t first_new = dst->size();
+
+  static const T kmax = std::numeric_limits<T>::max();
+  // Any power above this bound cannot be multiplied by 'mult' again without
+  // moving outside of the range of T.
+  const auto last_safe_power = kmax / mult;
+
+  T power = 1;
+  while (power <= hi) {
+    if (power >= lo) {
+      dst->push_back(power);
+    }
+    if (power > last_safe_power) break;
+    power *= mult;
+  }
+
+  return dst->begin() + first_new;
+}
+
+// Appends to *dst, in increasing order, the negated powers of 'mult'
+// (..., -mult^2, -mult, -1) that lie in the closed interval [lo, hi].
+// Requires lo <= hi <= 0, with neither bound equal to the minimum of T.
+template <typename T>
+void AddNegatedPowers(std::vector<T>* dst, T lo, T hi, int mult) {
+  // Both bounds get negated below, so neither may be the minimum value of T.
+  CHECK_GT(lo, std::numeric_limits<T>::min());
+  CHECK_GT(hi, std::numeric_limits<T>::min());
+  CHECK_GE(hi, lo);
+  CHECK_LE(hi, 0);
+
+  // Mirror the interval onto the non-negative side. The casts are needed
+  // because negating a small integer type promotes it to 'int'.
+  const auto mirrored_hi = static_cast<T>(-lo);
+  const auto mirrored_lo = static_cast<T>(-hi);
+
+  // Generate the positive powers, then flip their signs in place ...
+  const auto first_new = AddPowers(dst, mirrored_lo, mirrored_hi, mult);
+  for (auto pos = first_new; pos != dst->end(); ++pos) {
+    *pos *= -1;
+  }
+  // ... and reverse them so that the appended values are ascending again.
+  std::reverse(first_new, dst->end());
+}
+
+// Appends to *dst a sparse, increasing sampling of the closed interval
+// [lo, hi]: 'lo' itself, then the negated powers of 'mult', zero and the
+// positive powers of 'mult' that lie strictly between the two endpoints, and
+// finally 'hi'. T must be a signed integer type.
+template <typename T>
+void AddRange(std::vector<T>* dst, T lo, T hi, int mult) {
+  static_assert(std::is_integral<T>::value && std::is_signed<T>::value,
+                "Args type must be a signed integer");
+
+  CHECK_GE(hi, lo);
+  CHECK_GE(mult, 2);
+
+  // The lower endpoint always comes first.
+  dst->push_back(lo);
+
+  // A single-point range is already complete. Past this point lo < hi, so
+  // lo + 1 and hi - 1 both stay inside the range of T.
+  if (lo == hi) return;
+
+  // Adjacent endpoints leave no interior; handling them here also guarantees
+  // that the interior bounds computed below are ordered.
+  if (lo + 1 == hi) {
+    dst->push_back(hi);
+    return;
+  }
+
+  // The interior of the range is [lo + 1, hi - 1] (inclusive).
+  const T interior_lo = static_cast<T>(lo + 1);
+  const T interior_hi = static_cast<T>(hi - 1);
+
+  // Negative part of the interior, capped at -1.
+  if (interior_lo < 0) {
+    const T negative_hi = std::min(interior_hi, T{-1});
+    AddNegatedPowers(dst, interior_lo, negative_hi, mult);
+  }
+
+  // Zero is not a power of 'mult', so it is added explicitly whenever 'lo' is
+  // negative and 'hi' is not (see discussion on #762).
+  if (lo < 0 && hi >= 0) {
+    dst->push_back(0);
+  }
+
+  // Positive part of the interior, starting no lower than 1.
+  if (interior_hi > 0) {
+    const T positive_lo = std::max(interior_lo, T{1});
+    AddPowers(dst, positive_lo, interior_hi, mult);
+  }
+
+  // Close with the upper endpoint unless it is already the last value.
+  if (hi != dst->back()) {
+    dst->push_back(hi);
+  }
+}
+
+}  // namespace internal
+}  // namespace benchmark
+
+#endif  // BENCHMARK_REGISTER_H
diff --git a/src/benchmark_runner.cc b/src/benchmark_runner.cc
new file mode 100644
index 0000000..7bc6b63
--- /dev/null
+++ b/src/benchmark_runner.cc
@@ -0,0 +1,362 @@
+// Copyright 2015 Google Inc. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "benchmark_runner.h"
+#include "benchmark/benchmark.h"
+#include "benchmark_api_internal.h"
+#include "internal_macros.h"
+
+#ifndef BENCHMARK_OS_WINDOWS
+#ifndef BENCHMARK_OS_FUCHSIA
+#include <sys/resource.h>
+#endif
+#include <sys/time.h>
+#include <unistd.h>
+#endif
+
+#include <algorithm>
+#include <atomic>
+#include <cassert>
+#include <cmath>
+#include <condition_variable>
+#include <cstdio>
+#include <cstdlib>
+#include <fstream>
+#include <iostream>
+#include <memory>
+#include <string>
+#include <thread>
+#include <utility>
+
+#include "check.h"
+#include "colorprint.h"
+#include "commandlineflags.h"
+#include "complexity.h"
+#include "counter.h"
+#include "internal_macros.h"
+#include "log.h"
+#include "mutex.h"
+#include "re.h"
+#include "statistics.h"
+#include "string_util.h"
+#include "thread_manager.h"
+#include "thread_timer.h"
+
+namespace benchmark {
+
+namespace internal {
+
+// Optional memory-measurement hook, null by default. While it is non-null,
+// each repetition performs one extra short run bracketed by Start()/Stop()
+// to collect the memory statistics attached to the run report.
+MemoryManager* memory_manager = nullptr;
+
+namespace {
+
+// Upper bound on the iteration count that the runner picks automatically.
+// Once a run has used this many iterations its results are reported
+// regardless of how little time elapsed.
+static constexpr IterationCount kMaxIterations = 1000000000;
+
+// Builds the report for one finished run of benchmark instance `b`.
+//
+//   results           - measurements accumulated across all threads.
+//   memory_iterations - iterations used for the separate memory-measurement
+//                       run; zero means no memory measurement was taken.
+//   memory_result     - outcome of that run; only read when
+//                       memory_iterations is greater than zero.
+//   seconds           - elapsed time handed to internal::Finish() when the
+//                       counters are finalized.
+//   repetition_index  - zero-based index of this repetition.
+//
+// When the run reported an error only the identifying fields and the error
+// information are filled in; timings, counters and memory figures are left at
+// their defaults.
+BenchmarkReporter::Run CreateRunReport(
+    const benchmark::internal::BenchmarkInstance& b,
+    const internal::ThreadManager::Result& results,
+    IterationCount memory_iterations,
+    const MemoryManager::Result& memory_result, double seconds,
+    int64_t repetition_index) {
+  // Create report about this benchmark run.
+  BenchmarkReporter::Run report;
+
+  report.run_name = b.name;
+  report.error_occurred = results.has_error_;
+  report.error_message = results.error_message_;
+  report.report_label = results.report_label_;
+  // This is the total iterations across all threads.
+  report.iterations = results.iterations;
+  report.time_unit = b.time_unit;
+  report.threads = b.threads;
+  report.repetition_index = repetition_index;
+  report.repetitions = b.repetitions;
+
+  if (!report.error_occurred) {
+    if (b.use_manual_time) {
+      report.real_accumulated_time = results.manual_time_used;
+    } else {
+      report.real_accumulated_time = results.real_time_used;
+    }
+    report.cpu_accumulated_time = results.cpu_time_used;
+    report.complexity_n = results.complexity_n;
+    report.complexity = b.complexity;
+    report.complexity_lambda = b.complexity_lambda;
+    report.statistics = b.statistics;
+    report.counters = results.counters;
+
+    if (memory_iterations > 0) {
+      report.has_memory_result = true;
+      // memory_iterations is known to be non-zero here, so the division
+      // needs no further guard.
+      report.allocs_per_iter = static_cast<double>(memory_result.num_allocs) /
+                               static_cast<double>(memory_iterations);
+      report.max_bytes_used = memory_result.max_bytes_used;
+    }
+
+    internal::Finish(&report.counters, results.iterations, seconds, b.threads);
+  }
+  return report;
+}
+
+// Runs one thread's share of benchmark `b` for `iters` iterations and folds
+// what that thread measured into the shared `manager->results`.
+//
+// The accumulation happens while holding the manager's benchmark mutex; once
+// it is done the thread announces itself via NotifyThreadComplete().
+void RunInThread(const BenchmarkInstance* b, IterationCount iters,
+                 int thread_id, ThreadManager* manager) {
+  // Time either the whole process or just this thread, as the benchmark asks.
+  internal::ThreadTimer timer(
+      b->measure_process_cpu_time
+          ? internal::ThreadTimer::CreateProcessCpuTime()
+          : internal::ThreadTimer::Create());
+
+  State st = b->Run(iters, thread_id, &timer, manager);
+  CHECK(st.error_occurred() || st.iterations() >= st.max_iterations)
+      << "Benchmark returned before State::KeepRunning() returned false!";
+
+  {
+    MutexLock lock(manager->GetBenchmarkMutex());
+    internal::ThreadManager::Result& totals = manager->results;
+
+    // Iteration bookkeeping.
+    totals.iterations += st.iterations();
+    totals.complexity_n += st.complexity_length_n();
+
+    // Timings as seen by this thread's timer.
+    totals.cpu_time_used += timer.cpu_time_used();
+    totals.real_time_used += timer.real_time_used();
+    totals.manual_time_used += timer.manual_time_used();
+
+    // User counters.
+    internal::Increment(&totals.counters, st.counters);
+  }
+
+  manager->NotifyThreadComplete();
+}
+
+// Runs every repetition of a single benchmark instance and collects the
+// resulting reports.
+//
+// All of the work happens in the constructor: it resolves the effective
+// settings (per-benchmark values take precedence over the command-line
+// flags), runs the repetitions, and computes the aggregate statistics.
+// The outcome is then moved out with get_results().
+class BenchmarkRunner {
+ public:
+  // `b_` is the instance to run. `complexity_reports_` accumulates the
+  // per-run reports of benchmarks that request a complexity analysis; it is
+  // consumed and cleared once the last instance of the family has run.
+  BenchmarkRunner(const benchmark::internal::BenchmarkInstance& b_,
+                  std::vector<BenchmarkReporter::Run>* complexity_reports_)
+      : b(b_),
+        complexity_reports(*complexity_reports_),
+        min_time(!IsZero(b.min_time) ? b.min_time : FLAGS_benchmark_min_time),
+        repeats(b.repetitions != 0 ? b.repetitions
+                                   : FLAGS_benchmark_repetitions),
+        has_explicit_iteration_count(b.iterations != 0),
+        pool(b.threads - 1),
+        iters(has_explicit_iteration_count ? b.iterations : 1) {
+    run_results.display_report_aggregates_only =
+        (FLAGS_benchmark_report_aggregates_only ||
+         FLAGS_benchmark_display_aggregates_only);
+    run_results.file_report_aggregates_only =
+        FLAGS_benchmark_report_aggregates_only;
+    // A report mode set on the benchmark itself overrides the flags.
+    if (b.aggregation_report_mode != internal::ARM_Unspecified) {
+      run_results.display_report_aggregates_only =
+          (b.aggregation_report_mode &
+           internal::ARM_DisplayReportAggregatesOnly);
+      run_results.file_report_aggregates_only =
+          (b.aggregation_report_mode & internal::ARM_FileReportAggregatesOnly);
+    }
+
+    for (int repetition_num = 0; repetition_num < repeats; repetition_num++) {
+      DoOneRepetition(repetition_num);
+    }
+
+    // Calculate additional statistics
+    run_results.aggregates_only = ComputeStats(run_results.non_aggregates);
+
+    // Maybe calculate complexity report
+    if ((b.complexity != oNone) && b.last_benchmark_instance) {
+      auto additional_run_stats = ComputeBigO(complexity_reports);
+      run_results.aggregates_only.insert(run_results.aggregates_only.end(),
+                                         additional_run_stats.begin(),
+                                         additional_run_stats.end());
+      complexity_reports.clear();
+    }
+  }
+
+  // Hands the collected results over to the caller by move.
+  RunResults&& get_results() { return std::move(run_results); }
+
+ private:
+  RunResults run_results;
+
+  const benchmark::internal::BenchmarkInstance& b;
+  std::vector<BenchmarkReporter::Run>& complexity_reports;
+
+  const double min_time;
+  const int repeats;
+  const bool has_explicit_iteration_count;
+
+  // Holds the b.threads - 1 helper threads; the calling thread acts as
+  // thread 0.
+  std::vector<std::thread> pool;
+
+  IterationCount iters;  // preserved between repetitions!
+  // So only the first repetition has to find/calculate it,
+  // the other repetitions will just use that precomputed iteration count.
+
+  // Outcome of one DoNIterations() call.
+  struct IterationResults {
+    internal::ThreadManager::Result results;
+    IterationCount iters;
+    double seconds;
+  };
+
+  // Runs the benchmark once, on all threads, for the current `iters`, and
+  // returns the measurements together with the time the run is judged by.
+  IterationResults DoNIterations() {
+    VLOG(2) << "Running " << b.name.str() << " for " << iters << "\n";
+
+    std::unique_ptr<internal::ThreadManager> manager;
+    manager.reset(new internal::ThreadManager(b.threads));
+
+    // Run all but one thread in separate threads
+    for (std::size_t ti = 0; ti < pool.size(); ++ti) {
+      pool[ti] = std::thread(&RunInThread, &b, iters, static_cast<int>(ti + 1),
+                             manager.get());
+    }
+    // And run one thread here directly.
+    // (If we were asked to run just one thread, we don't create new threads.)
+    // Yes, we need to do this here *after* we start the separate threads.
+    RunInThread(&b, iters, 0, manager.get());
+
+    // The main thread has finished. Now let's wait for the other threads.
+    manager->WaitForAllThreads();
+    for (std::thread& thread : pool) thread.join();
+
+    IterationResults i;
+    // Acquire the measurements/counters from the manager, UNDER THE LOCK!
+    {
+      MutexLock l(manager->GetBenchmarkMutex());
+      i.results = manager->results;
+    }
+
+    // And get rid of the manager.
+    manager.reset();
+
+    // Adjust real/manual time stats since they were reported per thread.
+    i.results.real_time_used /= b.threads;
+    i.results.manual_time_used /= b.threads;
+    // If we were measuring whole-process CPU usage, adjust the CPU time too.
+    if (b.measure_process_cpu_time) i.results.cpu_time_used /= b.threads;
+
+    VLOG(2) << "Ran in " << i.results.cpu_time_used << "/"
+            << i.results.real_time_used << "\n";
+
+    // So for how long were we running?
+    i.iters = iters;
+    // Base decisions off of real time if requested by this benchmark.
+    i.seconds = i.results.cpu_time_used;
+    if (b.use_manual_time) {
+      i.seconds = i.results.manual_time_used;
+    } else if (b.use_real_time) {
+      i.seconds = i.results.real_time_used;
+    }
+
+    return i;
+  }
+
+  // Estimates the iteration count for the next attempt from the run `i`,
+  // which was too short. The result is always greater than i.iters and never
+  // exceeds kMaxIterations.
+  IterationCount PredictNumItersNeeded(const IterationResults& i) const {
+    // See how much iterations should be increased by.
+    // Note: Avoid division by zero with max(seconds, 1ns).
+    double multiplier = min_time * 1.4 / std::max(i.seconds, 1e-9);
+    // If our last run was at least 10% of FLAGS_benchmark_min_time then we
+    // use the multiplier directly.
+    // Otherwise we use at most 10 times expansion.
+    // NOTE: When the last run was at least 10% of the min time the max
+    // expansion should be 14x.
+    bool is_significant = (i.seconds / min_time) > 0.1;
+    multiplier = is_significant ? multiplier : std::min(10.0, multiplier);
+    if (multiplier <= 1.0) multiplier = 2.0;
+
+    // So what seems to be the sufficiently-large iteration count? Round to
+    // the nearest integer, but always grow by at least one iteration.
+    // std::llround is used instead of std::lround because this value is not
+    // clamped yet and can exceed what a 32-bit 'long' is able to hold.
+    const IterationCount max_next_iters = static_cast<IterationCount>(
+        std::llround(std::max(multiplier * static_cast<double>(i.iters),
+                              static_cast<double>(i.iters) + 1.0)));
+    // But we do have *some* sanity limits though..
+    const IterationCount next_iters = std::min(max_next_iters, kMaxIterations);
+
+    VLOG(3) << "Next iters: " << next_iters << ", " << multiplier << "\n";
+    return next_iters;
+  }
+
+  // Decides whether the run `i` is good enough to be reported.
+  bool ShouldReportIterationResults(const IterationResults& i) const {
+    // Determine if this run should be reported;
+    // Either it has run for a sufficient amount of time
+    // or because an error was reported.
+    return i.results.has_error_ ||
+           i.iters >= kMaxIterations ||  // Too many iterations already.
+           i.seconds >= min_time ||      // The elapsed time is large enough.
+           // CPU time is specified but the elapsed real time greatly exceeds
+           // the minimum time.
+           // Note that user provided timers are exempt from this sanity check.
+           ((i.results.real_time_used >= 5 * min_time) && !b.use_manual_time);
+  }
+
+  // Runs one repetition, optionally followed by a memory-measurement run,
+  // and appends the resulting report to run_results.non_aggregates.
+  void DoOneRepetition(int64_t repetition_index) {
+    const bool is_the_first_repetition = repetition_index == 0;
+    IterationResults i;
+
+    // We *may* be gradually increasing the length (iteration count)
+    // of the benchmark until we decide the results are significant.
+    // And once we do, we report those last results and exit.
+    // Please note that if there are repetitions, the iteration count
+    // is *only* calculated for the *first* repetition, and other repetitions
+    // simply use that precomputed iteration count.
+    for (;;) {
+      i = DoNIterations();
+
+      // Do we consider the results to be significant?
+      // If we are doing repetitions, and the first repetition was already done,
+      // it has calculated the correct iteration time, so we have run that very
+      // iteration count just now. No need to calculate anything. Just report.
+      // Else, the normal rules apply.
+      const bool results_are_significant = !is_the_first_repetition ||
+                                           has_explicit_iteration_count ||
+                                           ShouldReportIterationResults(i);
+
+      if (results_are_significant) break;  // Good, let's report them!
+
+      // Nope, bad iteration. Let's re-estimate the hopefully-sufficient
+      // iteration count, and run the benchmark again...
+
+      iters = PredictNumItersNeeded(i);
+      assert(iters > i.iters &&
+             "if we did more iterations than we want to do the next time, "
+             "then we should have accepted the current iteration run.");
+    }
+
+    // Oh, one last thing, we need to also produce the 'memory measurements'..
+    MemoryManager::Result memory_result;
+    IterationCount memory_iterations = 0;
+    if (memory_manager != nullptr) {
+      // Only run a few iterations to reduce the impact of one-time
+      // allocations in benchmarks that are not properly managed.
+      memory_iterations = std::min<IterationCount>(16, iters);
+      memory_manager->Start();
+      std::unique_ptr<internal::ThreadManager> manager;
+      manager.reset(new internal::ThreadManager(1));
+      RunInThread(&b, memory_iterations, 0, manager.get());
+      manager->WaitForAllThreads();
+      manager.reset();
+
+      memory_manager->Stop(&memory_result);
+    }
+
+    // Ok, now actually report.
+    BenchmarkReporter::Run report =
+        CreateRunReport(b, i.results, memory_iterations, memory_result,
+                        i.seconds, repetition_index);
+
+    if (!report.error_occurred && b.complexity != oNone)
+      complexity_reports.push_back(report);
+
+    run_results.non_aggregates.push_back(report);
+  }
+};
+
+}  // end namespace
+
+// Runs benchmark instance `b` to completion and returns its reports.
+// The whole run takes place while the BenchmarkRunner is being constructed;
+// the finished results are then moved out of it.
+RunResults RunBenchmark(
+    const benchmark::internal::BenchmarkInstance& b,
+    std::vector<BenchmarkReporter::Run>* complexity_reports) {
+  internal::BenchmarkRunner runner(b, complexity_reports);
+  return runner.get_results();
+}
+
+}  // end namespace internal
+
+}  // end namespace benchmark
diff --git a/src/benchmark_runner.h b/src/benchmark_runner.h
new file mode 100644
index 0000000..96e8282
--- /dev/null
+++ b/src/benchmark_runner.h
@@ -0,0 +1,51 @@
+// Copyright 2015 Google Inc. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef BENCHMARK_RUNNER_H_
+#define BENCHMARK_RUNNER_H_
+
+#include "benchmark_api_internal.h"
+#include "internal_macros.h"
+
+DECLARE_double(benchmark_min_time);
+
+DECLARE_int32(benchmark_repetitions);
+
+DECLARE_bool(benchmark_report_aggregates_only);
+
+DECLARE_bool(benchmark_display_aggregates_only);
+
+namespace benchmark {
+
+namespace internal {
+
+extern MemoryManager* memory_manager;
+
+// Everything produced by running one benchmark instance.
+struct RunResults {
+  // One report per repetition.
+  std::vector<BenchmarkReporter::Run> non_aggregates;
+  // Reports derived from the repetitions (statistics and, where requested,
+  // the complexity reports).
+  std::vector<BenchmarkReporter::Run> aggregates_only;
+
+  // Aggregates-only settings for the display and the file output.
+  bool display_report_aggregates_only = false;
+  bool file_report_aggregates_only = false;
+};
+
+// Runs the benchmark instance `b` and returns the reports it produced.
+// `complexity_reports` is an in/out accumulator of per-run reports used for
+// the complexity analysis.
+RunResults RunBenchmark(
+    const benchmark::internal::BenchmarkInstance& b,
+    std::vector<BenchmarkReporter::Run>* complexity_reports);
+
+}  // namespace internal
+
+}  // end namespace benchmark
+
+#endif  // BENCHMARK_RUNNER_H_
diff --git a/src/check.h b/src/check.h
index 4572bab..f5f8253 100644
--- a/src/check.h
+++ b/src/check.h
@@ -1,6 +1,7 @@
 #ifndef CHECK_H_
 #define CHECK_H_
 
+#include <cmath>
 #include <cstdlib>
 #include <ostream>
 
@@ -13,55 +14,56 @@ namespace internal {
 typedef void(AbortHandlerT)();
 
 inline AbortHandlerT*& GetAbortHandler() {
-    static AbortHandlerT* handler = &std::abort;
-    return handler;
+  static AbortHandlerT* handler = &std::abort;
+  return handler;
 }
 
 BENCHMARK_NORETURN inline void CallAbortHandler() {
-    GetAbortHandler()();
-    std::abort(); // fallback to enforce noreturn
+  GetAbortHandler()();
+  std::abort();  // fallback to enforce noreturn
 }
 
 // CheckHandler is the class constructed by failing CHECK macros. CheckHandler
 // will log information about the failures and abort when it is destructed.
 class CheckHandler {
-public:
+ public:
   CheckHandler(const char* check, const char* file, const char* func, int line)
-    : log_(GetErrorLogInstance())
-  {
-    log_ << file << ":" << line << ": " << func << ": Check `"
-          << check << "' failed. ";
+      : log_(GetErrorLogInstance()) {
+    log_ << file << ":" << line << ": " << func << ": Check `" << check
+         << "' failed. ";
   }
 
-  std::ostream& GetLog() {
-    return log_;
-  }
+  LogType& GetLog() { return log_; }
 
   BENCHMARK_NORETURN ~CheckHandler() BENCHMARK_NOEXCEPT_OP(false) {
-      log_ << std::endl;
-      CallAbortHandler();
+    log_ << std::endl;
+    CallAbortHandler();
   }
 
-  CheckHandler & operator=(const CheckHandler&) = delete;
+  CheckHandler& operator=(const CheckHandler&) = delete;
   CheckHandler(const CheckHandler&) = delete;
   CheckHandler() = delete;
-private:
-  std::ostream& log_;
+
+ private:
+  LogType& log_;
 };
 
-} // end namespace internal
-} // end namespace benchmark
+}  // end namespace internal
+}  // end namespace benchmark
 
 // The CHECK macro returns a std::ostream object that can have extra information
 // written to it.
 #ifndef NDEBUG
-# define CHECK(b)  (b ? ::benchmark::internal::GetNullLogInstance()        \
-                      : ::benchmark::internal::CheckHandler(               \
-                          #b, __FILE__, __func__, __LINE__).GetLog())
+// 'b' is parenthesized so that an argument containing a low-precedence
+// operator (for example an assignment or a nested conditional) is tested as
+// a whole rather than being re-associated with the '?:' below.
+#define CHECK(b)                                                             \
+  ((b) ? ::benchmark::internal::GetNullLogInstance()                         \
+       : ::benchmark::internal::CheckHandler(#b, __FILE__, __func__,         \
+                                             __LINE__)                       \
+             .GetLog())
 #else
-# define CHECK(b) ::benchmark::internal::GetNullLogInstance()
+#define CHECK(b) ::benchmark::internal::GetNullLogInstance()
 #endif
 
+// clang-format off
+// preserve whitespacing between operators for alignment
 #define CHECK_EQ(a, b) CHECK((a) == (b))
 #define CHECK_NE(a, b) CHECK((a) != (b))
 #define CHECK_GE(a, b) CHECK((a) >= (b))
@@ -69,4 +71,12 @@ private:
 #define CHECK_GT(a, b) CHECK((a) > (b))
 #define CHECK_LT(a, b) CHECK((a) < (b))
 
+// Floating-point comparisons with an explicit tolerance 'eps'.
+// EQ/NE test whether |a - b| is below / not below eps; GE/LE tolerate a
+// shortfall smaller than eps; GT/LT require a margin greater than eps.
+#define CHECK_FLOAT_EQ(a, b, eps) CHECK(std::fabs((a) - (b)) <  (eps))
+#define CHECK_FLOAT_NE(a, b, eps) CHECK(std::fabs((a) - (b)) >= (eps))
+#define CHECK_FLOAT_GE(a, b, eps) CHECK((a) - (b) > -(eps))
+#define CHECK_FLOAT_LE(a, b, eps) CHECK((b) - (a) > -(eps))
+#define CHECK_FLOAT_GT(a, b, eps) CHECK((a) - (b) >  (eps))
+#define CHECK_FLOAT_LT(a, b, eps) CHECK((b) - (a) >  (eps))
+// clang-format on
+
 #endif  // CHECK_H_
diff --git a/src/colorprint.cc b/src/colorprint.cc
index efb8626..fff6a98 100644
--- a/src/colorprint.cc
+++ b/src/colorprint.cc
@@ -16,19 +16,20 @@
 
 #include <cstdarg>
 #include <cstdio>
-#include <cstdarg>
-#include <string>
+#include <cstdlib>
+#include <cstring>
 #include <memory>
+#include <string>
 
-#include "commandlineflags.h"
 #include "check.h"
 #include "internal_macros.h"
 
 #ifdef BENCHMARK_OS_WINDOWS
-#include <Windows.h>
-#endif
-
-DECLARE_bool(color_print);
+#include <windows.h>
+#include <io.h>
+#else
+#include <unistd.h>
+#endif  // BENCHMARK_OS_WINDOWS
 
 namespace benchmark {
 namespace {
@@ -81,35 +82,35 @@ PlatformColorCode GetPlatformColorCode(LogColor color) {
 
 }  // end namespace
 
-std::string FormatString(const char *msg, va_list args) {
+std::string FormatString(const char* msg, va_list args) {
   // we might need a second shot at this, so pre-emptivly make a copy
   va_list args_cp;
   va_copy(args_cp, args);
 
   std::size_t size = 256;
   char local_buff[256];
-  auto ret = std::vsnprintf(local_buff, size, msg, args_cp);
+  auto ret = vsnprintf(local_buff, size, msg, args_cp);
 
   va_end(args_cp);
 
   // currently there is no error handling for failure, so this is hack.
   CHECK(ret >= 0);
 
-  if (ret == 0) // handle empty expansion
+  if (ret == 0)  // handle empty expansion
     return {};
   else if (static_cast<size_t>(ret) < size)
     return local_buff;
   else {
     // we did not provide a long enough buffer on our first attempt.
-    size = (size_t)ret + 1; // + 1 for the null byte
+    size = (size_t)ret + 1;  // + 1 for the null byte
     std::unique_ptr<char[]> buff(new char[size]);
-    ret = std::vsnprintf(buff.get(), size, msg, args);
+    ret = vsnprintf(buff.get(), size, msg, args);
     CHECK(ret > 0 && ((size_t)ret) < size);
     return buff.get();
   }
 }
 
-std::string FormatString(const char *msg, ...) {
+std::string FormatString(const char* msg, ...) {
   va_list args;
   va_start(args, msg);
   auto tmp = FormatString(msg, args);
@@ -120,14 +121,15 @@ std::string FormatString(const char *msg, ...) {
 void ColorPrintf(std::ostream& out, LogColor color, const char* fmt, ...) {
   va_list args;
   va_start(args, fmt);
+  ColorPrintf(out, color, fmt, args);
+  va_end(args);
+}
 
-  if (!FLAGS_color_print) {
-    out << FormatString(fmt, args);
-    va_end(args);
-    return;
-  }
-
+void ColorPrintf(std::ostream& out, LogColor color, const char* fmt,
+                 va_list args) {
 #ifdef BENCHMARK_OS_WINDOWS
+  ((void)out);  // suppress unused warning
+
   const HANDLE stdout_handle = GetStdHandle(STD_OUTPUT_HANDLE);
 
   // Gets the current text color.
@@ -151,8 +153,36 @@ void ColorPrintf(std::ostream& out, LogColor color, const char* fmt, ...) {
   if (color_code) out << FormatString("\033[0;3%sm", color_code);
   out << FormatString(fmt, args) << "\033[m";
 #endif
+}
 
-  va_end(args);
+// Reports whether colored output can be used on stdout.
+//
+// On Windows this is the case whenever _isatty() accepts stdout. Elsewhere
+// stdout must be a terminal according to isatty() and the TERM environment
+// variable must match one of the terminal names listed below.
+bool IsColorTerminal() {
+#ifdef BENCHMARK_OS_WINDOWS
+  // On Windows the TERM variable is usually not set, but the
+  // console there does support colors.
+  return 0 != _isatty(_fileno(stdout));
+#else
+  // On non-Windows platforms, we rely on the TERM variable. This list of
+  // supported TERM values is copied from Google Test:
+  // <https://github.com/google/googletest/blob/master/googletest/src/gtest.cc#L2925>.
+  const char* const SUPPORTED_TERM_VALUES[] = {
+      "xterm",         "xterm-color",     "xterm-256color",
+      "screen",        "screen-256color", "tmux",
+      "tmux-256color", "rxvt-unicode",    "rxvt-unicode-256color",
+      "linux",         "cygwin",
+  };
+
+  const char* const term = getenv("TERM");
+
+  // An unset TERM matches nothing, so the list is only searched when the
+  // variable is present.
+  bool term_supports_color = false;
+  if (term != nullptr) {
+    for (const char* candidate : SUPPORTED_TERM_VALUES) {
+      if (0 == strcmp(term, candidate)) {
+        term_supports_color = true;
+        break;
+      }
+    }
+  }
+
+  return 0 != isatty(fileno(stdout)) && term_supports_color;
+#endif  // BENCHMARK_OS_WINDOWS
 }
 
 }  // end namespace benchmark
diff --git a/src/colorprint.h b/src/colorprint.h
index 2b3c082..9f6fab9 100644
--- a/src/colorprint.h
+++ b/src/colorprint.h
@@ -2,8 +2,8 @@
 #define BENCHMARK_COLORPRINT_H_
 
 #include <cstdarg>
-#include <string>
 #include <iostream>
+#include <string>
 
 namespace benchmark {
 enum LogColor {
@@ -20,8 +20,14 @@ enum LogColor {
 std::string FormatString(const char* msg, va_list args);
 std::string FormatString(const char* msg, ...);
 
+void ColorPrintf(std::ostream& out, LogColor color, const char* fmt,
+                 va_list args);
 void ColorPrintf(std::ostream& out, LogColor color, const char* fmt, ...);
 
+// Returns true if stdout appears to be a terminal that supports colored
+// output, false otherwise.
+bool IsColorTerminal();
+
 }  // end namespace benchmark
 
 #endif  // BENCHMARK_COLORPRINT_H_
diff --git a/src/commandlineflags.cc b/src/commandlineflags.cc
index 3e9a37a..0648fe3 100644
--- a/src/commandlineflags.cc
+++ b/src/commandlineflags.cc
@@ -14,12 +14,16 @@
 
 #include "commandlineflags.h"
 
+#include <algorithm>
+#include <cctype>
 #include <cstdlib>
 #include <cstring>
 #include <iostream>
 #include <limits>
 
 namespace benchmark {
+namespace {
+
 // Parses 'str' for a 32-bit signed integer.  If successful, writes
 // the result to *value and returns true; otherwise leaves *value
 // unchanged and returns false.
@@ -43,8 +47,8 @@ bool ParseInt32(const std::string& src_text, const char* str, int32_t* value) {
       // The parsed value overflows as a long.  (strtol() returns
       // LONG_MAX or LONG_MIN when the input overflows.)
       result != long_value
-          // The parsed value overflows as an Int32.
-      ) {
+      // The parsed value overflows as an Int32.
+  ) {
     std::cerr << src_text << " is expected to be a 32-bit integer, "
               << "but actually has value \"" << str << "\", "
               << "which overflows.\n";
@@ -74,17 +78,6 @@ bool ParseDouble(const std::string& src_text, const char* str, double* value) {
   return true;
 }
 
-inline const char* GetEnv(const char* name) {
-#if defined(__BORLANDC__) || defined(__SunOS_5_8) || defined(__SunOS_5_9)
-  // Environment variables which we programmatically clear will be set to the
-  // empty string rather than unset (nullptr).  Handle that case.
-  const char* const env = getenv(name);
-  return (env != nullptr && env[0] != '\0') ? env : nullptr;
-#else
-  return getenv(name);
-#endif
-}
-
 // Returns the name of the environment variable corresponding to the
 // given flag.  For example, FlagToEnvVar("foo") will return
 // "BENCHMARK_FOO" in the open-source version.
@@ -95,46 +88,45 @@ static std::string FlagToEnvVar(const char* flag) {
   for (size_t i = 0; i != flag_str.length(); ++i)
     env_var += static_cast<char>(::toupper(flag_str.c_str()[i]));
 
-  return "BENCHMARK_" + env_var;
+  return env_var;
 }
 
-// Reads and returns the Boolean environment variable corresponding to
-// the given flag; if it's not set, returns default_value.
-//
-// The value is considered true iff it's not "0".
-bool BoolFromEnv(const char* flag, bool default_value) {
+}  // namespace
+
+bool BoolFromEnv(const char* flag, bool default_val) {
   const std::string env_var = FlagToEnvVar(flag);
-  const char* const string_value = GetEnv(env_var.c_str());
-  return string_value == nullptr ? default_value : strcmp(string_value, "0") != 0;
+  const char* const value_str = getenv(env_var.c_str());
+  return value_str == nullptr ? default_val : IsTruthyFlagValue(value_str);
 }
 
-// Reads and returns a 32-bit integer stored in the environment
-// variable corresponding to the given flag; if it isn't set or
-// doesn't represent a valid 32-bit integer, returns default_value.
-int32_t Int32FromEnv(const char* flag, int32_t default_value) {
+int32_t Int32FromEnv(const char* flag, int32_t default_val) {
   const std::string env_var = FlagToEnvVar(flag);
-  const char* const string_value = GetEnv(env_var.c_str());
-  if (string_value == nullptr) {
-    // The environment variable is not set.
-    return default_value;
+  const char* const value_str = getenv(env_var.c_str());
+  int32_t value = default_val;
+  if (value_str == nullptr ||
+      !ParseInt32(std::string("Environment variable ") + env_var, value_str,
+                  &value)) {
+    return default_val;
   }
+  return value;
+}
 
-  int32_t result = default_value;
-  if (!ParseInt32(std::string("Environment variable ") + env_var, string_value,
-                  &result)) {
-    std::cout << "The default value " << default_value << " is used.\n";
-    return default_value;
+double DoubleFromEnv(const char* flag, double default_val) {
+  const std::string env_var = FlagToEnvVar(flag);
+  const char* const value_str = getenv(env_var.c_str());
+  double value = default_val;
+  if (value_str == nullptr ||
+      !ParseDouble(std::string("Environment variable ") + env_var, value_str,
+                   &value)) {
+    return default_val;
   }
-
-  return result;
+  return value;
 }
 
-// Reads and returns the string environment variable corresponding to
-// the given flag; if it's not set, returns default_value.
-const char* StringFromEnv(const char* flag, const char* default_value) {
+const char* StringFromEnv(const char* flag, const char* default_val) {
   const std::string env_var = FlagToEnvVar(flag);
-  const char* const value = GetEnv(env_var.c_str());
-  return value == nullptr ? default_value : value;
+  const char* const value = getenv(env_var.c_str());
+  return value == nullptr ? default_val : value;
 }
 
 // Parses a string as a command line flag.  The string should have
@@ -175,7 +167,7 @@ bool ParseBoolFlag(const char* str, const char* flag, bool* value) {
   if (value_str == nullptr) return false;
 
   // Converts the string value to a bool.
-  *value = !(*value_str == '0' || *value_str == 'f' || *value_str == 'F');
+  *value = IsTruthyFlagValue(value_str);
   return true;
 }
 
@@ -217,4 +209,20 @@ bool ParseStringFlag(const char* str, const char* flag, std::string* value) {
 bool IsFlag(const char* str, const char* flag) {
   return (ParseFlagValue(str, flag, true) != nullptr);
 }
+
+bool IsTruthyFlagValue(const std::string& value) {
+  // An empty value is treated as true.
+  if (value.empty()) return true;
+  // <cctype> calls are undefined for negative chars: use unsigned char.
+  if (value.size() == 1) {
+    const unsigned char v = static_cast<unsigned char>(value[0]);
+    return isalnum(v) &&
+           !(v == '0' || v == 'f' || v == 'F' || v == 'n' || v == 'N');
+  }
+  std::string lower(value);
+  for (char& c : lower)
+    c = static_cast<char>(::tolower(static_cast<unsigned char>(c)));
+  return !(lower == "false" || lower == "no" || lower == "off");
+}
+
 }  // end namespace benchmark
diff --git a/src/commandlineflags.h b/src/commandlineflags.h
index 34b9c6f..3a1f6a8 100644
--- a/src/commandlineflags.h
+++ b/src/commandlineflags.h
@@ -10,36 +10,57 @@
 // Macros for declaring flags.
 #define DECLARE_bool(name) extern bool FLAG(name)
 #define DECLARE_int32(name) extern int32_t FLAG(name)
-#define DECLARE_int64(name) extern int64_t FLAG(name)
 #define DECLARE_double(name) extern double FLAG(name)
 #define DECLARE_string(name) extern std::string FLAG(name)
 
 // Macros for defining flags.
-#define DEFINE_bool(name, default_val, doc) bool FLAG(name) = (default_val)
-#define DEFINE_int32(name, default_val, doc) int32_t FLAG(name) = (default_val)
-#define DEFINE_int64(name, default_val, doc) int64_t FLAG(name) = (default_val)
-#define DEFINE_double(name, default_val, doc) double FLAG(name) = (default_val)
-#define DEFINE_string(name, default_val, doc) \
-  std::string FLAG(name) = (default_val)
+#define DEFINE_bool(name, default_val)            \
+  bool FLAG(name) =                               \
+    benchmark::BoolFromEnv(#name, default_val)
+#define DEFINE_int32(name, default_val)           \
+  int32_t FLAG(name) =                            \
+    benchmark::Int32FromEnv(#name, default_val)
+#define DEFINE_double(name, default_val)          \
+  double FLAG(name) =                             \
+    benchmark::DoubleFromEnv(#name, default_val)
+#define DEFINE_string(name, default_val)          \
+  std::string FLAG(name) =                        \
+    benchmark::StringFromEnv(#name, default_val)
 
 namespace benchmark {
-// Parses 'str' for a 32-bit signed integer.  If successful, writes the result
-// to *value and returns true; otherwise leaves *value unchanged and returns
-// false.
-bool ParseInt32(const std::string& src_text, const char* str, int32_t* value);
 
-// Parses a bool/Int32/string from the environment variable
-// corresponding to the given Google Test flag.
+// Parses a bool from the environment variable
+// corresponding to the given flag.
+//
+// If the variable exists, returns IsTruthyFlagValue() value;  if not,
+// returns the given default value.
 bool BoolFromEnv(const char* flag, bool default_val);
+
+// Parses an Int32 from the environment variable
+// corresponding to the given flag.
+//
+// If the variable exists and ParseInt32() accepts it, returns the parsed
+// value;  otherwise returns the given default value.
 int32_t Int32FromEnv(const char* flag, int32_t default_val);
+
+// Parses a double from the environment variable
+// corresponding to the given flag.
+//
+// If the variable exists and ParseDouble() accepts it, returns the parsed
+// value;  otherwise returns the given default value.
 double DoubleFromEnv(const char* flag, double default_val);
+
+// Parses a string from the environment variable
+// corresponding to the given flag.
+//
+// If variable exists, returns its value;  if not, returns
+// the given default value.
 const char* StringFromEnv(const char* flag, const char* default_val);
 
 // Parses a string for a bool flag, in the form of either
 // "--flag=value" or "--flag".
 //
-// In the former case, the value is taken as true as long as it does
-// not start with '0', 'f', or 'F'.
+// In the former case, the value is the result of IsTruthyFlagValue().
 //
 // In the latter case, the value is taken as true.
 //
@@ -71,6 +92,12 @@ bool ParseStringFlag(const char* str, const char* flag, std::string* value);
 // Returns true if the string matches the flag.
 bool IsFlag(const char* str, const char* flag);
 
+// Returns false if value is a single character that is '0', 'f', 'F', 'n',
+// 'N', or non-alphanumeric, or if value equals one of "no", "false", "off"
+// (case-insensitive). Returns true otherwise; as a special case, the empty
+// string is also treated as true.
+bool IsTruthyFlagValue(const std::string& value);
+
 }  // end namespace benchmark
 
 #endif  // BENCHMARK_COMMANDLINEFLAGS_H_
diff --git a/src/complexity.cc b/src/complexity.cc
index b42bd38..aeed67f 100644
--- a/src/complexity.cc
+++ b/src/complexity.cc
@@ -15,32 +15,37 @@
 // Source project : https://github.com/ismaelJimenez/cpp.leastsq
 // Adapted to be used with google benchmark
 
-#include "benchmark/benchmark_api.h"
+#include "benchmark/benchmark.h"
 
 #include <algorithm>
 #include <cmath>
 #include "check.h"
 #include "complexity.h"
-#include "stat.h"
 
 namespace benchmark {
 
 // Internal function to calculate the different scalability forms
 BigOFunc* FittingCurve(BigO complexity) {
+  static const double kLog2E = 1.44269504088896340736;
   switch (complexity) {
     case oN:
-      return [](int n) -> double { return n; };
+      return [](IterationCount n) -> double { return static_cast<double>(n); };
     case oNSquared:
-      return [](int n) -> double { return n * n; };
+      return [](IterationCount n) -> double { return std::pow(n, 2); };
     case oNCubed:
-      return [](int n) -> double { return n * n * n; };
+      return [](IterationCount n) -> double { return std::pow(n, 3); };
     case oLogN:
-      return [](int n) { return std::log2(n); };
+      /* Note: can't use log2 because Android's GNU STL lacks it */
+      return
+          [](IterationCount n) { return kLog2E * log(static_cast<double>(n)); };
     case oNLogN:
-      return [](int n) { return n * std::log2(n); };
+      /* Note: can't use log2 because Android's GNU STL lacks it */
+      return [](IterationCount n) {
+        return kLog2E * n * log(static_cast<double>(n));
+      };
     case o1:
     default:
-      return [](int) { return 1.0; };
+      return [](IterationCount) { return 1.0; };
   }
 }
 
@@ -66,15 +71,15 @@ std::string GetBigOString(BigO complexity) {
 
 // Find the coefficient for the high-order term in the running time, by
 // minimizing the sum of squares of relative error, for the fitting curve
-// given by the lambda expresion.
+// given by the lambda expression.
 //   - n             : Vector containing the size of the benchmark tests.
 //   - time          : Vector containing the times for the benchmark tests.
-//   - fitting_curve : lambda expresion (e.g. [](int n) {return n; };).
+//   - fitting_curve : lambda expression (e.g. [](int64_t n) {return n; };).
 
-// For a deeper explanation on the algorithm logic, look the README file at
-// http://github.com/ismaelJimenez/Minimal-Cpp-Least-Squared-Fit
+// For a deeper explanation on the algorithm logic, please refer to
+// https://en.wikipedia.org/wiki/Least_squares#Least_squares,_regression_analysis_and_statistics
 
-LeastSq MinimalLeastSq(const std::vector<int>& n,
+LeastSq MinimalLeastSq(const std::vector<int64_t>& n,
                        const std::vector<double>& time,
                        BigOFunc* fitting_curve) {
   double sigma_gn = 0.0;
@@ -118,9 +123,8 @@ LeastSq MinimalLeastSq(const std::vector<int>& n,
 //   - complexity : If different than oAuto, the fitting curve will stick to
 //                  this one. If it is oAuto, it will be calculated the best
 //                  fitting curve.
-LeastSq MinimalLeastSq(const std::vector<int>& n,
-                       const std::vector<double>& time,
-                       const BigO complexity) {
+LeastSq MinimalLeastSq(const std::vector<int64_t>& n,
+                       const std::vector<double>& time, const BigO complexity) {
   CHECK_EQ(n.size(), time.size());
   CHECK_GE(n.size(), 2);  // Do not compute fitting curve is less than two
                           // benchmark runs are given
@@ -151,75 +155,6 @@ LeastSq MinimalLeastSq(const std::vector<int>& n,
   return best_fit;
 }
 
-std::vector<BenchmarkReporter::Run> ComputeStats(
-    const std::vector<BenchmarkReporter::Run>& reports) {
-  typedef BenchmarkReporter::Run Run;
-  std::vector<Run> results;
-
-  auto error_count =
-      std::count_if(reports.begin(), reports.end(),
-                    [](Run const& run) { return run.error_occurred; });
-
-  if (reports.size() - error_count < 2) {
-    // We don't report aggregated data if there was a single run.
-    return results;
-  }
-  // Accumulators.
-  Stat1_d real_accumulated_time_stat;
-  Stat1_d cpu_accumulated_time_stat;
-  Stat1_d bytes_per_second_stat;
-  Stat1_d items_per_second_stat;
-  // All repetitions should be run with the same number of iterations so we
-  // can take this information from the first benchmark.
-  int64_t const run_iterations = reports.front().iterations;
-
-  // Populate the accumulators.
-  for (Run const& run : reports) {
-    CHECK_EQ(reports[0].benchmark_name, run.benchmark_name);
-    CHECK_EQ(run_iterations, run.iterations);
-    if (run.error_occurred) continue;
-    real_accumulated_time_stat +=
-        Stat1_d(run.real_accumulated_time / run.iterations, run.iterations);
-    cpu_accumulated_time_stat +=
-        Stat1_d(run.cpu_accumulated_time / run.iterations, run.iterations);
-    items_per_second_stat += Stat1_d(run.items_per_second, run.iterations);
-    bytes_per_second_stat += Stat1_d(run.bytes_per_second, run.iterations);
-  }
-
-  // Get the data from the accumulator to BenchmarkReporter::Run's.
-  Run mean_data;
-  mean_data.benchmark_name = reports[0].benchmark_name + "_mean";
-  mean_data.iterations = run_iterations;
-  mean_data.real_accumulated_time =
-      real_accumulated_time_stat.Mean() * run_iterations;
-  mean_data.cpu_accumulated_time =
-      cpu_accumulated_time_stat.Mean() * run_iterations;
-  mean_data.bytes_per_second = bytes_per_second_stat.Mean();
-  mean_data.items_per_second = items_per_second_stat.Mean();
-
-  // Only add label to mean/stddev if it is same for all runs
-  mean_data.report_label = reports[0].report_label;
-  for (std::size_t i = 1; i < reports.size(); i++) {
-    if (reports[i].report_label != reports[0].report_label) {
-      mean_data.report_label = "";
-      break;
-    }
-  }
-
-  Run stddev_data;
-  stddev_data.benchmark_name = reports[0].benchmark_name + "_stddev";
-  stddev_data.report_label = mean_data.report_label;
-  stddev_data.iterations = 0;
-  stddev_data.real_accumulated_time = real_accumulated_time_stat.StdDev();
-  stddev_data.cpu_accumulated_time = cpu_accumulated_time_stat.StdDev();
-  stddev_data.bytes_per_second = bytes_per_second_stat.StdDev();
-  stddev_data.items_per_second = items_per_second_stat.StdDev();
-
-  results.push_back(mean_data);
-  results.push_back(stddev_data);
-  return results;
-}
-
 std::vector<BenchmarkReporter::Run> ComputeBigO(
     const std::vector<BenchmarkReporter::Run>& reports) {
   typedef BenchmarkReporter::Run Run;
@@ -228,7 +163,7 @@ std::vector<BenchmarkReporter::Run> ComputeBigO(
   if (reports.size() < 2) return results;
 
   // Accumulators.
-  std::vector<int> n;
+  std::vector<int64_t> n;
   std::vector<double> real_time;
   std::vector<double> cpu_time;
 
@@ -250,30 +185,50 @@ std::vector<BenchmarkReporter::Run> ComputeBigO(
     result_cpu = MinimalLeastSq(n, cpu_time, reports[0].complexity);
     result_real = MinimalLeastSq(n, real_time, result_cpu.complexity);
   }
-  std::string benchmark_name =
-      reports[0].benchmark_name.substr(0, reports[0].benchmark_name.find('/'));
+
+  // Drop the 'args' when reporting complexity.
+  auto run_name = reports[0].run_name;
+  run_name.args.clear();
 
   // Get the data from the accumulator to BenchmarkReporter::Run's.
   Run big_o;
-  big_o.benchmark_name = benchmark_name + "_BigO";
+  big_o.run_name = run_name;
+  big_o.run_type = BenchmarkReporter::Run::RT_Aggregate;
+  big_o.repetitions = reports[0].repetitions;
+  big_o.repetition_index = Run::no_repetition_index;
+  big_o.threads = reports[0].threads;
+  big_o.aggregate_name = "BigO";
+  big_o.report_label = reports[0].report_label;
   big_o.iterations = 0;
   big_o.real_accumulated_time = result_real.coef;
   big_o.cpu_accumulated_time = result_cpu.coef;
   big_o.report_big_o = true;
   big_o.complexity = result_cpu.complexity;
 
+  // All the time results are reported after being multiplied by the
+  // time unit multiplier. But since RMS is a relative quantity it
+  // should not be multiplied at all. So, here, we _divide_ it by the
+  // multiplier so that when it is multiplied later the result is the
+  // correct one.
   double multiplier = GetTimeUnitMultiplier(reports[0].time_unit);
 
   // Only add label to mean/stddev if it is same for all runs
   Run rms;
-  big_o.report_label = reports[0].report_label;
-  rms.benchmark_name = benchmark_name + "_RMS";
+  rms.run_name = run_name;
+  rms.run_type = BenchmarkReporter::Run::RT_Aggregate;
+  rms.aggregate_name = "RMS";
   rms.report_label = big_o.report_label;
   rms.iterations = 0;
+  rms.repetition_index = Run::no_repetition_index;
+  rms.repetitions = reports[0].repetitions;
+  rms.threads = reports[0].threads;
   rms.real_accumulated_time = result_real.rms / multiplier;
   rms.cpu_accumulated_time = result_cpu.rms / multiplier;
   rms.report_rms = true;
   rms.complexity = result_cpu.complexity;
+  // don't forget to keep the time unit, or we won't be able to
+  // recover the correct value.
+  rms.time_unit = reports[0].time_unit;
 
   results.push_back(big_o);
   results.push_back(rms);
diff --git a/src/complexity.h b/src/complexity.h
index 85cc125..df29b48 100644
--- a/src/complexity.h
+++ b/src/complexity.h
@@ -21,17 +21,10 @@
 #include <string>
 #include <vector>
 
-#include "benchmark/benchmark_api.h"
-#include "benchmark/reporter.h"
+#include "benchmark/benchmark.h"
 
 namespace benchmark {
 
-// Return a vector containing the mean and standard devation information for
-// the specified list of reports. If 'reports' contains less than two
-// non-errored runs an empty vector is returned
-std::vector<BenchmarkReporter::Run> ComputeStats(
-    const std::vector<BenchmarkReporter::Run>& reports);
-
 // Return a vector containing the bigO and RMS information for the specified
 // list of reports. If 'reports.size() < 2' an empty vector is returned.
 std::vector<BenchmarkReporter::Run> ComputeBigO(
@@ -47,10 +40,7 @@ std::vector<BenchmarkReporter::Run> ComputeBigO(
 //                   parameter will return the best fitting curve detected.
 
 struct LeastSq {
-  LeastSq() :
-    coef(0.0),
-    rms(0.0),
-    complexity(oNone) {}
+  LeastSq() : coef(0.0), rms(0.0), complexity(oNone) {}
 
   double coef;
   double rms;
@@ -60,5 +50,6 @@ struct LeastSq {
 // Function to return an string for the calculated complexity
 std::string GetBigOString(BigO complexity);
 
-} // end namespace benchmark
-#endif // COMPLEXITY_H_
+}  // end namespace benchmark
+
+#endif  // COMPLEXITY_H_
diff --git a/src/console_reporter.cc b/src/console_reporter.cc
index 080c324..6fd7645 100644
--- a/src/console_reporter.cc
+++ b/src/console_reporter.cc
@@ -12,113 +12,166 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "benchmark/reporter.h"
-#include "complexity.h"
-
 #include <algorithm>
 #include <cstdint>
 #include <cstdio>
+#include <cstring>
 #include <iostream>
 #include <string>
 #include <tuple>
 #include <vector>
 
+#include "benchmark/benchmark.h"
 #include "check.h"
 #include "colorprint.h"
 #include "commandlineflags.h"
+#include "complexity.h"
+#include "counter.h"
 #include "internal_macros.h"
 #include "string_util.h"
-#include "walltime.h"
-
-DECLARE_bool(color_print);
+#include "timers.h"
 
 namespace benchmark {
 
 bool ConsoleReporter::ReportContext(const Context& context) {
   name_field_width_ = context.name_field_width;
+  printed_header_ = false;
+  prev_counters_.clear();
 
   PrintBasicContext(&GetErrorStream(), context);
 
 #ifdef BENCHMARK_OS_WINDOWS
-  if (FLAGS_color_print && &std::cout != &GetOutputStream()) {
-      GetErrorStream() << "Color printing is only supported for stdout on windows."
-                          " Disabling color printing\n";
-      FLAGS_color_print = false;
+  if ((output_options_ & OO_Color) && &std::cout != &GetOutputStream()) {
+    GetErrorStream()
+        << "Color printing is only supported for stdout on windows."
+           " Disabling color printing\n";
+    output_options_ = static_cast< OutputOptions >(output_options_ & ~OO_Color);
   }
 #endif
-  std::string str = FormatString("%-*s %13s %13s %10s\n",
-                             static_cast<int>(name_field_width_), "Benchmark",
-                             "Time", "CPU", "Iterations");
-  GetOutputStream() << str << std::string(str.length() - 1, '-') << "\n";
 
   return true;
 }
 
+void ConsoleReporter::PrintHeader(const Run& run) {
+  // Fixed columns first; user-counter columns (if any) are appended after.
+  std::string header =
+      FormatString("%-*s %13s %15s %12s", static_cast<int>(name_field_width_),
+                   "Benchmark", "Time", "CPU", "Iterations");
+  if (output_options_ & OO_Tabular) {
+    // Tabular output: one right-aligned column per counter name.
+    for (auto const& counter : run.counters)
+      header += FormatString(" %10s", counter.first.c_str());
+  } else if (!run.counters.empty()) {
+    header += " UserCounters...";
+  }
+  const std::string rule(header.length(), '-');
+  GetOutputStream() << rule << "\n" << header << "\n" << rule << "\n";
+}
+
 void ConsoleReporter::ReportRuns(const std::vector<Run>& reports) {
-  for (const auto& run : reports)
+  for (const auto& run : reports) {
+    // print the header:
+    // --- if none was printed yet
+    bool print_header = !printed_header_;
+    // --- or if the format is tabular and this run
+    //     has different fields from the prev header
+    print_header |= (output_options_ & OO_Tabular) &&
+                    (!internal::SameNames(run.counters, prev_counters_));
+    if (print_header) {
+      printed_header_ = true;
+      prev_counters_ = run.counters;
+      PrintHeader(run);
+    }
+    // As an alternative to printing the headers like this, we could sort
+    // the benchmarks by header and then print. But this would require
+    // waiting for the full results before printing, or printing twice.
     PrintRunData(run);
+  }
+}
+
+static void IgnoreColorPrint(std::ostream& out, LogColor, const char* fmt,
+                             ...) {  // The LogColor argument is unused.
+  va_list args;
+  va_start(args, fmt);
+  out << FormatString(fmt, args);  // Write the formatted text to out as-is.
+  va_end(args);
+}
+
+
+static std::string FormatTime(double time) {
+  // Align decimal places: larger values get fewer fraction digits.
+  if (time < 1.0) {
+    return FormatString("%10.3f", time);  // below 1: three decimals
+  }
+  if (time < 10.0) {
+    return FormatString("%10.2f", time);  // [1, 10): two decimals
+  }
+  if (time < 100.0) {
+    return FormatString("%10.1f", time);  // [10, 100): one decimal
+  }
+  return FormatString("%10.0f", time);  // everything else: no decimals
 }
 
 void ConsoleReporter::PrintRunData(const Run& result) {
+  typedef void(PrinterFn)(std::ostream&, LogColor, const char*, ...);
   auto& Out = GetOutputStream();
-
+  PrinterFn* printer = (output_options_ & OO_Color) ?
+                         (PrinterFn*)ColorPrintf : IgnoreColorPrint;
   auto name_color =
       (result.report_big_o || result.report_rms) ? COLOR_BLUE : COLOR_GREEN;
-  ColorPrintf(Out, name_color, "%-*s ", name_field_width_,
-              result.benchmark_name.c_str());
+  printer(Out, name_color, "%-*s ", static_cast<int>(name_field_width_),
+          result.benchmark_name().c_str());  // '*' consumes an int width.
 
   if (result.error_occurred) {
-    ColorPrintf(Out, COLOR_RED, "ERROR OCCURRED: \'%s\'",
-                result.error_message.c_str());
-    ColorPrintf(Out, COLOR_DEFAULT, "\n");
+    printer(Out, COLOR_RED, "ERROR OCCURRED: \'%s\'",
+            result.error_message.c_str());
+    printer(Out, COLOR_DEFAULT, "\n");
     return;
   }
-  // Format bytes per second
-  std::string rate;
-  if (result.bytes_per_second > 0) {
-    rate = StrCat(" ", HumanReadableNumber(result.bytes_per_second), "B/s");
-  }
-
-  // Format items per second
-  std::string items;
-  if (result.items_per_second > 0) {
-    items = StrCat(" ", HumanReadableNumber(result.items_per_second),
-                   " items/s");
- }
 
   const double real_time = result.GetAdjustedRealTime();
   const double cpu_time = result.GetAdjustedCPUTime();
+  const std::string real_time_str = FormatTime(real_time);
+  const std::string cpu_time_str = FormatTime(cpu_time);
+  // The *_str values are only printed for ordinary (non-BigO, non-RMS) runs.
 
   if (result.report_big_o) {
     std::string big_o = GetBigOString(result.complexity);
-    ColorPrintf(Out, COLOR_YELLOW, "%10.2f %s %10.2f %s ", real_time,
-                big_o.c_str(), cpu_time, big_o.c_str());
+    printer(Out, COLOR_YELLOW, "%10.2f %-4s %10.2f %-4s ", real_time, big_o.c_str(),
+            cpu_time, big_o.c_str());
   } else if (result.report_rms) {
-    ColorPrintf(Out, COLOR_YELLOW, "%10.0f %% %10.0f %% ", real_time * 100,
-                cpu_time * 100);
+    printer(Out, COLOR_YELLOW, "%10.0f %-4s %10.0f %-4s ", real_time * 100, "%",
+            cpu_time * 100, "%");
   } else {
     const char* timeLabel = GetTimeUnitString(result.time_unit);
-    ColorPrintf(Out, COLOR_YELLOW, "%10.0f %s %10.0f %s ", real_time, timeLabel,
-                cpu_time, timeLabel);
+    printer(Out, COLOR_YELLOW, "%s %-4s %s %-4s ", real_time_str.c_str(), timeLabel,
+            cpu_time_str.c_str(), timeLabel);
   }
 
   if (!result.report_big_o && !result.report_rms) {
-    ColorPrintf(Out, COLOR_CYAN, "%10lld", result.iterations);
-  }
-
-  if (!rate.empty()) {
-    ColorPrintf(Out, COLOR_DEFAULT, " %*s", 13, rate.c_str());
+    printer(Out, COLOR_CYAN, "%10lld", static_cast<long long>(result.iterations));
   }
 
-  if (!items.empty()) {
-    ColorPrintf(Out, COLOR_DEFAULT, " %*s", 18, items.c_str());
+  for (auto& c : result.counters) {
+    const std::size_t cNameLen = std::max(std::string::size_type(10),
+                                          c.first.length());
+    auto const& s = HumanReadableNumber(c.second.value, c.second.oneK);
+    const char* unit = "";
+    if (c.second.flags & Counter::kIsRate)
+      unit = (c.second.flags & Counter::kInvert) ? "s" : "/s";
+    if (output_options_ & OO_Tabular) {  // '*' width must be passed as int.
+      printer(Out, COLOR_DEFAULT, " %*s%s",
+              static_cast<int>(cNameLen - strlen(unit)), s.c_str(), unit);
+    } else {
+      printer(Out, COLOR_DEFAULT, " %s=%s%s", c.first.c_str(), s.c_str(), unit);
+    }
   }
 
   if (!result.report_label.empty()) {
-    ColorPrintf(Out, COLOR_DEFAULT, " %s", result.report_label.c_str());
+    printer(Out, COLOR_DEFAULT, " %s", result.report_label.c_str());
   }
 
-  ColorPrintf(Out, COLOR_DEFAULT, "\n");
+  printer(Out, COLOR_DEFAULT, "\n");
 }
 
 }  // end namespace benchmark
diff --git a/src/counter.cc b/src/counter.cc
new file mode 100644
index 0000000..cf5b78e
--- /dev/null
+++ b/src/counter.cc
@@ -0,0 +1,80 @@
+// Copyright 2015 Google Inc. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "counter.h"
+
+namespace benchmark {
+namespace internal {
+
+double Finish(Counter const& c, IterationCount iterations, double cpu_time,
+              double num_threads) {
+  double v = c.value;  // Raw value; each flag set in c.flags adjusts it below.
+  if (c.flags & Counter::kIsRate) {
+    v /= cpu_time;  // Rate: divide by cpu_time.
+  }
+  if (c.flags & Counter::kAvgThreads) {
+    v /= num_threads;  // Average over num_threads.
+  }
+  if (c.flags & Counter::kIsIterationInvariant) {
+    v *= iterations;  // Scale by the iteration count.
+  }
+  if (c.flags & Counter::kAvgIterations) {
+    v /= iterations;  // Average over the iteration count.
+  }
+
+  if (c.flags & Counter::kInvert) {  // Invert is *always* last.
+    v = 1.0 / v;
+  }
+  return v;
+}
+
+void Finish(UserCounters* l, IterationCount iterations, double cpu_time,
+            double num_threads) {
+  for (auto& c : *l) {  // Replace each counter's value with its finished one.
+    c.second.value = Finish(c.second, iterations, cpu_time, num_threads);
+  }
+}
+
+void Increment(UserCounters* l, UserCounters const& r) {
+  // Merge r into *l. Every name is handled independently of the others, so
+  // a single pass over r is enough:
+  //   - name already in *l: the sum of the two counters is stored in the
+  //     value of l's counter;
+  //   - name only in r: the counter is copied into *l.
+  // Counters that exist only in *l are left untouched.
+  for (auto const& rhs : r) {
+    auto found = l->find(rhs.first);
+    if (found == l->end()) {
+      (*l)[rhs.first] = rhs.second;
+    } else {
+      found->second.value = found->second + rhs.second;
+    }
+  }
+}
+
+bool SameNames(UserCounters const& l, UserCounters const& r) {
+  // The same object trivially has the same names.
+  if (&l == &r) return true;
+  // Containers of different sizes are reported as not matching.
+  if (l.size() != r.size()) return false;
+  // Same size: every name in l must also be found in r.
+  for (auto const& entry : l) {
+    const bool missing = r.find(entry.first) == r.end();
+    if (missing) return false;
+  }
+  return true;
+}
+
+}  // end namespace internal
+}  // end namespace benchmark
diff --git a/src/re_std.cc b/src/counter.h
index cfd7a21..1f5a58e 100644
--- a/src/re_std.cc
+++ b/src/counter.h
@@ -12,33 +12,21 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "re.h"
+#ifndef BENCHMARK_COUNTER_H_
+#define BENCHMARK_COUNTER_H_
 
-namespace benchmark {
-
-Regex::Regex() : init_(false) { }
-
-bool Regex::Init(const std::string& spec, std::string* error) {
-  try {
-    re_ = std::regex(spec, std::regex_constants::extended);
-
-    init_ = true;
-  } catch (const std::regex_error& e) {
-    if (error) {
-      *error = e.what();
-    }
-  }
-  return init_;
-}
+#include "benchmark/benchmark.h"
 
-Regex::~Regex() { }
-
-bool Regex::Match(const std::string& str) {
-  if (!init_) {
-    return false;
-  }
+namespace benchmark {
 
-  return std::regex_search(str, re_);
-}
+// these counter-related functions are hidden to reduce API surface.
+namespace internal {
+void Finish(UserCounters* l, IterationCount iterations, double time,
+            double num_threads);
+void Increment(UserCounters* l, UserCounters const& r);
+bool SameNames(UserCounters const& l, UserCounters const& r);
+}  // end namespace internal
 
 }  // end namespace benchmark
+
+#endif  // BENCHMARK_COUNTER_H_
diff --git a/src/csv_reporter.cc b/src/csv_reporter.cc
index 7bc7ef3..af2c18f 100644
--- a/src/csv_reporter.cc
+++ b/src/csv_reporter.cc
@@ -12,7 +12,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "benchmark/reporter.h"
+#include "benchmark/benchmark.h"
 #include "complexity.h"
 
 #include <algorithm>
@@ -22,8 +22,9 @@
 #include <tuple>
 #include <vector>
 
+#include "check.h"
 #include "string_util.h"
-#include "walltime.h"
+#include "timers.h"
 
 // File format reference: http://edoceo.com/utilitas/csv-file-format.
 
@@ -31,51 +32,80 @@ namespace benchmark {
 
 namespace {
 std::vector<std::string> elements = {
-  "name",
-  "iterations",
-  "real_time",
-  "cpu_time",
-  "time_unit",
-  "bytes_per_second",
-  "items_per_second",
-  "label",
-  "error_occurred",
-  "error_message"
-};
+    "name",           "iterations",       "real_time",        "cpu_time",
+    "time_unit",      "bytes_per_second", "items_per_second", "label",
+    "error_occurred", "error_message"};
+}  // namespace
+
+std::string CsvEscape(const std::string & s) {
+  std::string tmp;
+  tmp.reserve(s.size() + 2);
+  for (char c : s) {
+    switch (c) {
+    case '"' : tmp += "\"\""; break;
+    default  : tmp += c; break;
+    }
+  }
+  return '"' + tmp + '"';
 }
 
 bool CSVReporter::ReportContext(const Context& context) {
   PrintBasicContext(&GetErrorStream(), context);
+  return true;
+}
 
+void CSVReporter::ReportRuns(const std::vector<Run>& reports) {
   std::ostream& Out = GetOutputStream();
-  for (auto B = elements.begin(); B != elements.end(); ) {
-    Out << *B++;
-    if (B != elements.end())
-      Out << ",";
+
+  if (!printed_header_) {
+    // save the names of all the user counters
+    for (const auto& run : reports) {
+      for (const auto& cnt : run.counters) {
+        if (cnt.first == "bytes_per_second" || cnt.first == "items_per_second")
+          continue;
+        user_counter_names_.insert(cnt.first);
+      }
+    }
+
+    // print the header; counter names are CSV-escaped like data fields
+    for (auto B = elements.begin(); B != elements.end();) {
+      Out << *B++;
+      if (B != elements.end()) Out << ",";
+    }
+    for (auto B = user_counter_names_.begin();
+         B != user_counter_names_.end();) {
+      Out << "," << CsvEscape(*B++);
+    }
+    Out << "\n";
+
+    printed_header_ = true;
+  } else {
+    // check that all the current counters are saved in the name set
+    for (const auto& run : reports) {
+      for (const auto& cnt : run.counters) {
+        if (cnt.first == "bytes_per_second" || cnt.first == "items_per_second")
+          continue;
+        CHECK(user_counter_names_.find(cnt.first) != user_counter_names_.end())
+            << "All counters must be present in each run. "
+            << "Counter named \"" << cnt.first
+            << "\" was not in a run after being added to the header";
+      }
+    }
   }
-  Out << "\n";
-  return true;
-}
 
-void CSVReporter::ReportRuns(const std::vector<Run> & reports) {
-  for (const auto& run : reports)
+  // print results for each run
+  for (const auto& run : reports) {
     PrintRunData(run);
+  }
 }
 
-void CSVReporter::PrintRunData(const Run & run) {
+void CSVReporter::PrintRunData(const Run& run) {
   std::ostream& Out = GetOutputStream();
-
-  // Field with embedded double-quote characters must be doubled and the field
-  // delimited with double-quotes.
-  std::string name = run.benchmark_name;
-  ReplaceAll(&name, "\"", "\"\"");
-  Out << '"' << name << "\",";
+  Out << CsvEscape(run.benchmark_name()) << ",";
   if (run.error_occurred) {
     Out << std::string(elements.size() - 3, ',');
     Out << "true,";
-    std::string msg = run.error_message;
-    ReplaceAll(&msg, "\"", "\"\"");
-    Out << '"' << msg << "\"\n";
+    Out << CsvEscape(run.error_message) << "\n";
     return;
   }
 
@@ -96,22 +126,28 @@ void CSVReporter::PrintRunData(const Run & run) {
   }
   Out << ",";
 
-  if (run.bytes_per_second > 0.0) {
-    Out << run.bytes_per_second;
+  if (run.counters.find("bytes_per_second") != run.counters.end()) {
+    Out << run.counters.at("bytes_per_second");
   }
   Out << ",";
-  if (run.items_per_second > 0.0) {
-    Out << run.items_per_second;
+  if (run.counters.find("items_per_second") != run.counters.end()) {
+    Out << run.counters.at("items_per_second");
   }
   Out << ",";
   if (!run.report_label.empty()) {
-    // Field with embedded double-quote characters must be doubled and the field
-    // delimited with double-quotes.
-    std::string label = run.report_label;
-    ReplaceAll(&label, "\"", "\"\"");
-    Out << "\"" << label << "\"";
+    Out << CsvEscape(run.report_label);
   }
   Out << ",,";  // for error_occurred and error_message
+
+  // Print user counters
+  for (const auto& ucn : user_counter_names_) {
+    auto it = run.counters.find(ucn);
+    if (it == run.counters.end()) {
+      Out << ",";
+    } else {
+      Out << "," << it->second;
+    }
+  }
   Out << '\n';
 }
 
diff --git a/src/cycleclock.h b/src/cycleclock.h
index e4825d4..6843b69 100644
--- a/src/cycleclock.h
+++ b/src/cycleclock.h
@@ -23,7 +23,7 @@
 
 #include <cstdint>
 
-#include "benchmark/macros.h"
+#include "benchmark/benchmark.h"
 #include "internal_macros.h"
 
 #if defined(BENCHMARK_OS_MACOSX)
@@ -36,13 +36,18 @@
 // declarations of some other intrinsics, breaking compilation.
 // Therefore, we simply declare __rdtsc ourselves. See also
 // http://connect.microsoft.com/VisualStudio/feedback/details/262047
-#if defined(COMPILER_MSVC) && !defined(_M_IX86)
+#if defined(COMPILER_MSVC) && !defined(_M_IX86) && !defined(_M_ARM64)
 extern "C" uint64_t __rdtsc();
 #pragma intrinsic(__rdtsc)
 #endif
 
-#ifndef BENCHMARK_OS_WINDOWS
+#if !defined(BENCHMARK_OS_WINDOWS) || defined(BENCHMARK_OS_MINGW)
 #include <sys/time.h>
+#include <time.h>
+#endif
+
+#ifdef BENCHMARK_OS_EMSCRIPTEN
+#include <emscripten.h>
 #endif
 
 namespace benchmark {
@@ -65,6 +70,10 @@ inline BENCHMARK_ALWAYS_INLINE int64_t Now() {
   // counter pauses; it does not continue counting, nor does it
   // reset to zero.
   return mach_absolute_time();
+#elif defined(BENCHMARK_OS_EMSCRIPTEN)
+  // this goes above x86-specific code because old versions of Emscripten
+  // define __x86_64__, although they have nothing to do with it.
+  return static_cast<int64_t>(emscripten_get_now() * 1e+6);
 #elif defined(__i386__)
   int64_t ret;
   __asm__ volatile("rdtsc" : "=A"(ret));
@@ -75,13 +84,21 @@ inline BENCHMARK_ALWAYS_INLINE int64_t Now() {
   return (high << 32) | low;
 #elif defined(__powerpc__) || defined(__ppc__)
   // This returns a time-base, which is not always precisely a cycle-count.
-  int64_t tbl, tbu0, tbu1;
-  asm("mftbu %0" : "=r"(tbu0));
-  asm("mftb  %0" : "=r"(tbl));
-  asm("mftbu %0" : "=r"(tbu1));
-  tbl &= -static_cast<int64>(tbu0 == tbu1);
-  // high 32 bits in tbu1; low 32 bits in tbl  (tbu0 is garbage)
-  return (tbu1 << 32) | tbl;
+#if defined(__powerpc64__) || defined(__ppc64__)
+  int64_t tb;
+  asm volatile("mfspr %0, 268" : "=r"(tb));
+  return tb;
+#else
+  uint32_t tbl, tbu0, tbu1;
+  asm volatile(
+      "mftbu %0\n"
+      "mftb %1\n"
+      "mftbu %2"
+      : "=r"(tbu0), "=r"(tbl), "=r"(tbu1));
+  tbl &= -static_cast<int32_t>(tbu0 == tbu1);
+  // high 32 bits in tbu1; low 32 bits in tbl  (tbu0 is no longer needed)
+  return (static_cast<uint64_t>(tbu1) << 32) | tbl;
+#endif
 #elif defined(__sparc__)
   int64_t tick;
   asm(".byte 0x83, 0x41, 0x00, 0x00");
@@ -97,8 +114,30 @@ inline BENCHMARK_ALWAYS_INLINE int64_t Now() {
   // when I know it will work.  Otherwise, I'll use __rdtsc and hope
   // the code is being compiled with a non-ancient compiler.
   _asm rdtsc
+#elif defined(COMPILER_MSVC) && defined(_M_ARM64)
+  // See https://docs.microsoft.com/en-us/cpp/intrinsics/arm64-intrinsics?view=vs-2019
+  // and https://reviews.llvm.org/D53115
+  int64_t virtual_timer_value;
+  virtual_timer_value = _ReadStatusReg(ARM64_CNTVCT);
+  return virtual_timer_value;
 #elif defined(COMPILER_MSVC)
   return __rdtsc();
+#elif defined(BENCHMARK_OS_NACL)
+  // Native Client validator on x86/x86-64 allows RDTSC instructions,
+  // and this case is handled above. Native Client validator on ARM
+  // rejects MRC instructions (used in the ARM-specific sequence below),
+  // so we handle it here. Portable Native Client compiles to
+  // architecture-agnostic bytecode, which doesn't provide any
+  // cycle counter access mnemonics.
+
+  // Native Client does not provide any API to access cycle counter.
+  // Use clock_gettime(CLOCK_MONOTONIC, ...) instead of gettimeofday
+  // because it provides nanosecond resolution (which is noticeable at
+  // least for PNaCl modules running on x86 Mac & Linux).
+  // Initialize to always return 0 if clock_gettime fails.
+  struct timespec ts = {0, 0};
+  clock_gettime(CLOCK_MONOTONIC, &ts);
+  return static_cast<int64_t>(ts.tv_sec) * 1000000000 + ts.tv_nsec;
 #elif defined(__aarch64__)
   // System timer of ARMv8 runs at a different frequency than the CPU's.
   // The frequency is fixed, typically in the range 1-50MHz.  It can be
@@ -108,7 +147,9 @@ inline BENCHMARK_ALWAYS_INLINE int64_t Now() {
   asm volatile("mrs %0, cntvct_el0" : "=r"(virtual_timer_value));
   return virtual_timer_value;
 #elif defined(__ARM_ARCH)
-#if (__ARM_ARCH >= 6)  // V6 is the earliest arch that has a standard cyclecount
+  // V6 is the earliest arch that has a standard cyclecount
+  // Native Client validator doesn't allow MRC instructions.
+#if (__ARM_ARCH >= 6)
   uint32_t pmccntr;
   uint32_t pmuseren;
   uint32_t pmcntenset;
@@ -126,12 +167,43 @@ inline BENCHMARK_ALWAYS_INLINE int64_t Now() {
   struct timeval tv;
   gettimeofday(&tv, nullptr);
   return static_cast<int64_t>(tv.tv_sec) * 1000000 + tv.tv_usec;
-#elif defined(__mips__)
+#elif defined(__mips__) || defined(__m68k__)
   // mips apparently only allows rdtsc for superusers, so we fall
   // back to gettimeofday.  It's possible clock_gettime would be better.
   struct timeval tv;
   gettimeofday(&tv, nullptr);
   return static_cast<int64_t>(tv.tv_sec) * 1000000 + tv.tv_usec;
+#elif defined(__s390__)  // Covers both s390 and s390x.
+  // Return the CPU clock.
+  uint64_t tsc;
+#if defined(BENCHMARK_OS_ZOS) && defined(COMPILER_IBMXL)
+  // z/OS XL compiler HLASM syntax.
+  asm(" stck %0" : "=m"(tsc) : : "cc");
+#else
+  asm("stck %0" : "=Q"(tsc) : : "cc");
+#endif
+  return tsc;
+#elif defined(__riscv) // RISC-V
+  // Use RDCYCLE (and RDCYCLEH on riscv32)
+#if __riscv_xlen == 32
+  uint32_t cycles_lo, cycles_hi0, cycles_hi1;
+  // This asm also includes the PowerPC overflow handling strategy, as above.
+  // Implemented in assembly because Clang insisted on branching.
+  asm volatile(
+      "rdcycleh %0\n"
+      "rdcycle %1\n"
+      "rdcycleh %2\n"
+      "sub %0, %0, %2\n"
+      "seqz %0, %0\n"
+      "sub %0, zero, %0\n"
+      "and %1, %1, %0\n"
+      : "=r"(cycles_hi0), "=r"(cycles_lo), "=r"(cycles_hi1));
+  return (static_cast<uint64_t>(cycles_hi1) << 32) | cycles_lo;
+#else
+  uint64_t cycles;
+  asm volatile("rdcycle %0" : "=r"(cycles));
+  return cycles;
+#endif
 #else
 // The soft failover to a generic implementation is automatic only for ARM.
 // For other platforms the developer is expected to make an attempt to create
diff --git a/src/internal_macros.h b/src/internal_macros.h
index 1080ac9..91f367b 100644
--- a/src/internal_macros.h
+++ b/src/internal_macros.h
@@ -1,40 +1,102 @@
 #ifndef BENCHMARK_INTERNAL_MACROS_H_
 #define BENCHMARK_INTERNAL_MACROS_H_
 
-#include "benchmark/macros.h"
+#include "benchmark/benchmark.h"
+
+/* Needed to detect STL */
+#include <cstdlib>
+
+// clang-format off
 
 #ifndef __has_feature
-# define __has_feature(x) 0
+#define __has_feature(x) 0
+#endif
+
+#if defined(__clang__)
+  #if defined(__ibmxl__)
+    #if !defined(COMPILER_IBMXL)
+      #define COMPILER_IBMXL
+    #endif
+  #elif !defined(COMPILER_CLANG)
+    #define COMPILER_CLANG
+  #endif
+#elif defined(_MSC_VER)
+  #if !defined(COMPILER_MSVC)
+    #define COMPILER_MSVC
+  #endif
+#elif defined(__GNUC__)
+  #if !defined(COMPILER_GCC)
+    #define COMPILER_GCC
+  #endif
 #endif
 
 #if __has_feature(cxx_attributes)
-# define BENCHMARK_NORETURN [[noreturn]]
+  #define BENCHMARK_NORETURN [[noreturn]]
 #elif defined(__GNUC__)
-# define BENCHMARK_NORETURN __attribute__((noreturn))
+  #define BENCHMARK_NORETURN __attribute__((noreturn))
+#elif defined(COMPILER_MSVC)
+  #define BENCHMARK_NORETURN __declspec(noreturn)
 #else
-# define BENCHMARK_NORETURN
+  #define BENCHMARK_NORETURN
 #endif
 
 #if defined(__CYGWIN__)
-# define BENCHMARK_OS_CYGWIN 1
+  #define BENCHMARK_OS_CYGWIN 1
 #elif defined(_WIN32)
-# define BENCHMARK_OS_WINDOWS 1
+  #define BENCHMARK_OS_WINDOWS 1
+  #if defined(__MINGW32__)
+    #define BENCHMARK_OS_MINGW 1
+  #endif
 #elif defined(__APPLE__)
-// TODO(ericwf) This doesn't actually check that it is a Mac OSX system. Just
-// that it is an apple system.
-# define BENCHMARK_OS_MACOSX 1
+  #define BENCHMARK_OS_APPLE 1
+  #include "TargetConditionals.h"
+  #if defined(TARGET_OS_MAC)
+    #define BENCHMARK_OS_MACOSX 1
+    #if defined(TARGET_OS_IPHONE)
+      #define BENCHMARK_OS_IOS 1
+    #endif
+  #endif
 #elif defined(__FreeBSD__)
-# define BENCHMARK_OS_FREEBSD 1
+  #define BENCHMARK_OS_FREEBSD 1
+#elif defined(__NetBSD__)
+  #define BENCHMARK_OS_NETBSD 1
+#elif defined(__OpenBSD__)
+  #define BENCHMARK_OS_OPENBSD 1
+#elif defined(__DragonFly__)
+  #define BENCHMARK_OS_DRAGONFLY 1
 #elif defined(__linux__)
-# define BENCHMARK_OS_LINUX 1
+  #define BENCHMARK_OS_LINUX 1
+#elif defined(__native_client__)
+  #define BENCHMARK_OS_NACL 1
+#elif defined(__EMSCRIPTEN__)
+  #define BENCHMARK_OS_EMSCRIPTEN 1
+#elif defined(__rtems__)
+  #define BENCHMARK_OS_RTEMS 1
+#elif defined(__Fuchsia__)
+#define BENCHMARK_OS_FUCHSIA 1
+#elif defined (__SVR4) && defined (__sun)
+#define BENCHMARK_OS_SOLARIS 1
+#elif defined(__QNX__)
+#define BENCHMARK_OS_QNX 1
+#elif defined(__MVS__)
+#define BENCHMARK_OS_ZOS 1
 #endif
 
-#if defined(__clang__)
-# define COMPILER_CLANG
-#elif defined(_MSC_VER)
-# define COMPILER_MSVC
-#elif defined(__GNUC__)
-# define COMPILER_GCC
+#if defined(__ANDROID__) && defined(__GLIBCXX__)
+#define BENCHMARK_STL_ANDROID_GNUSTL 1
+#endif
+
+#if !__has_feature(cxx_exceptions) && !defined(__cpp_exceptions) \
+     && !defined(__EXCEPTIONS)
+  #define BENCHMARK_HAS_NO_EXCEPTIONS
 #endif
 
-#endif // BENCHMARK_INTERNAL_MACROS_H_
+#if defined(COMPILER_CLANG) || defined(COMPILER_GCC)
+  #define BENCHMARK_MAYBE_UNUSED __attribute__((unused))
+#else
+  #define BENCHMARK_MAYBE_UNUSED
+#endif
+
+// clang-format on
+
+#endif  // BENCHMARK_INTERNAL_MACROS_H_
diff --git a/src/json_reporter.cc b/src/json_reporter.cc
index 485d305..959d245 100644
--- a/src/json_reporter.cc
+++ b/src/json_reporter.cc
@@ -12,46 +12,89 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "benchmark/reporter.h"
+#include "benchmark/benchmark.h"
 #include "complexity.h"
 
 #include <algorithm>
+#include <cmath>
 #include <cstdint>
+#include <iomanip>  // for setprecision
 #include <iostream>
+#include <limits>
 #include <string>
 #include <tuple>
 #include <vector>
 
 #include "string_util.h"
-#include "walltime.h"
+#include "timers.h"
 
 namespace benchmark {
 
 namespace {
 
+std::string StrEscape(const std::string & s) {
+  std::string tmp;
+  tmp.reserve(s.size());
+  for (char c : s) {
+    switch (c) {
+    case '\b': tmp += "\\b"; break;
+    case '\f': tmp += "\\f"; break;
+    case '\n': tmp += "\\n"; break;
+    case '\r': tmp += "\\r"; break;
+    case '\t': tmp += "\\t"; break;
+    case '\\': tmp += "\\\\"; break;
+    case '"' : tmp += "\\\""; break;
+    default  : tmp += c; break;
+    }
+  }
+  return tmp;
+}
+
 std::string FormatKV(std::string const& key, std::string const& value) {
-  return StringPrintF("\"%s\": \"%s\"", key.c_str(), value.c_str());
+  return StrFormat("\"%s\": \"%s\"", StrEscape(key).c_str(), StrEscape(value).c_str());
 }
 
 std::string FormatKV(std::string const& key, const char* value) {
-  return StringPrintF("\"%s\": \"%s\"", key.c_str(), value);
+  return StrFormat("\"%s\": \"%s\"", StrEscape(key).c_str(), StrEscape(value).c_str());
 }
 
 std::string FormatKV(std::string const& key, bool value) {
-  return StringPrintF("\"%s\": %s", key.c_str(), value ? "true" : "false");
+  return StrFormat("\"%s\": %s", StrEscape(key).c_str(), value ? "true" : "false");
 }
 
 std::string FormatKV(std::string const& key, int64_t value) {
   std::stringstream ss;
-  ss << '"' << key << "\": " << value;
+  ss << '"' << StrEscape(key) << "\": " << value;
   return ss.str();
 }
 
-int64_t RoundDouble(double v) {
-    return static_cast<int64_t>(v + 0.5);
+std::string FormatKV(std::string const& key, IterationCount value) {
+  std::stringstream ss;
+  ss << '"' << StrEscape(key) << "\": " << value;
+  return ss.str();
 }
 
-} // end namespace
+std::string FormatKV(std::string const& key, double value) {
+  std::stringstream ss;
+  ss << '"' << StrEscape(key) << "\": ";
+
+  if (std::isnan(value))
+    ss << (value < 0 ? "-" : "") << "NaN";
+  else if (std::isinf(value))
+    ss << (value < 0 ? "-" : "") << "Infinity";
+  else {
+    const auto max_digits10 =
+        std::numeric_limits<decltype(value)>::max_digits10;
+    const auto max_fractional_digits10 = max_digits10 - 1;
+    ss << std::scientific << std::setprecision(max_fractional_digits10)
+       << value;
+  }
+  return ss.str();
+}
+
+int64_t RoundDouble(double v) { return std::llround(v); }  // 64-bit everywhere
+
+}  // end namespace
 
 bool JSONReporter::ReportContext(const Context& context) {
   std::ostream& out = GetOutputStream();
@@ -66,15 +109,50 @@ bool JSONReporter::ReportContext(const Context& context) {
   std::string walltime_value = LocalDateTimeString();
   out << indent << FormatKV("date", walltime_value) << ",\n";
 
-  out << indent
-      << FormatKV("num_cpus", static_cast<int64_t>(context.num_cpus))
-      << ",\n";
-  out << indent
-      << FormatKV("mhz_per_cpu", RoundDouble(context.mhz_per_cpu))
+  out << indent << FormatKV("host_name", context.sys_info.name) << ",\n";
+
+  if (Context::executable_name) {
+    out << indent << FormatKV("executable", Context::executable_name) << ",\n";
+  }
+
+  CPUInfo const& info = context.cpu_info;
+  out << indent << FormatKV("num_cpus", static_cast<int64_t>(info.num_cpus))
       << ",\n";
   out << indent
-      << FormatKV("cpu_scaling_enabled", context.cpu_scaling_enabled)
+      << FormatKV("mhz_per_cpu",
+                  RoundDouble(info.cycles_per_second / 1000000.0))
       << ",\n";
+  if (CPUInfo::Scaling::UNKNOWN != info.scaling) {
+    out << indent << FormatKV("cpu_scaling_enabled", info.scaling == CPUInfo::Scaling::ENABLED ? true : false)
+        << ",\n";
+  }
+
+  out << indent << "\"caches\": [\n";
+  indent = std::string(6, ' ');
+  std::string cache_indent(8, ' ');
+  for (size_t i = 0; i < info.caches.size(); ++i) {
+    auto& CI = info.caches[i];
+    out << indent << "{\n";
+    out << cache_indent << FormatKV("type", CI.type) << ",\n";
+    out << cache_indent << FormatKV("level", static_cast<int64_t>(CI.level))
+        << ",\n";
+    out << cache_indent
+        << FormatKV("size", static_cast<int64_t>(CI.size)) << ",\n";
+    out << cache_indent
+        << FormatKV("num_sharing", static_cast<int64_t>(CI.num_sharing))
+        << "\n";
+    out << indent << "}";
+    if (i != info.caches.size() - 1) out << ",";
+    out << "\n";
+  }
+  indent = std::string(4, ' ');
+  out << indent << "],\n";
+  out << indent << "\"load_avg\": [";
+  for (auto it = info.load_avg.begin(); it != info.load_avg.end();) {
+    out << *it++;
+    if (it != info.load_avg.end()) out << ",";
+  }
+  out << "],\n";
 
 #if defined(NDEBUG)
   const char build_type[] = "release";
@@ -118,59 +196,58 @@ void JSONReporter::Finalize() {
 void JSONReporter::PrintRunData(Run const& run) {
   std::string indent(6, ' ');
   std::ostream& out = GetOutputStream();
-    out << indent
-        << FormatKV("name", run.benchmark_name)
-        << ",\n";
-    if (run.error_occurred) {
-        out << indent
-            << FormatKV("error_occurred", run.error_occurred)
-            << ",\n";
-        out << indent
-            << FormatKV("error_message", run.error_message)
-            << ",\n";
+  out << indent << FormatKV("name", run.benchmark_name()) << ",\n";
+  out << indent << FormatKV("run_name", run.run_name.str()) << ",\n";
+  out << indent << FormatKV("run_type", [&run]() -> const char* {
+    switch (run.run_type) {
+      case BenchmarkReporter::Run::RT_Iteration:
+        return "iteration";
+      case BenchmarkReporter::Run::RT_Aggregate:
+        return "aggregate";
     }
+    BENCHMARK_UNREACHABLE();
+  }()) << ",\n";
+  out << indent << FormatKV("repetitions", run.repetitions) << ",\n";
+  if (run.run_type != BenchmarkReporter::Run::RT_Aggregate) {
+    out << indent << FormatKV("repetition_index", run.repetition_index)
+        << ",\n";
+  }
+  out << indent << FormatKV("threads", run.threads) << ",\n";
+  if (run.run_type == BenchmarkReporter::Run::RT_Aggregate) {
+    out << indent << FormatKV("aggregate_name", run.aggregate_name) << ",\n";
+  }
+  if (run.error_occurred) {
+    out << indent << FormatKV("error_occurred", run.error_occurred) << ",\n";
+    out << indent << FormatKV("error_message", run.error_message) << ",\n";
+  }
   if (!run.report_big_o && !run.report_rms) {
-        out << indent
-            << FormatKV("iterations", run.iterations)
-            << ",\n";
-        out << indent
-            << FormatKV("real_time", RoundDouble(run.GetAdjustedRealTime()))
-            << ",\n";
-        out << indent
-            << FormatKV("cpu_time", RoundDouble(run.GetAdjustedCPUTime()));
-        out << ",\n" << indent
-            << FormatKV("time_unit", GetTimeUnitString(run.time_unit));
+    out << indent << FormatKV("iterations", run.iterations) << ",\n";
+    out << indent << FormatKV("real_time", run.GetAdjustedRealTime()) << ",\n";
+    out << indent << FormatKV("cpu_time", run.GetAdjustedCPUTime());
+    out << ",\n"
+        << indent << FormatKV("time_unit", GetTimeUnitString(run.time_unit));
   } else if (run.report_big_o) {
-    out << indent
-        << FormatKV("cpu_coefficient", RoundDouble(run.GetAdjustedCPUTime()))
+    out << indent << FormatKV("cpu_coefficient", run.GetAdjustedCPUTime())
         << ",\n";
-    out << indent
-        << FormatKV("real_coefficient", RoundDouble(run.GetAdjustedRealTime()))
+    out << indent << FormatKV("real_coefficient", run.GetAdjustedRealTime())
         << ",\n";
-    out << indent
-            << FormatKV("big_o", GetBigOString(run.complexity))
-            << ",\n";
-        out << indent
-            << FormatKV("time_unit", GetTimeUnitString(run.time_unit));
-    } else if(run.report_rms) {
-        out << indent
-            << FormatKV("rms", RoundDouble(run.GetAdjustedCPUTime()*100))
-            << '%';
-  }
-  if (run.bytes_per_second > 0.0) {
-    out << ",\n"
-        << indent
-        << FormatKV("bytes_per_second", RoundDouble(run.bytes_per_second));
+    out << indent << FormatKV("big_o", GetBigOString(run.complexity)) << ",\n";
+    out << indent << FormatKV("time_unit", GetTimeUnitString(run.time_unit));
+  } else if (run.report_rms) {
+    out << indent << FormatKV("rms", run.GetAdjustedCPUTime());
   }
-  if (run.items_per_second > 0.0) {
-    out << ",\n"
-        << indent
-        << FormatKV("items_per_second", RoundDouble(run.items_per_second));
+
+  for (auto& c : run.counters) {
+    out << ",\n" << indent << FormatKV(c.first, c.second);
   }
+
+  if (run.has_memory_result) {
+    out << ",\n" << indent << FormatKV("allocs_per_iter", run.allocs_per_iter);
+    out << ",\n" << indent << FormatKV("max_bytes_used", run.max_bytes_used);
+  }
+
   if (!run.report_label.empty()) {
-    out << ",\n"
-        << indent
-        << FormatKV("label", run.report_label);
+    out << ",\n" << indent << FormatKV("label", run.report_label);
   }
   out << '\n';
 }
diff --git a/src/log.cc b/src/log.cc
deleted file mode 100644
index b660309..0000000
--- a/src/log.cc
+++ /dev/null
@@ -1,40 +0,0 @@
-#include "log.h"
-
-#include <iostream>
-
-namespace benchmark {
-namespace internal {
-
-int& LoggingLevelImp() {
-    static int level = 0;
-    return level;
-}
-
-void SetLogLevel(int value) {
-    LoggingLevelImp() = value;
-}
-
-int GetLogLevel() {
-    return LoggingLevelImp();
-}
-
-class NullLogBuffer : public std::streambuf
-{
-public:
-  int overflow(int c) {
-    return c;
-  }
-};
-
-std::ostream& GetNullLogInstance() {
-  static NullLogBuffer log_buff;
-  static std::ostream null_log(&log_buff);
-  return null_log;
-}
-
-std::ostream& GetErrorLogInstance() {
-  return std::clog;
-}
-
-} // end namespace internal
-} // end namespace benchmark
-\ No newline at end of file
diff --git a/src/log.h b/src/log.h
index 3777810..47d0c35 100644
--- a/src/log.h
+++ b/src/log.h
@@ -1,28 +1,74 @@
 #ifndef BENCHMARK_LOG_H_
 #define BENCHMARK_LOG_H_
 
+#include <iostream>
 #include <ostream>
 
+#include "benchmark/benchmark.h"
+
 namespace benchmark {
 namespace internal {
 
-int GetLogLevel();
-void SetLogLevel(int level);
+typedef std::basic_ostream<char>&(EndLType)(std::basic_ostream<char>&);
+
+class LogType {
+  friend LogType& GetNullLogInstance();
+  friend LogType& GetErrorLogInstance();
+
+  // FIXME: Add locking to output.
+  template <class Tp>
+  friend LogType& operator<<(LogType&, Tp const&);
+  friend LogType& operator<<(LogType&, EndLType*);
+
+ private:
+  LogType(std::ostream* out) : out_(out) {}
+  std::ostream* out_;
+  BENCHMARK_DISALLOW_COPY_AND_ASSIGN(LogType);
+};
+
+template <class Tp>
+LogType& operator<<(LogType& log, Tp const& value) {
+  if (log.out_) {
+    *log.out_ << value;
+  }
+  return log;
+}
+
+inline LogType& operator<<(LogType& log, EndLType* m) {
+  if (log.out_) {
+    *log.out_ << m;
+  }
+  return log;
+}
+
+inline int& LogLevel() {
+  static int log_level = 0;
+  return log_level;
+}
 
-std::ostream& GetNullLogInstance();
-std::ostream& GetErrorLogInstance();
+inline LogType& GetNullLogInstance() {
+  static LogType log(nullptr);
+  return log;
+}
+
+inline LogType& GetErrorLogInstance() {
+  static LogType log(&std::clog);
+  return log;
+}
 
-inline std::ostream& GetLogInstanceForLevel(int level) {
-  if (level <= GetLogLevel()) {
+inline LogType& GetLogInstanceForLevel(int level) {
+  if (level <= LogLevel()) {
     return GetErrorLogInstance();
   }
   return GetNullLogInstance();
 }
 
-} // end namespace internal
-} // end namespace benchmark
-
-#define VLOG(x) (::benchmark::internal::GetLogInstanceForLevel(x) \
-                 << "-- LOG(" << x << "): ")
+}  // end namespace internal
+}  // end namespace benchmark
 
-#endif
-\ No newline at end of file
+// clang-format off
+#define VLOG(x)                                                               \
+  (::benchmark::internal::GetLogInstanceForLevel(x) << "-- LOG(" << x << "):" \
+                                                                         " ")
+// clang-format on
+#endif
diff --git a/src/mutex.h b/src/mutex.h
index f37ec35..3fac79a 100644
--- a/src/mutex.h
+++ b/src/mutex.h
@@ -1,28 +1,26 @@
 #ifndef BENCHMARK_MUTEX_H_
 #define BENCHMARK_MUTEX_H_
 
-#include <mutex>
 #include <condition_variable>
+#include <mutex>
+
+#include "check.h"
 
 // Enable thread safety attributes only with clang.
 // The attributes can be safely erased when compiling with other compilers.
 #if defined(HAVE_THREAD_SAFETY_ATTRIBUTES)
-#define THREAD_ANNOTATION_ATTRIBUTE__(x)   __attribute__((x))
+#define THREAD_ANNOTATION_ATTRIBUTE__(x) __attribute__((x))
 #else
-#define THREAD_ANNOTATION_ATTRIBUTE__(x)   // no-op
+#define THREAD_ANNOTATION_ATTRIBUTE__(x)  // no-op
 #endif
 
-#define CAPABILITY(x) \
-  THREAD_ANNOTATION_ATTRIBUTE__(capability(x))
+#define CAPABILITY(x) THREAD_ANNOTATION_ATTRIBUTE__(capability(x))
 
-#define SCOPED_CAPABILITY \
-  THREAD_ANNOTATION_ATTRIBUTE__(scoped_lockable)
+#define SCOPED_CAPABILITY THREAD_ANNOTATION_ATTRIBUTE__(scoped_lockable)
 
-#define GUARDED_BY(x) \
-  THREAD_ANNOTATION_ATTRIBUTE__(guarded_by(x))
+#define GUARDED_BY(x) THREAD_ANNOTATION_ATTRIBUTE__(guarded_by(x))
 
-#define PT_GUARDED_BY(x) \
-  THREAD_ANNOTATION_ATTRIBUTE__(pt_guarded_by(x))
+#define PT_GUARDED_BY(x) THREAD_ANNOTATION_ATTRIBUTE__(pt_guarded_by(x))
 
 #define ACQUIRED_BEFORE(...) \
   THREAD_ANNOTATION_ATTRIBUTE__(acquired_before(__VA_ARGS__))
@@ -54,22 +52,18 @@
 #define TRY_ACQUIRE_SHARED(...) \
   THREAD_ANNOTATION_ATTRIBUTE__(try_acquire_shared_capability(__VA_ARGS__))
 
-#define EXCLUDES(...) \
-  THREAD_ANNOTATION_ATTRIBUTE__(locks_excluded(__VA_ARGS__))
+#define EXCLUDES(...) THREAD_ANNOTATION_ATTRIBUTE__(locks_excluded(__VA_ARGS__))
 
-#define ASSERT_CAPABILITY(x) \
-  THREAD_ANNOTATION_ATTRIBUTE__(assert_capability(x))
+#define ASSERT_CAPABILITY(x) THREAD_ANNOTATION_ATTRIBUTE__(assert_capability(x))
 
 #define ASSERT_SHARED_CAPABILITY(x) \
   THREAD_ANNOTATION_ATTRIBUTE__(assert_shared_capability(x))
 
-#define RETURN_CAPABILITY(x) \
-  THREAD_ANNOTATION_ATTRIBUTE__(lock_returned(x))
+#define RETURN_CAPABILITY(x) THREAD_ANNOTATION_ATTRIBUTE__(lock_returned(x))
 
 #define NO_THREAD_SAFETY_ANALYSIS \
   THREAD_ANNOTATION_ATTRIBUTE__(no_thread_safety_analysis)
 
-
 namespace benchmark {
 
 typedef std::condition_variable Condition;
@@ -77,66 +71,85 @@ typedef std::condition_variable Condition;
 // NOTE: Wrappers for std::mutex and std::unique_lock are provided so that
 // we can annotate them with thread safety attributes and use the
 // -Wthread-safety warning with clang. The standard library types cannot be
-// used directly because they do not provided the required annotations.
-class CAPABILITY("mutex") Mutex
-{
-public:
+// used directly because they do not provide the required annotations.
+class CAPABILITY("mutex") Mutex {
+ public:
   Mutex() {}
 
   void lock() ACQUIRE() { mut_.lock(); }
   void unlock() RELEASE() { mut_.unlock(); }
-  std::mutex& native_handle() {
-    return mut_;
-  }
-private:
+  std::mutex& native_handle() { return mut_; }
+
+ private:
   std::mutex mut_;
 };
 
-
-class SCOPED_CAPABILITY MutexLock
-{
+class SCOPED_CAPABILITY MutexLock {
   typedef std::unique_lock<std::mutex> MutexLockImp;
-public:
-  MutexLock(Mutex& m) ACQUIRE(m) : ml_(m.native_handle())
-  { }
+
+ public:
+  MutexLock(Mutex& m) ACQUIRE(m) : ml_(m.native_handle()) {}
   ~MutexLock() RELEASE() {}
   MutexLockImp& native_handle() { return ml_; }
-private:
+
+ private:
   MutexLockImp ml_;
 };
 
+class Barrier {
+ public:
+  Barrier(int num_threads) : running_threads_(num_threads) {}
 
-class Notification
-{
-public:
-  Notification() : notified_yet_(false) { }
-
-  void WaitForNotification() const EXCLUDES(mutex_) {
-    MutexLock m_lock(mutex_);
-    auto notified_fn = [this]() REQUIRES(mutex_) {
-                            return this->HasBeenNotified();
-                        };
-    cv_.wait(m_lock.native_handle(), notified_fn);
-  }
-
-  void Notify() EXCLUDES(mutex_) {
+  // Called by each thread
+  bool wait() EXCLUDES(lock_) {
+    bool last_thread = false;
     {
-      MutexLock lock(mutex_);
-      notified_yet_ = 1;
+      MutexLock ml(lock_);
+      last_thread = createBarrier(ml);
     }
-    cv_.notify_all();
+    if (last_thread) phase_condition_.notify_all();
+    return last_thread;
   }
 
-private:
-  bool HasBeenNotified() const REQUIRES(mutex_) {
-    return notified_yet_;
+  void removeThread() EXCLUDES(lock_) {
+    MutexLock ml(lock_);
+    --running_threads_;
+    if (entered_ != 0) phase_condition_.notify_all();
   }
 
-  mutable Mutex mutex_;
-  mutable std::condition_variable cv_;
-  bool notified_yet_ GUARDED_BY(mutex_);
+ private:
+  Mutex lock_;
+  Condition phase_condition_;
+  int running_threads_;
+
+  // State for barrier management
+  int phase_number_ = 0;
+  int entered_ = 0;  // Number of threads that have entered this barrier
+
+  // Enter the barrier and wait until all other threads have also
+  // entered the barrier.  Returns true iff this is the last thread to
+  // enter the barrier.
+  bool createBarrier(MutexLock& ml) REQUIRES(lock_) {
+    CHECK_LT(entered_, running_threads_);
+    entered_++;
+    if (entered_ < running_threads_) {
+      // Wait for all threads to enter
+      int phase_number_cp = phase_number_;
+      auto cb = [this, phase_number_cp]() {
+        return this->phase_number_ > phase_number_cp ||
+               entered_ == running_threads_;  // A thread has aborted in error
+      };
+      phase_condition_.wait(ml.native_handle(), cb);
+      if (phase_number_ > phase_number_cp) return false;
+      // else (running_threads_ == entered_) and we are the last thread.
+    }
+    // Last thread has reached the barrier
+    phase_number_++;
+    entered_ = 0;
+    return true;
+  }
 };
 
-} // end namespace benchmark
+}  // end namespace benchmark
 
-#endif // BENCHMARK_MUTEX_H_
+#endif  // BENCHMARK_MUTEX_H_
diff --git a/src/re.h b/src/re.h
index af57a39..fbe2503 100644
--- a/src/re.h
+++ b/src/re.h
@@ -15,24 +15,53 @@
 #ifndef BENCHMARK_RE_H_
 #define BENCHMARK_RE_H_
 
+#include "internal_macros.h"
+
+// clang-format off
+
+#if !defined(HAVE_STD_REGEX) && \
+    !defined(HAVE_GNU_POSIX_REGEX) && \
+    !defined(HAVE_POSIX_REGEX)
+  // No explicit regex selection; detect based on builtin hints.
+  #if defined(BENCHMARK_OS_LINUX) || defined(BENCHMARK_OS_APPLE)
+    #define HAVE_POSIX_REGEX 1
+  #elif __cplusplus >= 199711L
+    #define HAVE_STD_REGEX 1
+  #endif
+#endif
+
+// Prefer C regex libraries when compiling w/o exceptions: std::regex signals
+// bad patterns by throwing, so only the C APIs can report errors correctly.
+#if defined(BENCHMARK_HAS_NO_EXCEPTIONS) && \
+    defined(HAVE_STD_REGEX) && \
+    (defined(HAVE_GNU_POSIX_REGEX) || defined(HAVE_POSIX_REGEX))
+  #undef HAVE_STD_REGEX
+#endif
+
 #if defined(HAVE_STD_REGEX)
-#include <regex>
+  #include <regex>
 #elif defined(HAVE_GNU_POSIX_REGEX)
-#include <gnuregex.h>
+  #include <gnuregex.h>
 #elif defined(HAVE_POSIX_REGEX)
-#include <regex.h>
+  #include <regex.h>
 #else
 #error No regular expression backend was found!
 #endif
+
+// clang-format on
+
 #include <string>
 
+#include "check.h"
+
 namespace benchmark {
 
 // A wrapper around the POSIX regular expression API that provides automatic
 // cleanup
 class Regex {
  public:
-  Regex();
+  Regex() : init_(false) {}
+
   ~Regex();
 
   // Compile a regular expression matcher from spec.  Returns true on success.
@@ -43,18 +72,87 @@ class Regex {
 
   // Returns whether str matches the compiled regular expression.
   bool Match(const std::string& str);
+
  private:
   bool init_;
-  // Underlying regular expression object
+// Underlying regular expression object
 #if defined(HAVE_STD_REGEX)
   std::regex re_;
 #elif defined(HAVE_POSIX_REGEX) || defined(HAVE_GNU_POSIX_REGEX)
   regex_t re_;
 #else
-# error No regular expression backend implementation available
+#error No regular expression backend implementation available
 #endif
 };
 
+#if defined(HAVE_STD_REGEX)
+
+inline bool Regex::Init(const std::string& spec, std::string* error) {
+#ifdef BENCHMARK_HAS_NO_EXCEPTIONS
+  ((void)error);  // suppress unused warning
+#else
+  try {
+#endif
+  re_ = std::regex(spec, std::regex_constants::extended);
+  init_ = true;
+#ifndef BENCHMARK_HAS_NO_EXCEPTIONS
+}
+catch (const std::regex_error& e) {
+  if (error) {
+    *error = e.what();
+  }
+}
+#endif
+return init_;
+}
+
+inline Regex::~Regex() {}
+
+inline bool Regex::Match(const std::string& str) {
+  if (!init_) {
+    return false;
+  }
+  return std::regex_search(str, re_);
+}
+
+#else
+inline bool Regex::Init(const std::string& spec, std::string* error) {
+  int ec = regcomp(&re_, spec.c_str(), REG_EXTENDED | REG_NOSUB);
+  if (ec != 0) {
+    if (error) {
+      size_t needed = regerror(ec, &re_, nullptr, 0);
+      char* errbuf = new char[needed];
+      regerror(ec, &re_, errbuf, needed);
+
+      // regerror returns the number of bytes needed to hold the message
+      // including its null terminator, so we drop that byte when assigning.
+      CHECK_NE(needed, 0);
+      error->assign(errbuf, needed - 1);
+
+      delete[] errbuf;
+    }
+
+    return false;
+  }
+
+  init_ = true;
+  return true;
+}
+
+inline Regex::~Regex() {
+  if (init_) {
+    regfree(&re_);
+  }
+}
+
+inline bool Regex::Match(const std::string& str) {
+  if (!init_) {
+    return false;
+  }
+  return regexec(&re_, str.c_str(), 0, nullptr, 0) == 0;
+}
+#endif
+
 }  // end namespace benchmark
 
 #endif  // BENCHMARK_RE_H_
diff --git a/src/re_posix.cc b/src/re_posix.cc
deleted file mode 100644
index 95b086f..0000000
--- a/src/re_posix.cc
+++ /dev/null
@@ -1,59 +0,0 @@
-// Copyright 2015 Google Inc. All rights reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "check.h"
-#include "re.h"
-
-namespace benchmark {
-
-Regex::Regex() : init_(false) { }
-
-bool Regex::Init(const std::string& spec, std::string* error) {
-  int ec = regcomp(&re_, spec.c_str(), REG_EXTENDED | REG_NOSUB);
-  if (ec != 0) {
-    if (error) {
-      size_t needed = regerror(ec, &re_, nullptr, 0);
-      char* errbuf = new char[needed];
-      regerror(ec, &re_, errbuf, needed);
-
-      // regerror returns the number of bytes necessary to null terminate
-      // the string, so we move that when assigning to error.
-      CHECK_NE(needed, 0);
-      error->assign(errbuf, needed - 1);
-
-      delete[] errbuf;
-    }
-
-    return false;
-  }
-
-  init_ = true;
-  return true;
-}
-
-Regex::~Regex() {
-  if (init_) {
-    regfree(&re_);
-  }
-}
-
-bool Regex::Match(const std::string& str) {
-  if (!init_) {
-    return false;
-  }
-
-  return regexec(&re_, str.c_str(), 0, nullptr, 0) == 0;
-}
-
-}  // end namespace benchmark
diff --git a/src/reporter.cc b/src/reporter.cc
index 5187859..337575a 100644
--- a/src/reporter.cc
+++ b/src/reporter.cc
@@ -12,64 +12,94 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "benchmark/reporter.h"
-#include "walltime.h"
+#include "benchmark/benchmark.h"
+#include "timers.h"
 
 #include <cstdlib>
 
 #include <iostream>
-#include <vector>
 #include <tuple>
+#include <vector>
 
 #include "check.h"
-#include "stat.h"
+#include "string_util.h"
 
 namespace benchmark {
 
 BenchmarkReporter::BenchmarkReporter()
-    : output_stream_(&std::cout), error_stream_(&std::cerr)
-{
-}
+    : output_stream_(&std::cout), error_stream_(&std::cerr) {}
 
-BenchmarkReporter::~BenchmarkReporter() {
-}
+BenchmarkReporter::~BenchmarkReporter() {}
 
-void BenchmarkReporter::PrintBasicContext(std::ostream *out_ptr,
+void BenchmarkReporter::PrintBasicContext(std::ostream *out,
                                           Context const &context) {
-  CHECK(out_ptr) << "cannot be null";
-  auto& Out = *out_ptr;
-
-  Out << "Run on (" << context.num_cpus << " X " << context.mhz_per_cpu
-            << " MHz CPU " << ((context.num_cpus > 1) ? "s" : "") << ")\n";
+  CHECK(out) << "cannot be null";
+  auto &Out = *out;
 
   Out << LocalDateTimeString() << "\n";
 
-  if (context.cpu_scaling_enabled) {
+  if (context.executable_name)
+    Out << "Running " << context.executable_name << "\n";
+
+  const CPUInfo &info = context.cpu_info;
+  Out << "Run on (" << info.num_cpus << " X "
+      << (info.cycles_per_second / 1000000.0) << " MHz CPU "
+      << ((info.num_cpus > 1) ? "s" : "") << ")\n";
+  if (info.caches.size() != 0) {
+    Out << "CPU Caches:\n";
+    for (auto &CInfo : info.caches) {
+      Out << "  L" << CInfo.level << " " << CInfo.type << " "
+          << (CInfo.size / 1024) << " KiB";
+      if (CInfo.num_sharing != 0)
+        Out << " (x" << (info.num_cpus / CInfo.num_sharing) << ")";
+      Out << "\n";
+    }
+  }
+  if (!info.load_avg.empty()) {
+    Out << "Load Average: ";
+    for (auto It = info.load_avg.begin(); It != info.load_avg.end();) {
+      Out << StrFormat("%.2f", *It++);
+      if (It != info.load_avg.end()) Out << ", ";
+    }
+    Out << "\n";
+  }
+
+  if (CPUInfo::Scaling::ENABLED == info.scaling) {
     Out << "***WARNING*** CPU scaling is enabled, the benchmark "
-                 "real time measurements may be noisy and will incur extra "
-                 "overhead.\n";
+           "real time measurements may be noisy and will incur extra "
+           "overhead.\n";
   }
 
 #ifndef NDEBUG
   Out << "***WARNING*** Library was built as DEBUG. Timings may be "
-               "affected.\n";
+         "affected.\n";
 #endif
 }
 
+// No initializer because it's already initialized to NULL.
+const char *BenchmarkReporter::Context::executable_name;
+
+BenchmarkReporter::Context::Context()
+    : cpu_info(CPUInfo::Get()), sys_info(SystemInfo::Get()) {}
+
+std::string BenchmarkReporter::Run::benchmark_name() const {
+  std::string name = run_name.str();
+  if (run_type == RT_Aggregate) {
+    name += "_" + aggregate_name;
+  }
+  return name;
+}
+
 double BenchmarkReporter::Run::GetAdjustedRealTime() const {
   double new_time = real_accumulated_time * GetTimeUnitMultiplier(time_unit);
-  if (iterations != 0)
-    new_time /= static_cast<double>(iterations);
+  if (iterations != 0) new_time /= static_cast<double>(iterations);
   return new_time;
 }
 
 double BenchmarkReporter::Run::GetAdjustedCPUTime() const {
   double new_time = cpu_accumulated_time * GetTimeUnitMultiplier(time_unit);
-  if (iterations != 0)
-    new_time /= static_cast<double>(iterations);
+  if (iterations != 0) new_time /= static_cast<double>(iterations);
   return new_time;
 }
 
-
-
-} // end namespace benchmark
+}  // end namespace benchmark
diff --git a/src/sleep.cc b/src/sleep.cc
index 918abc4..4609d54 100644
--- a/src/sleep.cc
+++ b/src/sleep.cc
@@ -15,12 +15,17 @@
 #include "sleep.h"
 
 #include <cerrno>
+#include <cstdlib>
 #include <ctime>
 
 #include "internal_macros.h"
 
 #ifdef BENCHMARK_OS_WINDOWS
-#include <Windows.h>
+#include <windows.h>
+#endif
+
+#ifdef BENCHMARK_OS_ZOS
+#include <unistd.h>
 #endif
 
 namespace benchmark {
@@ -32,15 +37,27 @@ void SleepForSeconds(double seconds) {
 }
 #else   // BENCHMARK_OS_WINDOWS
 void SleepForMicroseconds(int microseconds) {
+#ifdef BENCHMARK_OS_ZOS
+  // z/OS does not support nanosleep. Instead call sleep() and then usleep() to
+  // sleep for the remaining microseconds because usleep() will fail if its
+  // argument is greater than 1000000.
+  div_t sleepTime = div(microseconds, kNumMicrosPerSecond);
+  int seconds = sleepTime.quot;
+  while (seconds != 0)
+    seconds = sleep(seconds);
+  while (usleep(sleepTime.rem) == -1 && errno == EINTR)
+    ;
+#else
   struct timespec sleep_time;
   sleep_time.tv_sec = microseconds / kNumMicrosPerSecond;
   sleep_time.tv_nsec = (microseconds % kNumMicrosPerSecond) * kNumNanosPerMicro;
   while (nanosleep(&sleep_time, &sleep_time) != 0 && errno == EINTR)
     ;  // Ignore signals and wait for the full interval to elapse.
+#endif
 }
 
 void SleepForMilliseconds(int milliseconds) {
-  SleepForMicroseconds(static_cast<int>(milliseconds) * kNumMicrosPerMilli);
+  SleepForMicroseconds(milliseconds * kNumMicrosPerMilli);
 }
 
 void SleepForSeconds(double seconds) {
diff --git a/src/sleep.h b/src/sleep.h
index f1e515c..f98551a 100644
--- a/src/sleep.h
+++ b/src/sleep.h
@@ -1,14 +1,12 @@
 #ifndef BENCHMARK_SLEEP_H_
 #define BENCHMARK_SLEEP_H_
 
-#include <cstdint>
-
 namespace benchmark {
-const int64_t kNumMillisPerSecond = 1000LL;
-const int64_t kNumMicrosPerMilli = 1000LL;
-const int64_t kNumMicrosPerSecond = kNumMillisPerSecond * 1000LL;
-const int64_t kNumNanosPerMicro = 1000LL;
-const int64_t kNumNanosPerSecond = kNumNanosPerMicro * kNumMicrosPerSecond;
+const int kNumMillisPerSecond = 1000;
+const int kNumMicrosPerMilli = 1000;
+const int kNumMicrosPerSecond = kNumMillisPerSecond * 1000;
+const int kNumNanosPerMicro = 1000;
+const int kNumNanosPerSecond = kNumNanosPerMicro * kNumMicrosPerSecond;
 
 void SleepForMilliseconds(int milliseconds);
 void SleepForSeconds(double seconds);
diff --git a/src/stat.h b/src/stat.h
deleted file mode 100644
index c4ecfe8..0000000
--- a/src/stat.h
+++ /dev/null
@@ -1,307 +0,0 @@
-#ifndef BENCHMARK_STAT_H_
-#define BENCHMARK_STAT_H_
-
-#include <cmath>
-#include <limits>
-#include <ostream>
-#include <type_traits>
-
-
-namespace benchmark {
-
-template <typename VType, typename NumType>
-class Stat1;
-
-template <typename VType, typename NumType>
-class Stat1MinMax;
-
-typedef Stat1<float, int64_t> Stat1_f;
-typedef Stat1<double, int64_t> Stat1_d;
-typedef Stat1MinMax<float, int64_t> Stat1MinMax_f;
-typedef Stat1MinMax<double, int64_t> Stat1MinMax_d;
-
-template <typename VType>
-class Vector2;
-template <typename VType>
-class Vector3;
-template <typename VType>
-class Vector4;
-
-template <typename VType, typename NumType>
-class Stat1 {
- public:
-  typedef Stat1<VType, NumType> Self;
-
-  Stat1() { Clear(); }
-  // Create a sample of value dat and weight 1
-  explicit Stat1(const VType &dat) {
-    sum_ = dat;
-    sum_squares_ = Sqr(dat);
-    numsamples_ = 1;
-  }
-  // Create statistics for all the samples between begin (included)
-  // and end(excluded)
-  explicit Stat1(const VType *begin, const VType *end) {
-    Clear();
-    for (const VType *item = begin; item < end; ++item) {
-      (*this) += Stat1(*item);
-    }
-  }
-  // Create a sample of value dat and weight w
-  Stat1(const VType &dat, const NumType &w) {
-    sum_ = w * dat;
-    sum_squares_ = w * Sqr(dat);
-    numsamples_ = w;
-  }
-  // Copy operator
-  Stat1(const Self &stat) {
-    sum_ = stat.sum_;
-    sum_squares_ = stat.sum_squares_;
-    numsamples_ = stat.numsamples_;
-  }
-
-  void Clear() {
-    numsamples_ = NumType();
-    sum_squares_ = sum_ = VType();
-  }
-
-  Self &operator=(const Self &stat) {
-    sum_ = stat.sum_;
-    sum_squares_ = stat.sum_squares_;
-    numsamples_ = stat.numsamples_;
-    return (*this);
-  }
-  // Merge statistics from two sample sets.
-  Self &operator+=(const Self &stat) {
-    sum_ += stat.sum_;
-    sum_squares_ += stat.sum_squares_;
-    numsamples_ += stat.numsamples_;
-    return (*this);
-  }
-  // The operation opposite to +=
-  Self &operator-=(const Self &stat) {
-    sum_ -= stat.sum_;
-    sum_squares_ -= stat.sum_squares_;
-    numsamples_ -= stat.numsamples_;
-    return (*this);
-  }
-  // Multiply the weight of the set of samples by a factor k
-  Self &operator*=(const VType &k) {
-    sum_ *= k;
-    sum_squares_ *= k;
-    numsamples_ *= k;
-    return (*this);
-  }
-
-  // Merge statistics from two sample sets.
-  Self operator+(const Self &stat) const { return Self(*this) += stat; }
-
-  // The operation opposite to +
-  Self operator-(const Self &stat) const { return Self(*this) -= stat; }
-
-  // Multiply the weight of the set of samples by a factor k
-  Self operator*(const VType &k) const { return Self(*this) *= k; }
-
-  // Return the total weight of this sample set
-  NumType numSamples() const { return numsamples_; }
-
-  // Return the sum of this sample set
-  VType Sum() const { return sum_; }
-
-  // Return the mean of this sample set
-  VType Mean() const {
-    if (numsamples_ == 0) return VType();
-    return sum_ * (1.0 / numsamples_);
-  }
-
-  // Return the mean of this sample set and compute the standard deviation at
-  // the same time.
-  VType Mean(VType *stddev) const {
-    if (numsamples_ == 0) return VType();
-    VType mean = sum_ * (1.0 / numsamples_);
-    if (stddev) {
-      VType avg_squares = sum_squares_ * (1.0 / numsamples_);
-      *stddev = Sqrt(avg_squares - Sqr(mean));
-    }
-    return mean;
-  }
-
-  // Return the standard deviation of the sample set
-  VType StdDev() const {
-    if (numsamples_ == 0) return VType();
-    VType mean = Mean();
-    VType avg_squares = sum_squares_ * (1.0 / numsamples_);
-    return Sqrt(avg_squares - Sqr(mean));
-  }
-
- private:
-  static_assert(std::is_integral<NumType>::value &&
-                !std::is_same<NumType, bool>::value,
-                "NumType must be an integral type that is not bool.");
-  // Let i be the index of the samples provided (using +=)
-  // and weight[i],value[i] be the data of sample #i
-  // then the variables have the following meaning:
-  NumType numsamples_;  // sum of weight[i];
-  VType sum_;           // sum of weight[i]*value[i];
-  VType sum_squares_;   // sum of weight[i]*value[i]^2;
-
-  // Template function used to square a number.
-  // For a vector we square all components
-  template <typename SType>
-  static inline SType Sqr(const SType &dat) {
-    return dat * dat;
-  }
-
-  template <typename SType>
-  static inline Vector2<SType> Sqr(const Vector2<SType> &dat) {
-    return dat.MulComponents(dat);
-  }
-
-  template <typename SType>
-  static inline Vector3<SType> Sqr(const Vector3<SType> &dat) {
-    return dat.MulComponents(dat);
-  }
-
-  template <typename SType>
-  static inline Vector4<SType> Sqr(const Vector4<SType> &dat) {
-    return dat.MulComponents(dat);
-  }
-
-  // Template function used to take the square root of a number.
-  // For a vector we square all components
-  template <typename SType>
-  static inline SType Sqrt(const SType &dat) {
-    // Avoid NaN due to imprecision in the calculations
-    if (dat < 0) return 0;
-    return sqrt(dat);
-  }
-
-  template <typename SType>
-  static inline Vector2<SType> Sqrt(const Vector2<SType> &dat) {
-    // Avoid NaN due to imprecision in the calculations
-    return Max(dat, Vector2<SType>()).Sqrt();
-  }
-
-  template <typename SType>
-  static inline Vector3<SType> Sqrt(const Vector3<SType> &dat) {
-    // Avoid NaN due to imprecision in the calculations
-    return Max(dat, Vector3<SType>()).Sqrt();
-  }
-
-  template <typename SType>
-  static inline Vector4<SType> Sqrt(const Vector4<SType> &dat) {
-    // Avoid NaN due to imprecision in the calculations
-    return Max(dat, Vector4<SType>()).Sqrt();
-  }
-};
-
-// Useful printing function
-template <typename VType, typename NumType>
-std::ostream &operator<<(std::ostream &out, const Stat1<VType, NumType> &s) {
-  out << "{ avg = " << s.Mean() << " std = " << s.StdDev()
-      << " nsamples = " << s.NumSamples() << "}";
-  return out;
-}
-
-// Stat1MinMax: same as Stat1, but it also
-// keeps the Min and Max values; the "-"
-// operator is disabled because it cannot be implemented
-// efficiently
-template <typename VType, typename NumType>
-class Stat1MinMax : public Stat1<VType, NumType> {
- public:
-  typedef Stat1MinMax<VType, NumType> Self;
-
-  Stat1MinMax() { Clear(); }
-  // Create a sample of value dat and weight 1
-  explicit Stat1MinMax(const VType &dat) : Stat1<VType, NumType>(dat) {
-    max_ = dat;
-    min_ = dat;
-  }
-  // Create statistics for all the samples between begin (included)
-  // and end(excluded)
-  explicit Stat1MinMax(const VType *begin, const VType *end) {
-    Clear();
-    for (const VType *item = begin; item < end; ++item) {
-      (*this) += Stat1MinMax(*item);
-    }
-  }
-  // Create a sample of value dat and weight w
-  Stat1MinMax(const VType &dat, const NumType &w)
-      : Stat1<VType, NumType>(dat, w) {
-    max_ = dat;
-    min_ = dat;
-  }
-  // Copy operator
-  Stat1MinMax(const Self &stat) : Stat1<VType, NumType>(stat) {
-    max_ = stat.max_;
-    min_ = stat.min_;
-  }
-
-  void Clear() {
-    Stat1<VType, NumType>::Clear();
-    if (std::numeric_limits<VType>::has_infinity) {
-      min_ = std::numeric_limits<VType>::infinity();
-      max_ = -std::numeric_limits<VType>::infinity();
-    } else {
-      min_ = std::numeric_limits<VType>::max();
-      max_ = std::numeric_limits<VType>::min();
-    }
-  }
-
-  Self &operator=(const Self &stat) {
-    this->Stat1<VType, NumType>::operator=(stat);
-    max_ = stat.max_;
-    min_ = stat.min_;
-    return (*this);
-  }
-  // Merge statistics from two sample sets.
-  Self &operator+=(const Self &stat) {
-    this->Stat1<VType, NumType>::operator+=(stat);
-    if (stat.max_ > max_) max_ = stat.max_;
-    if (stat.min_ < min_) min_ = stat.min_;
-    return (*this);
-  }
-  // Multiply the weight of the set of samples by a factor k
-  Self &operator*=(const VType &stat) {
-    this->Stat1<VType, NumType>::operator*=(stat);
-    return (*this);
-  }
-  // Merge statistics from two sample sets.
-  Self operator+(const Self &stat) const { return Self(*this) += stat; }
-  // Multiply the weight of the set of samples by a factor k
-  Self operator*(const VType &k) const { return Self(*this) *= k; }
-
-  // Return the maximal value in this sample set
-  VType Max() const { return max_; }
-  // Return the minimal value in this sample set
-  VType Min() const { return min_; }
-
- private:
-  // The - operation makes no sense with Min/Max
-  // unless we keep the full list of values (but we don't)
-  // make it private, and let it undefined so nobody can call it
-  Self &operator-=(const Self &stat);  // senseless. let it undefined.
-
-  // The operation opposite to -
-  Self operator-(const Self &stat) const;  // senseless. let it undefined.
-
-  // Let i be the index of the samples provided (using +=)
-  // and weight[i],value[i] be the data of sample #i
-  // then the variables have the following meaning:
-  VType max_;  // max of value[i]
-  VType min_;  // min of value[i]
-};
-
-// Useful printing function
-template <typename VType, typename NumType>
-std::ostream &operator<<(std::ostream &out,
-                         const Stat1MinMax<VType, NumType> &s) {
-  out << "{ avg = " << s.Mean() << " std = " << s.StdDev()
-      << " nsamples = " << s.NumSamples() << " min = " << s.Min()
-      << " max = " << s.Max() << "}";
-  return out;
-}
-}  // end namespace benchmark
-
-#endif  // BENCHMARK_STAT_H_
diff --git a/src/statistics.cc b/src/statistics.cc
new file mode 100644
index 0000000..bd5a3d6
--- /dev/null
+++ b/src/statistics.cc
@@ -0,0 +1,193 @@
+// Copyright 2016 Ismael Jimenez Martinez. All rights reserved.
+// Copyright 2017 Roman Lebedev. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "benchmark/benchmark.h"
+
+#include <algorithm>
+#include <cmath>
+#include <numeric>
+#include <string>
+#include <vector>
+#include "check.h"
+#include "statistics.h"
+
+namespace benchmark {
+
+auto StatisticsSum = [](const std::vector<double>& v) {
+  return std::accumulate(v.begin(), v.end(), 0.0);
+};
+
+double StatisticsMean(const std::vector<double>& v) {
+  if (v.empty()) return 0.0;
+  return StatisticsSum(v) * (1.0 / v.size());
+}
+
+double StatisticsMedian(const std::vector<double>& v) {
+  if (v.size() < 3) return StatisticsMean(v);
+  std::vector<double> copy(v);
+
+  auto center = copy.begin() + v.size() / 2;
+  std::nth_element(copy.begin(), center, copy.end());
+
+  // Did we have an odd number of samples?
+  // If yes, then center is the median.
+  // If no, then we are looking for the average between center and the value
+  // before it.
+  if (v.size() % 2 == 1) return *center;
+  auto center2 = copy.begin() + v.size() / 2 - 1;
+  std::nth_element(copy.begin(), center2, copy.end());
+  return (*center + *center2) / 2.0;
+}
+
+// Return the sum of the squares of this sample set
+auto SumSquares = [](const std::vector<double>& v) {
+  return std::inner_product(v.begin(), v.end(), v.begin(), 0.0);
+};
+
+auto Sqr = [](const double dat) { return dat * dat; };
+auto Sqrt = [](const double dat) {
+  // Avoid NaN due to imprecision in the calculations
+  if (dat < 0.0) return 0.0;
+  return std::sqrt(dat);
+};
+
+double StatisticsStdDev(const std::vector<double>& v) {
+  const auto mean = StatisticsMean(v);
+  if (v.empty()) return mean;
+
+  // Sample standard deviation is undefined for n = 1
+  if (v.size() == 1) return 0.0;
+
+  const double avg_squares = SumSquares(v) * (1.0 / v.size());
+  return Sqrt(v.size() / (v.size() - 1.0) * (avg_squares - Sqr(mean)));
+}
+
+std::vector<BenchmarkReporter::Run> ComputeStats(
+    const std::vector<BenchmarkReporter::Run>& reports) {
+  typedef BenchmarkReporter::Run Run;
+  std::vector<Run> results;
+
+  auto error_count =
+      std::count_if(reports.begin(), reports.end(),
+                    [](Run const& run) { return run.error_occurred; });
+
+  if (reports.size() - error_count < 2) {
+    // We don't report aggregated data if there was a single run.
+    return results;
+  }
+
+  // Accumulators.
+  std::vector<double> real_accumulated_time_stat;
+  std::vector<double> cpu_accumulated_time_stat;
+
+  real_accumulated_time_stat.reserve(reports.size());
+  cpu_accumulated_time_stat.reserve(reports.size());
+
+  // All repetitions should be run with the same number of iterations so we
+  // can take this information from the first benchmark.
+  const IterationCount run_iterations = reports.front().iterations;
+  // create stats for user counters
+  struct CounterStat {
+    Counter c;
+    std::vector<double> s;
+  };
+  std::map<std::string, CounterStat> counter_stats;
+  for (Run const& r : reports) {
+    for (auto const& cnt : r.counters) {
+      auto it = counter_stats.find(cnt.first);
+      if (it == counter_stats.end()) {
+        counter_stats.insert({cnt.first, {cnt.second, std::vector<double>{}}});
+        it = counter_stats.find(cnt.first);
+        it->second.s.reserve(reports.size());
+      } else {
+        CHECK_EQ(counter_stats[cnt.first].c.flags, cnt.second.flags);
+      }
+    }
+  }
+
+  // Populate the accumulators.
+  for (Run const& run : reports) {
+    CHECK_EQ(reports[0].benchmark_name(), run.benchmark_name());
+    CHECK_EQ(run_iterations, run.iterations);
+    if (run.error_occurred) continue;
+    real_accumulated_time_stat.emplace_back(run.real_accumulated_time);
+    cpu_accumulated_time_stat.emplace_back(run.cpu_accumulated_time);
+    // user counters
+    for (auto const& cnt : run.counters) {
+      auto it = counter_stats.find(cnt.first);
+      CHECK_NE(it, counter_stats.end());
+      it->second.s.emplace_back(cnt.second);
+    }
+  }
+
+  // Only add label if it is same for all runs
+  std::string report_label = reports[0].report_label;
+  for (std::size_t i = 1; i < reports.size(); i++) {
+    if (reports[i].report_label != report_label) {
+      report_label = "";
+      break;
+    }
+  }
+
+  const double iteration_rescale_factor =
+      double(reports.size()) / double(run_iterations);
+
+  for (const auto& Stat : *reports[0].statistics) {
+    // Get the data from the accumulator to BenchmarkReporter::Run's.
+    Run data;
+    data.run_name = reports[0].run_name;
+    data.run_type = BenchmarkReporter::Run::RT_Aggregate;
+    data.threads = reports[0].threads;
+    data.repetitions = reports[0].repetitions;
+    data.repetition_index = Run::no_repetition_index;
+    data.aggregate_name = Stat.name_;
+    data.report_label = report_label;
+
+    // It is incorrect to say that an aggregate is computed over
+    // a run's iterations, because those iterations already got averaged.
+    // Similarly, if there are N repetitions with 1 iteration each,
+    // an aggregate will be computed over N measurements, not 1.
+    // Thus it is best to simply use the count of separate reports.
+    data.iterations = reports.size();
+
+    data.real_accumulated_time = Stat.compute_(real_accumulated_time_stat);
+    data.cpu_accumulated_time = Stat.compute_(cpu_accumulated_time_stat);
+
+    // We will divide these times by data.iterations when reporting, but
+    // data.iterations is not necessarily the scale of these measurements,
+    // because in each repetition these timers are summed over all iterations.
+    // So if we want to say that the stats are over N repetitions and not
+    // M iterations, we need to multiply these by (N/M).
+    data.real_accumulated_time *= iteration_rescale_factor;
+    data.cpu_accumulated_time *= iteration_rescale_factor;
+
+    data.time_unit = reports[0].time_unit;
+
+    // user counters
+    for (auto const& kv : counter_stats) {
+      // Do *NOT* rescale the custom counters. They are already properly scaled.
+      const auto uc_stat = Stat.compute_(kv.second.s);
+      auto c = Counter(uc_stat, counter_stats[kv.first].c.flags,
+                       counter_stats[kv.first].c.oneK);
+      data.counters[kv.first] = c;
+    }
+
+    results.push_back(data);
+  }
+
+  return results;
+}
+
+}  // end namespace benchmark
diff --git a/src/statistics.h b/src/statistics.h
new file mode 100644
index 0000000..7eccc85
--- /dev/null
+++ b/src/statistics.h
@@ -0,0 +1,37 @@
+// Copyright 2016 Ismael Jimenez Martinez. All rights reserved.
+// Copyright 2017 Roman Lebedev. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef STATISTICS_H_
+#define STATISTICS_H_
+
+#include <vector>
+
+#include "benchmark/benchmark.h"
+
+namespace benchmark {
+
+// Return a vector containing the mean, median and standard deviation info (and
+// any user-specified info) for the specified list of reports. If 'reports'
+// contains fewer than two non-errored runs, an empty vector is returned.
+std::vector<BenchmarkReporter::Run> ComputeStats(
+    const std::vector<BenchmarkReporter::Run>& reports);
+
+double StatisticsMean(const std::vector<double>& v);
+double StatisticsMedian(const std::vector<double>& v);
+double StatisticsStdDev(const std::vector<double>& v);
+
+}  // end namespace benchmark
+
+#endif  // STATISTICS_H_
diff --git a/src/string_util.cc b/src/string_util.cc
index 30d1305..ac60b55 100644
--- a/src/string_util.cc
+++ b/src/string_util.cc
@@ -1,11 +1,14 @@
 #include "string_util.h"
 
+#include <array>
+#ifdef BENCHMARK_STL_ANDROID_GNUSTL
+#include <cerrno>
+#endif
 #include <cmath>
 #include <cstdarg>
-#include <array>
+#include <cstdio>
 #include <memory>
 #include <sstream>
-#include <stdio.h>
 
 #include "arraysize.h"
 
@@ -27,8 +30,6 @@ static_assert(arraysize(kSmallSIUnits) == arraysize(kBigSIUnits),
 
 static const int64_t kUnitsSize = arraysize(kBigSIUnits);
 
-} // end anonymous namespace
-
 void ToExponentAndMantissa(double val, double thresh, int precision,
                            double one_k, std::string* mantissa,
                            int64_t* exponent) {
@@ -45,6 +46,8 @@ void ToExponentAndMantissa(double val, double thresh, int precision,
       std::max(thresh, 1.0 / std::pow(10.0, precision));
   const double big_threshold = adjusted_threshold * one_k;
   const double small_threshold = adjusted_threshold;
+  // Values in ]simple_threshold,small_threshold[ will be printed as-is
+  const double simple_threshold = 0.01;
 
   if (val > big_threshold) {
     // Positive powers
@@ -62,14 +65,16 @@ void ToExponentAndMantissa(double val, double thresh, int precision,
     *exponent = 0;
   } else if (val < small_threshold) {
     // Negative powers
-    double scaled = val;
-    for (size_t i = 0; i < arraysize(kSmallSIUnits); ++i) {
-      scaled *= one_k;
-      if (scaled >= small_threshold) {
-        mantissa_stream << scaled;
-        *exponent = -static_cast<int64_t>(i + 1);
-        *mantissa = mantissa_stream.str();
-        return;
+    if (val < simple_threshold) {
+      double scaled = val;
+      for (size_t i = 0; i < arraysize(kSmallSIUnits); ++i) {
+        scaled *= one_k;
+        if (scaled >= small_threshold) {
+          mantissa_stream << scaled;
+          *exponent = -static_cast<int64_t>(i + 1);
+          *mantissa = mantissa_stream.str();
+          return;
+        }
       }
     }
     mantissa_stream << val;
@@ -96,30 +101,31 @@ std::string ExponentToPrefix(int64_t exponent, bool iec) {
 }
 
 std::string ToBinaryStringFullySpecified(double value, double threshold,
-                                         int precision) {
+                                         int precision, double one_k = 1024.0) {
   std::string mantissa;
   int64_t exponent;
-  ToExponentAndMantissa(value, threshold, precision, 1024.0, &mantissa,
+  ToExponentAndMantissa(value, threshold, precision, one_k, &mantissa,
                         &exponent);
   return mantissa + ExponentToPrefix(exponent, false);
 }
 
+}  // end namespace
+
 void AppendHumanReadable(int n, std::string* str) {
   std::stringstream ss;
   // Round down to the nearest SI prefix.
-  ss << "/" << ToBinaryStringFullySpecified(n, 1.0, 0);
+  ss << ToBinaryStringFullySpecified(n, 1.0, 0);
   *str += ss.str();
 }
 
-std::string HumanReadableNumber(double n) {
+std::string HumanReadableNumber(double n, double one_k) {
   // 1.1 means that figures up to 1.1k should be shown with the next unit down;
   // this softens edge effects.
   // 1 means that we should show one decimal place of precision.
-  return ToBinaryStringFullySpecified(n, 1.1, 1);
+  return ToBinaryStringFullySpecified(n, 1.1, 1, one_k);
 }
 
-std::string StringPrintFImp(const char *msg, va_list args)
-{
+std::string StrFormatImp(const char* msg, va_list args) {
   // we might need a second shot at this, so pre-emptivly make a copy
   va_list args_cp;
   va_copy(args_cp, args);
@@ -128,14 +134,14 @@ std::string StringPrintFImp(const char *msg, va_list args)
   // allocation guess what the size might be
   std::array<char, 256> local_buff;
   std::size_t size = local_buff.size();
-  // 2015-10-08: vsnprintf is used instead of snd::vsnprintf due to a limitation in the android-ndk
+  // 2015-10-08: vsnprintf is used instead of std::vsnprintf due to a limitation
+  // in the android-ndk
   auto ret = vsnprintf(local_buff.data(), size, msg, args_cp);
 
   va_end(args_cp);
 
   // handle empty expansion
-  if (ret == 0)
-    return std::string{};
+  if (ret == 0) return std::string{};
   if (static_cast<std::size_t>(ret) < size)
     return std::string(local_buff.data());
 
@@ -143,27 +149,107 @@ std::string StringPrintFImp(const char *msg, va_list args)
   // add 1 to size to account for null-byte in size cast to prevent overflow
   size = static_cast<std::size_t>(ret) + 1;
   auto buff_ptr = std::unique_ptr<char[]>(new char[size]);
-  // 2015-10-08: vsnprintf is used instead of snd::vsnprintf due to a limitation in the android-ndk
+  // 2015-10-08: vsnprintf is used instead of std::vsnprintf due to a limitation
+  // in the android-ndk
   ret = vsnprintf(buff_ptr.get(), size, msg, args);
   return std::string(buff_ptr.get());
 }
 
-std::string StringPrintF(const char* format, ...)
-{
+std::string StrFormat(const char* format, ...) {
   va_list args;
   va_start(args, format);
-  std::string tmp = StringPrintFImp(format, args);
+  std::string tmp = StrFormatImp(format, args);
   va_end(args);
   return tmp;
 }
 
-void ReplaceAll(std::string* str, const std::string& from,
-                const std::string& to) {
-  std::size_t start = 0;
-  while((start = str->find(from, start)) != std::string::npos) {
-    str->replace(start, from.length(), to);
-    start += to.length();
+#ifdef BENCHMARK_STL_ANDROID_GNUSTL
+/*
+ * GNU STL in Android NDK lacks support for some C++11 functions, including
+ * stoul, stoi, stod. We reimplement them here using C functions strtoul,
+ * strtol, strtod. Note that reimplemented functions are in benchmark::
+ * namespace, not std:: namespace.
+ */
+unsigned long stoul(const std::string& str, size_t* pos, int base) {
+  /* Record previous errno */
+  const int oldErrno = errno;
+  errno = 0;
+
+  const char* strStart = str.c_str();
+  char* strEnd = const_cast<char*>(strStart);
+  const unsigned long result = strtoul(strStart, &strEnd, base);
+
+  const int strtoulErrno = errno;
+  /* Restore previous errno */
+  errno = oldErrno;
+
+  /* Check for errors and return */
+  if (strtoulErrno == ERANGE) {
+    throw std::out_of_range(
+      "stoul failed: " + str + " is outside of range of unsigned long");
+  } else if (strEnd == strStart || strtoulErrno != 0) {
+    throw std::invalid_argument(
+      "stoul failed: " + str + " is not an integer");
+  }
+  if (pos != nullptr) {
+    *pos = static_cast<size_t>(strEnd - strStart);
+  }
+  return result;
+}
+
+int stoi(const std::string& str, size_t* pos, int base) {
+  /* Save errno and clear it so only strtol's own error is observed below. */
+  const int oldErrno = errno;
+  errno = 0;
+
+  const char* strStart = str.c_str();
+  char* strEnd = const_cast<char*>(strStart);
+  const long result = strtol(strStart, &strEnd, base);
+
+  const int strtolErrno = errno;
+  /* Restore the caller's errno; this function reports errors by throwing. */
+  errno = oldErrno;
+
+  /* Out of range if strtol said so or the long does not round-trip via int. */
+  if (strtolErrno == ERANGE || long(int(result)) != result) {
+    throw std::out_of_range(
+      "stoi failed: " + str + " is outside of range of int");
+  } else if (strEnd == strStart || strtolErrno != 0) { /* nothing parsed */
+    throw std::invalid_argument(
+      "stoi failed: " + str + " is not an integer");
+  }
+  if (pos != nullptr) { /* report how many characters were consumed */
+    *pos = static_cast<size_t>(strEnd - strStart);
+  }
+  return int(result);
+}
+
+double stod(const std::string& str, size_t* pos) {
+  /* Save errno and clear it so only strtod's own error is observed below. */
+  const int oldErrno = errno;
+  errno = 0;
+
+  const char* strStart = str.c_str();
+  char* strEnd = const_cast<char*>(strStart);
+  const double result = strtod(strStart, &strEnd);
+
+  /* Capture strtod's errno, then restore the caller's value. */
+  const int strtodErrno = errno;
+  errno = oldErrno;
+
+  /* ERANGE -> out_of_range; nothing parsed or other errno -> invalid. */
+  if (strtodErrno == ERANGE) {
+    throw std::out_of_range(
+      "stod failed: " + str + " is outside of range of double");
+  } else if (strEnd == strStart || strtodErrno != 0) {
+    throw std::invalid_argument(
+      "stod failed: " + str + " is not a number");
+  }
+  if (pos != nullptr) { /* report how many characters were consumed */
+    *pos = static_cast<size_t>(strEnd - strStart);
   }
+  return result;
 }
+#endif
 
-} // end namespace benchmark
+}  // end namespace benchmark
diff --git a/src/string_util.h b/src/string_util.h
index b89fef5..09d7b4b 100644
--- a/src/string_util.h
+++ b/src/string_util.h
@@ -1,8 +1,8 @@
 #ifndef BENCHMARK_STRING_UTIL_H_
 #define BENCHMARK_STRING_UTIL_H_
 
-#include <string>
 #include <sstream>
+#include <string>
 #include <utility>
 #include "internal_macros.h"
 
@@ -10,35 +10,50 @@ namespace benchmark {
 
 void AppendHumanReadable(int n, std::string* str);
 
-std::string HumanReadableNumber(double n);
+std::string HumanReadableNumber(double n, double one_k = 1024.0);
 
-std::string StringPrintF(const char* format, ...);
+#if defined(__MINGW32__)
+__attribute__((format(__MINGW_PRINTF_FORMAT, 1, 2)))
+#elif defined(__GNUC__)
+__attribute__((format(printf, 1, 2)))
+#endif
+std::string
+StrFormat(const char* format, ...);
 
-inline std::ostream&
-StringCatImp(std::ostream& out) BENCHMARK_NOEXCEPT
-{
+inline std::ostream& StrCatImp(std::ostream& out) BENCHMARK_NOEXCEPT {
   return out;
 }
 
-template <class First, class ...Rest>
-inline std::ostream&
-StringCatImp(std::ostream& out, First&& f, Rest&&... rest)
-{
+template <class First, class... Rest>
+inline std::ostream& StrCatImp(std::ostream& out, First&& f, Rest&&... rest) {
   out << std::forward<First>(f);
-  return StringCatImp(out, std::forward<Rest>(rest)...);
+  return StrCatImp(out, std::forward<Rest>(rest)...);
 }
 
-template<class ...Args>
-inline std::string StrCat(Args&&... args)
-{
+template <class... Args>
+inline std::string StrCat(Args&&... args) {
   std::ostringstream ss;
-  StringCatImp(ss, std::forward<Args>(args)...);
+  StrCatImp(ss, std::forward<Args>(args)...);
   return ss.str();
 }
 
-void ReplaceAll(std::string* str, const std::string& from,
-                const std::string& to);
-
-} // end namespace benchmark
-
-#endif // BENCHMARK_STRING_UTIL_H_
+#ifdef BENCHMARK_STL_ANDROID_GNUSTL
+/*
+ * GNU STL in Android NDK lacks support for some C++11 functions, including
+ * stoul, stoi, stod. We reimplement them here using C functions strtoul,
+ * strtol, strtod. Note that reimplemented functions are in benchmark::
+ * namespace, not std:: namespace.
+ */
+unsigned long stoul(const std::string& str, size_t* pos = nullptr,
+                           int base = 10);
+int stoi(const std::string& str, size_t* pos = nullptr, int base = 10);
+double stod(const std::string& str, size_t* pos = nullptr);
+#else
+using std::stoul;
+using std::stoi;
+using std::stod;
+#endif
+
+}  // end namespace benchmark
+
+#endif  // BENCHMARK_STRING_UTIL_H_
diff --git a/src/sysinfo.cc b/src/sysinfo.cc
index 3a5d942..b30b4f8 100644
--- a/src/sysinfo.cc
+++ b/src/sysinfo.cc
@@ -12,34 +12,54 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "sysinfo.h"
 #include "internal_macros.h"
 
 #ifdef BENCHMARK_OS_WINDOWS
-#include <Shlwapi.h>
-#include <Windows.h>
-#include <VersionHelpers.h>
+#include <shlwapi.h>
+#undef StrCat  // Don't let StrCat in string_util.h be renamed to lstrcatA
+#include <versionhelpers.h>
+#include <windows.h>
+#include <codecvt>
 #else
 #include <fcntl.h>
+#ifndef BENCHMARK_OS_FUCHSIA
 #include <sys/resource.h>
-#include <sys/types.h> // this header must be included before 'sys/sysctl.h' to avoid compilation error on FreeBSD
+#endif
 #include <sys/time.h>
+#include <sys/types.h>  // this header must be included before 'sys/sysctl.h' to avoid compilation error on FreeBSD
 #include <unistd.h>
-#if defined BENCHMARK_OS_FREEBSD || defined BENCHMARK_OS_MACOSX
+#if defined BENCHMARK_OS_FREEBSD || defined BENCHMARK_OS_MACOSX || \
+    defined BENCHMARK_OS_NETBSD || defined BENCHMARK_OS_OPENBSD || \
+    defined BENCHMARK_OS_DRAGONFLY
+#define BENCHMARK_HAS_SYSCTL
 #include <sys/sysctl.h>
 #endif
 #endif
+#if defined(BENCHMARK_OS_SOLARIS)
+#include <kstat.h>
+#endif
+#if defined(BENCHMARK_OS_QNX)
+#include <sys/syspage.h>
+#endif
 
+#include <algorithm>
+#include <array>
+#include <bitset>
 #include <cerrno>
-#include <cstdio>
+#include <climits>
 #include <cstdint>
+#include <cstdio>
 #include <cstdlib>
 #include <cstring>
+#include <fstream>
 #include <iostream>
+#include <iterator>
 #include <limits>
-#include <mutex>
+#include <memory>
+#include <sstream>
+#include <locale>
+#include <utility>
 
-#include "arraysize.h"
 #include "check.h"
 #include "cycleclock.h"
 #include "internal_macros.h"
@@ -49,196 +69,558 @@
 
 namespace benchmark {
 namespace {
-std::once_flag cpuinfo_init;
-double cpuinfo_cycles_per_second = 1.0;
-int cpuinfo_num_cpus = 1;  // Conservative guess
-std::mutex cputimens_mutex;
-
-#if !defined BENCHMARK_OS_MACOSX
-const int64_t estimate_time_ms = 1000;
-
-// Helper function estimates cycles/sec by observing cycles elapsed during
-// sleep(). Using small sleep time decreases accuracy significantly.
-int64_t EstimateCyclesPerSecond() {
-  const int64_t start_ticks = cycleclock::Now();
-  SleepForMilliseconds(estimate_time_ms);
-  return cycleclock::Now() - start_ticks;
+
+void PrintImp(std::ostream& out) { out << std::endl; }
+
+template <class First, class... Rest>
+void PrintImp(std::ostream& out, First&& f, Rest&&... rest) {
+  out << std::forward<First>(f);
+  PrintImp(out, std::forward<Rest>(rest)...);
+}
+
+template <class... Args>
+BENCHMARK_NORETURN void PrintErrorAndDie(Args&&... args) {
+  PrintImp(std::cerr, std::forward<Args>(args)...);
+  std::exit(EXIT_FAILURE);
 }
+
+#ifdef BENCHMARK_HAS_SYSCTL
+
+/// ValueUnion - A type used to correctly alias the byte-for-byte output of
+/// `sysctl` with the result type it's to be interpreted as.
+struct ValueUnion {
+  union DataT {
+    uint32_t uint32_value;
+    uint64_t uint64_value;
+    // For correct aliasing of union members from bytes.
+    char bytes[8];
+  };
+  using DataPtr = std::unique_ptr<DataT, decltype(&std::free)>;
+
+  // The size of the data union member + its trailing array size.
+  size_t Size;
+  DataPtr Buff;
+
+ public:
+  ValueUnion() : Size(0), Buff(nullptr, &std::free) {}
+
+  explicit ValueUnion(size_t BuffSize)
+      : Size(sizeof(DataT) + BuffSize),
+        Buff(::new (std::malloc(Size)) DataT(), &std::free) {}
+
+  ValueUnion(ValueUnion&& other) = default;
+
+  explicit operator bool() const { return bool(Buff); }
+
+  char* data() const { return Buff->bytes; }
+
+  std::string GetAsString() const { return std::string(data()); }
+
+  int64_t GetAsInteger() const {
+    if (Size == sizeof(Buff->uint32_value))
+      return static_cast<int32_t>(Buff->uint32_value);
+    else if (Size == sizeof(Buff->uint64_value))
+      return static_cast<int64_t>(Buff->uint64_value);
+    BENCHMARK_UNREACHABLE();
+  }
+
+  uint64_t GetAsUnsigned() const {
+    if (Size == sizeof(Buff->uint32_value))
+      return Buff->uint32_value;
+    else if (Size == sizeof(Buff->uint64_value))
+      return Buff->uint64_value;
+    BENCHMARK_UNREACHABLE();
+  }
+
+  template <class T, int N>
+  std::array<T, N> GetAsArray() {
+    const int ArrSize = sizeof(T) * N;
+    CHECK_LE(ArrSize, Size);
+    std::array<T, N> Arr;
+    std::memcpy(Arr.data(), data(), ArrSize);
+    return Arr;
+  }
+};
+
+ValueUnion GetSysctlImp(std::string const& Name) {
+#if defined BENCHMARK_OS_OPENBSD
+  int mib[2];
+
+  mib[0] = CTL_HW;
+  if ((Name == "hw.ncpu") || (Name == "hw.cpuspeed")){
+    ValueUnion buff(sizeof(int));
+
+    if (Name == "hw.ncpu") {
+      mib[1] = HW_NCPU;
+    } else {
+      mib[1] = HW_CPUSPEED;
+    }
+
+    if (sysctl(mib, 2, buff.data(), &buff.Size, nullptr, 0) == -1) {
+      return ValueUnion();
+    }
+    return buff;
+  }
+  return ValueUnion();
+#else
+  size_t CurBuffSize = 0;
+  if (sysctlbyname(Name.c_str(), nullptr, &CurBuffSize, nullptr, 0) == -1)
+    return ValueUnion();
+
+  ValueUnion buff(CurBuffSize);
+  if (sysctlbyname(Name.c_str(), buff.data(), &buff.Size, nullptr, 0) == 0)
+    return buff;
+  return ValueUnion();
 #endif
+}
 
-#if defined BENCHMARK_OS_LINUX || defined BENCHMARK_OS_CYGWIN
-// Helper function for reading an int from a file. Returns true if successful
-// and the memory location pointed to by value is set to the value read.
-bool ReadIntFromFile(const char* file, long* value) {
-  bool ret = false;
-  int fd = open(file, O_RDONLY);
-  if (fd != -1) {
-    char line[1024];
-    char* err;
-    memset(line, '\0', sizeof(line));
-    CHECK(read(fd, line, sizeof(line) - 1));
-    const long temp_value = strtol(line, &err, 10);
-    if (line[0] != '\0' && (*err == '\n' || *err == '\0')) {
-      *value = temp_value;
-      ret = true;
+BENCHMARK_MAYBE_UNUSED
+bool GetSysctl(std::string const& Name, std::string* Out) {
+  Out->clear();
+  auto Buff = GetSysctlImp(Name);
+  if (!Buff) return false;
+  Out->assign(Buff.data());
+  return true;
+}
+
+template <class Tp,
+          class = typename std::enable_if<std::is_integral<Tp>::value>::type>
+bool GetSysctl(std::string const& Name, Tp* Out) {
+  *Out = 0;
+  auto Buff = GetSysctlImp(Name);
+  if (!Buff) return false;
+  *Out = static_cast<Tp>(Buff.GetAsUnsigned());
+  return true;
+}
+
+template <class Tp, size_t N>
+bool GetSysctl(std::string const& Name, std::array<Tp, N>* Out) {
+  auto Buff = GetSysctlImp(Name);
+  if (!Buff) return false;
+  *Out = Buff.GetAsArray<Tp, N>();
+  return true;
+}
+#endif
+
+template <class ArgT>
+bool ReadFromFile(std::string const& fname, ArgT* arg) {
+  *arg = ArgT();
+  std::ifstream f(fname.c_str());
+  if (!f.is_open()) return false;
+  f >> *arg;
+  return f.good();
+}
+
+CPUInfo::Scaling CpuScaling(int num_cpus) {
+  // We don't have a valid CPU count, so don't even bother.
+  if (num_cpus <= 0) return CPUInfo::Scaling::UNKNOWN;
+#ifdef BENCHMARK_OS_QNX
+  return CPUInfo::Scaling::UNKNOWN;
+#endif
+#ifndef BENCHMARK_OS_WINDOWS
+  // On Linux, the CPUfreq subsystem exposes CPU information as files on the
+  // local file system. If reading the exported files fails, then we may not be
+  // running on Linux, so we silently ignore all the read errors.
+  std::string res;
+  for (int cpu = 0; cpu < num_cpus; ++cpu) {
+    std::string governor_file =
+        StrCat("/sys/devices/system/cpu/cpu", cpu, "/cpufreq/scaling_governor");
+    if (ReadFromFile(governor_file, &res) && res != "performance") return CPUInfo::Scaling::ENABLED;
+  }
+  return CPUInfo::Scaling::DISABLED;
+#endif
+  return CPUInfo::Scaling::UNKNOWN;
+}
+
+int CountSetBitsInCPUMap(std::string Val) {
+  auto CountBits = [](std::string Part) {
+    using CPUMask = std::bitset<sizeof(std::uintptr_t) * CHAR_BIT>;
+    Part = "0x" + Part;
+    CPUMask Mask(benchmark::stoul(Part, nullptr, 16));
+    return static_cast<int>(Mask.count());
+  };
+  size_t Pos;
+  int total = 0;
+  while ((Pos = Val.find(',')) != std::string::npos) {
+    total += CountBits(Val.substr(0, Pos));
+    Val = Val.substr(Pos + 1);
+  }
+  if (!Val.empty()) {
+    total += CountBits(Val);
+  }
+  return total;
+}
+
+BENCHMARK_MAYBE_UNUSED
+std::vector<CPUInfo::CacheInfo> GetCacheSizesFromKVFS() {
+  std::vector<CPUInfo::CacheInfo> res;
+  std::string dir = "/sys/devices/system/cpu/cpu0/cache/";
+  int Idx = 0;
+  while (true) {
+    CPUInfo::CacheInfo info;
+    std::string FPath = StrCat(dir, "index", Idx++, "/");
+    std::ifstream f(StrCat(FPath, "size").c_str());
+    if (!f.is_open()) break;
+    std::string suffix;
+    f >> info.size;
+    if (f.fail())
+      PrintErrorAndDie("Failed while reading file '", FPath, "size'");
+    if (f.good()) {
+      f >> suffix;
+      if (f.bad())
+        PrintErrorAndDie(
+            "Invalid cache size format: failed to read size suffix");
+      else if (f && suffix != "K")
+        PrintErrorAndDie("Invalid cache size format: Expected bytes ", suffix);
+      else if (suffix == "K")
+        info.size *= 1024;
+    }
+    if (!ReadFromFile(StrCat(FPath, "type"), &info.type))
+      PrintErrorAndDie("Failed to read from file ", FPath, "type");
+    if (!ReadFromFile(StrCat(FPath, "level"), &info.level))
+      PrintErrorAndDie("Failed to read from file ", FPath, "level");
+    std::string map_str;
+    if (!ReadFromFile(StrCat(FPath, "shared_cpu_map"), &map_str))
+      PrintErrorAndDie("Failed to read from file ", FPath, "shared_cpu_map");
+    info.num_sharing = CountSetBitsInCPUMap(map_str);
+    res.push_back(info);
+  }
+
+  return res;
+}
+
+#ifdef BENCHMARK_OS_MACOSX
+std::vector<CPUInfo::CacheInfo> GetCacheSizesMacOSX() {
+  std::vector<CPUInfo::CacheInfo> res;
+  std::array<uint64_t, 4> CacheCounts{{0, 0, 0, 0}};
+  GetSysctl("hw.cacheconfig", &CacheCounts);
+
+  struct {
+    std::string name;
+    std::string type;
+    int level;
+    uint64_t num_sharing;
+  } Cases[] = {{"hw.l1dcachesize", "Data", 1, CacheCounts[1]},
+               {"hw.l1icachesize", "Instruction", 1, CacheCounts[1]},
+               {"hw.l2cachesize", "Unified", 2, CacheCounts[2]},
+               {"hw.l3cachesize", "Unified", 3, CacheCounts[3]}};
+  for (auto& C : Cases) {
+    int val;
+    if (!GetSysctl(C.name, &val)) continue;
+    CPUInfo::CacheInfo info;
+    info.type = C.type;
+    info.level = C.level;
+    info.size = val;
+    info.num_sharing = static_cast<int>(C.num_sharing);
+    res.push_back(std::move(info));
+  }
+  return res;
+}
+#elif defined(BENCHMARK_OS_WINDOWS)
+std::vector<CPUInfo::CacheInfo> GetCacheSizesWindows() {
+  std::vector<CPUInfo::CacheInfo> res;
+  DWORD buffer_size = 0;
+  using PInfo = SYSTEM_LOGICAL_PROCESSOR_INFORMATION;
+  using CInfo = CACHE_DESCRIPTOR;
+
+  using UPtr = std::unique_ptr<PInfo, decltype(&std::free)>;
+  GetLogicalProcessorInformation(nullptr, &buffer_size);
+  UPtr buff((PInfo*)malloc(buffer_size), &std::free);
+  if (!GetLogicalProcessorInformation(buff.get(), &buffer_size))
+    PrintErrorAndDie("Failed during call to GetLogicalProcessorInformation: ",
+                     GetLastError());
+
+  PInfo* it = buff.get();
+  PInfo* end = buff.get() + (buffer_size / sizeof(PInfo));
+
+  for (; it != end; ++it) {
+    if (it->Relationship != RelationCache) continue;
+    using BitSet = std::bitset<sizeof(ULONG_PTR) * CHAR_BIT>;
+    BitSet B(it->ProcessorMask);
+    // To prevent duplicates, only consider caches where CPU 0 is specified
+    if (!B.test(0)) continue;
+    CInfo* Cache = &it->Cache;
+    CPUInfo::CacheInfo C;
+    C.num_sharing = static_cast<int>(B.count());
+    C.level = Cache->Level;
+    C.size = Cache->Size;
+    switch (Cache->Type) {
+      case CacheUnified:
+        C.type = "Unified";
+        break;
+      case CacheInstruction:
+        C.type = "Instruction";
+        break;
+      case CacheData:
+        C.type = "Data";
+        break;
+      case CacheTrace:
+        C.type = "Trace";
+        break;
+      default:
+        C.type = "Unknown";
+        break;
+    }
+    res.push_back(C);
+  }
+  return res;
+}
+#elif BENCHMARK_OS_QNX
+std::vector<CPUInfo::CacheInfo> GetCacheSizesQNX() {
+  std::vector<CPUInfo::CacheInfo> res;
+  struct cacheattr_entry *cache = SYSPAGE_ENTRY(cacheattr);
+  uint32_t const elsize = SYSPAGE_ELEMENT_SIZE(cacheattr);
+  int num = SYSPAGE_ENTRY_SIZE(cacheattr) / elsize ;
+  for(int i = 0; i < num; ++i ) {
+    CPUInfo::CacheInfo info;
+    switch (cache->flags){  // NOTE(review): exact match on flags - confirm
+      case CACHE_FLAG_INSTR :
+        info.type = "Instruction";
+        info.level = 1;
+        break;
+      case CACHE_FLAG_DATA :
+        info.type = "Data";
+        info.level = 1;
+        break;
+      case CACHE_FLAG_UNIFIED :
+        info.type = "Unified";
+        info.level = 2;
+        break;
+      case CACHE_FLAG_SHARED :
+        info.type = "Shared";
+        info.level = 3;
+        break;
+      default :  // unrecognised entry: skip it, but still step to the next one
+        cache = SYSPAGE_ARRAY_ADJ_OFFSET(cacheattr, cache, elsize);
+        continue;
     }
-    close(fd);
+    info.size = cache->line_size * cache->num_lines;
+    info.num_sharing = 0;
+    res.push_back(std::move(info));
+    cache = SYSPAGE_ARRAY_ADJ_OFFSET(cacheattr, cache, elsize);
   }
-  return ret;
+  return res;
 }
 #endif
 
-void InitializeSystemInfo() {
+std::vector<CPUInfo::CacheInfo> GetCacheSizes() {
+#ifdef BENCHMARK_OS_MACOSX
+  return GetCacheSizesMacOSX();
+#elif defined(BENCHMARK_OS_WINDOWS)
+  return GetCacheSizesWindows();
+#elif defined(BENCHMARK_OS_QNX)
+  return GetCacheSizesQNX();
+#else
+  return GetCacheSizesFromKVFS();
+#endif
+}
+
+std::string GetSystemName() {
+#if defined(BENCHMARK_OS_WINDOWS)
+  std::string str;
+  const unsigned COUNT = MAX_COMPUTERNAME_LENGTH+1;
+  TCHAR  hostname[COUNT] = {'\0'};
+  DWORD DWCOUNT = COUNT;
+  if (!GetComputerName(hostname, &DWCOUNT))
+    return std::string("");
+#ifndef UNICODE
+  str = std::string(hostname, DWCOUNT);
+#else
+  // wstring_convert is deprecated in C++17; used here to emit UTF-8.
+  using convert_type = std::codecvt_utf8<wchar_t>;
+  std::wstring_convert<convert_type, wchar_t> converter;
+  std::wstring wStr(hostname, DWCOUNT);
+  str = converter.to_bytes(wStr);
+#endif
+  return str;
+#else // defined(BENCHMARK_OS_WINDOWS)
+#ifndef HOST_NAME_MAX
+#ifdef BENCHMARK_HAS_SYSCTL // BSD/Mac don't have HOST_NAME_MAX defined
+#define HOST_NAME_MAX 64
+#elif defined(BENCHMARK_OS_NACL)
+#define HOST_NAME_MAX 64
+#elif defined(BENCHMARK_OS_QNX)
+#define HOST_NAME_MAX 154
+#elif defined(BENCHMARK_OS_RTEMS)
+#define HOST_NAME_MAX 256
+#else
+#warning "HOST_NAME_MAX not defined. using 64"
+#define HOST_NAME_MAX 64
+#endif
+#endif // ndef HOST_NAME_MAX
+  char hostname[HOST_NAME_MAX + 1];  // HOST_NAME_MAX excludes the terminator
+  if (gethostname(hostname, sizeof(hostname)) != 0) return std::string("");
+  hostname[HOST_NAME_MAX] = '\0';  // truncated names may lack a terminator
+  return std::string(hostname);
+#endif // Catch-all POSIX block.
+}
+
+int GetNumCPUs() {
+#ifdef BENCHMARK_HAS_SYSCTL
+  int NumCPU = -1;
+  if (GetSysctl("hw.ncpu", &NumCPU)) return NumCPU;
+  fprintf(stderr, "Err: %s\n", strerror(errno));
+  std::exit(EXIT_FAILURE);
+#elif defined(BENCHMARK_OS_WINDOWS)
+  SYSTEM_INFO sysinfo;
+  // Use memset as opposed to = {} to avoid GCC missing initializer false
+  // positives.
+  std::memset(&sysinfo, 0, sizeof(SYSTEM_INFO));
+  GetSystemInfo(&sysinfo);
+  return sysinfo.dwNumberOfProcessors;  // number of logical
+                                        // processors in the current
+                                        // group
+#elif defined(BENCHMARK_OS_SOLARIS)
+  // Returns -1 in case of a failure.
+  int NumCPU = sysconf(_SC_NPROCESSORS_ONLN);
+  if (NumCPU < 0) {
+    fprintf(stderr,
+            "sysconf(_SC_NPROCESSORS_ONLN) failed with error: %s\n",
+            strerror(errno));
+  }
+  return NumCPU;
+#elif defined(BENCHMARK_OS_QNX)
+  return static_cast<int>(_syspage_ptr->num_cpu);
+#else
+  int NumCPUs = 0;
+  int MaxID = -1;
+  std::ifstream f("/proc/cpuinfo");
+  if (!f.is_open()) {
+    std::cerr << "failed to open /proc/cpuinfo\n";
+    return -1;
+  }
+  const std::string Key = "processor";
+  std::string ln;
+  while (std::getline(f, ln)) {
+    if (ln.empty()) continue;
+    size_t SplitIdx = ln.find(':');
+    std::string value;
+#if defined(__s390__)
+    // s390 has another format in /proc/cpuinfo
+    // it needs to be parsed differently
+    if (SplitIdx != std::string::npos) value = ln.substr(Key.size()+1,SplitIdx-Key.size()-1);
+#else
+    if (SplitIdx != std::string::npos) value = ln.substr(SplitIdx + 1);
+#endif
+    if (ln.size() >= Key.size() && ln.compare(0, Key.size(), Key) == 0) {
+      NumCPUs++;
+      if (!value.empty()) {
+        int CurID = benchmark::stoi(value);
+        MaxID = std::max(CurID, MaxID);
+      }
+    }
+  }
+  if (f.bad()) {
+    std::cerr << "Failure reading /proc/cpuinfo\n";
+    return -1;
+  }
+  if (!f.eof()) {
+    std::cerr << "Failed to read to end of /proc/cpuinfo\n";
+    return -1;
+  }
+  f.close();
+
+  if ((MaxID + 1) != NumCPUs) {
+    fprintf(stderr,
+            "CPU ID assignments in /proc/cpuinfo seem messed up."
+            " This is usually caused by a bad BIOS.\n");
+  }
+  return NumCPUs;
+#endif
+  BENCHMARK_UNREACHABLE();
+}
+
+double GetCPUCyclesPerSecond() {
 #if defined BENCHMARK_OS_LINUX || defined BENCHMARK_OS_CYGWIN
-  char line[1024];
-  char* err;
   long freq;
 
-  bool saw_mhz = false;
-
   // If the kernel is exporting the tsc frequency use that. There are issues
   // where cpuinfo_max_freq cannot be relied on because the BIOS may be
   // exporintg an invalid p-state (on x86) or p-states may be used to put the
   // processor in a new mode (turbo mode). Essentially, those frequencies
   // cannot always be relied upon. The same reasons apply to /proc/cpuinfo as
   // well.
-  if (!saw_mhz &&
-      ReadIntFromFile("/sys/devices/system/cpu/cpu0/tsc_freq_khz", &freq)) {
+  if (ReadFromFile("/sys/devices/system/cpu/cpu0/tsc_freq_khz", &freq)
+      // If CPU scaling is in effect, we want to use the *maximum* frequency,
+      // not whatever CPU speed some random processor happens to be using now.
+      || ReadFromFile("/sys/devices/system/cpu/cpu0/cpufreq/cpuinfo_max_freq",
+                      &freq)) {
     // The value is in kHz (as the file name suggests).  For example, on a
     // 2GHz warpstation, the file contains the value "2000000".
-    cpuinfo_cycles_per_second = freq * 1000.0;
-    saw_mhz = true;
+    return freq * 1000.0;
   }
 
-  // If CPU scaling is in effect, we want to use the *maximum* frequency,
-  // not whatever CPU speed some random processor happens to be using now.
-  if (!saw_mhz &&
-      ReadIntFromFile("/sys/devices/system/cpu/cpu0/cpufreq/cpuinfo_max_freq",
-                      &freq)) {
-    // The value is in kHz.  For example, on a 2GHz warpstation, the file
-    // contains the value "2000000".
-    cpuinfo_cycles_per_second = freq * 1000.0;
-    saw_mhz = true;
-  }
+  const double error_value = -1;
+  double bogo_clock = error_value;
 
-  // Read /proc/cpuinfo for other values, and if there is no cpuinfo_max_freq.
-  const char* pname = "/proc/cpuinfo";
-  int fd = open(pname, O_RDONLY);
-  if (fd == -1) {
-    perror(pname);
-    if (!saw_mhz) {
-      cpuinfo_cycles_per_second = static_cast<double>(EstimateCyclesPerSecond());
-    }
-    return;
+  std::ifstream f("/proc/cpuinfo");
+  if (!f.is_open()) {
+    std::cerr << "failed to open /proc/cpuinfo\n";
+    return error_value;
   }
 
-  double bogo_clock = 1.0;
-  bool saw_bogo = false;
-  long max_cpu_id = 0;
-  int num_cpus = 0;
-  line[0] = line[1] = '\0';
-  size_t chars_read = 0;
-  do {  // we'll exit when the last read didn't read anything
-    // Move the next line to the beginning of the buffer
-    const size_t oldlinelen = strlen(line);
-    if (sizeof(line) == oldlinelen + 1)  // oldlinelen took up entire line
-      line[0] = '\0';
-    else  // still other lines left to save
-      memmove(line, line + oldlinelen + 1, sizeof(line) - (oldlinelen + 1));
-    // Terminate the new line, reading more if we can't find the newline
-    char* newline = strchr(line, '\n');
-    if (newline == nullptr) {
-      const size_t linelen = strlen(line);
-      const size_t bytes_to_read = sizeof(line) - 1 - linelen;
-      CHECK(bytes_to_read > 0);  // because the memmove recovered >=1 bytes
-      chars_read = read(fd, line + linelen, bytes_to_read);
-      line[linelen + chars_read] = '\0';
-      newline = strchr(line, '\n');
-    }
-    if (newline != nullptr) *newline = '\0';
-
+  auto startsWithKey = [](std::string const& Value, std::string const& Key) {
+    if (Key.size() > Value.size()) return false;
+    auto Cmp = [&](char X, char Y) {
+      return std::tolower(X) == std::tolower(Y);
+    };
+    return std::equal(Key.begin(), Key.end(), Value.begin(), Cmp);
+  };
+
+  std::string ln;
+  while (std::getline(f, ln)) {
+    if (ln.empty()) continue;
+    size_t SplitIdx = ln.find(':');
+    std::string value;
+    if (SplitIdx != std::string::npos) value = ln.substr(SplitIdx + 1);
     // When parsing the "cpu MHz" and "bogomips" (fallback) entries, we only
-    // accept postive values. Some environments (virtual machines) report zero,
+    // accept positive values. Some environments (virtual machines) report zero,
     // which would cause infinite looping in WallTime_Init.
-    if (!saw_mhz && strncasecmp(line, "cpu MHz", sizeof("cpu MHz") - 1) == 0) {
-      const char* freqstr = strchr(line, ':');
-      if (freqstr) {
-        cpuinfo_cycles_per_second = strtod(freqstr + 1, &err) * 1000000.0;
-        if (freqstr[1] != '\0' && *err == '\0' && cpuinfo_cycles_per_second > 0)
-          saw_mhz = true;
-      }
-    } else if (strncasecmp(line, "bogomips", sizeof("bogomips") - 1) == 0) {
-      const char* freqstr = strchr(line, ':');
-      if (freqstr) {
-        bogo_clock = strtod(freqstr + 1, &err) * 1000000.0;
-        if (freqstr[1] != '\0' && *err == '\0' && bogo_clock > 0)
-          saw_bogo = true;
+    if (startsWithKey(ln, "cpu MHz")) {
+      if (!value.empty()) {
+        double cycles_per_second = benchmark::stod(value) * 1000000.0;
+        if (cycles_per_second > 0) return cycles_per_second;
       }
-    } else if (strncmp(line, "processor", sizeof("processor") - 1) == 0) {
-      // The above comparison is case-sensitive because ARM kernels often
-      // include a "Processor" line that tells you about the CPU, distinct
-      // from the usual "processor" lines that give you CPU ids. No current
-      // Linux architecture is using "Processor" for CPU ids.
-      num_cpus++;  // count up every time we see an "processor :" entry
-      const char* id_str = strchr(line, ':');
-      if (id_str) {
-        const long cpu_id = strtol(id_str + 1, &err, 10);
-        if (id_str[1] != '\0' && *err == '\0' && max_cpu_id < cpu_id)
-          max_cpu_id = cpu_id;
+    } else if (startsWithKey(ln, "bogomips")) {
+      if (!value.empty()) {
+        bogo_clock = benchmark::stod(value) * 1000000.0;
+        if (bogo_clock < 0.0) bogo_clock = error_value;
       }
     }
-  } while (chars_read > 0);
-  close(fd);
-
-  if (!saw_mhz) {
-    if (saw_bogo) {
-      // If we didn't find anything better, we'll use bogomips, but
-      // we're not happy about it.
-      cpuinfo_cycles_per_second = bogo_clock;
-    } else {
-      // If we don't even have bogomips, we'll use the slow estimation.
-      cpuinfo_cycles_per_second = static_cast<double>(EstimateCyclesPerSecond());
-    }
   }
-  if (num_cpus == 0) {
-    fprintf(stderr, "Failed to read num. CPUs correctly from /proc/cpuinfo\n");
-  } else {
-    if ((max_cpu_id + 1) != num_cpus) {
-      fprintf(stderr,
-              "CPU ID assignments in /proc/cpuinfo seem messed up."
-              " This is usually caused by a bad BIOS.\n");
-    }
-    cpuinfo_num_cpus = num_cpus;
+  if (f.bad()) {
+    std::cerr << "Failure reading /proc/cpuinfo\n";
+    return error_value;
   }
-
-#elif defined BENCHMARK_OS_FREEBSD
-// For this sysctl to work, the machine must be configured without
-// SMP, APIC, or APM support.  hz should be 64-bit in freebsd 7.0
-// and later.  Before that, it's a 32-bit quantity (and gives the
-// wrong answer on machines faster than 2^32 Hz).  See
-//  http://lists.freebsd.org/pipermail/freebsd-i386/2004-November/001846.html
-// But also compare FreeBSD 7.0:
-//  http://fxr.watson.org/fxr/source/i386/i386/tsc.c?v=RELENG70#L223
-//  231         error = sysctl_handle_quad(oidp, &freq, 0, req);
-// To FreeBSD 6.3 (it's the same in 6-STABLE):
-//  http://fxr.watson.org/fxr/source/i386/i386/tsc.c?v=RELENG6#L131
-//  139         error = sysctl_handle_int(oidp, &freq, sizeof(freq), req);
-#if __FreeBSD__ >= 7
-  uint64_t hz = 0;
+  if (!f.eof()) {
+    std::cerr << "Failed to read to end of /proc/cpuinfo\n";
+    return error_value;
+  }
+  f.close();
+  // If we found the bogomips clock, but nothing better, we'll use it (but
+  // we're not happy about it); otherwise, fallback to the rough estimation
+  // below.
+  if (bogo_clock >= 0.0) return bogo_clock;
+
+#elif defined BENCHMARK_HAS_SYSCTL
+  constexpr auto* FreqStr =
+#if defined(BENCHMARK_OS_FREEBSD) || defined(BENCHMARK_OS_NETBSD)
+      "machdep.tsc_freq";
+#elif defined BENCHMARK_OS_OPENBSD
+      "hw.cpuspeed";
+#elif defined BENCHMARK_OS_DRAGONFLY
+      "hw.tsc_frequency";
 #else
-  unsigned int hz = 0;
+      "hw.cpufrequency";
 #endif
-  size_t sz = sizeof(hz);
-  const char* sysctl_path = "machdep.tsc_freq";
-  if (sysctlbyname(sysctl_path, &hz, &sz, nullptr, 0) != 0) {
-    fprintf(stderr, "Unable to determine clock rate from sysctl: %s: %s\n",
-            sysctl_path, strerror(errno));
-    cpuinfo_cycles_per_second = static_cast<double>(EstimateCyclesPerSecond());
-  } else {
-    cpuinfo_cycles_per_second = hz;
-  }
-// TODO: also figure out cpuinfo_num_cpus
-
+  unsigned long long hz = 0;
+#if defined BENCHMARK_OS_OPENBSD
+  if (GetSysctl(FreqStr, &hz)) return hz * 1000000;
+#else
+  if (GetSysctl(FreqStr, &hz)) return hz;
+#endif
+  fprintf(stderr, "Unable to determine clock rate from sysctl: %s: %s\n",
+          FreqStr, strerror(errno));
 
 #elif defined BENCHMARK_OS_WINDOWS
   // In NT, read MHz from the registry. If we fail to do so or we're in win9x
@@ -249,176 +631,86 @@ void InitializeSystemInfo() {
           SHGetValueA(HKEY_LOCAL_MACHINE,
                       "HARDWARE\\DESCRIPTION\\System\\CentralProcessor\\0",
                       "~MHz", nullptr, &data, &data_size)))
-    cpuinfo_cycles_per_second = static_cast<double>((int64_t)data * (int64_t)(1000 * 1000));  // was mhz
-  else
-    cpuinfo_cycles_per_second = static_cast<double>(EstimateCyclesPerSecond());
-
-  SYSTEM_INFO sysinfo = { 0 };
-  GetSystemInfo(&sysinfo);
-  cpuinfo_num_cpus = sysinfo.dwNumberOfProcessors; // number of logical processors in the current group
-
-#elif defined BENCHMARK_OS_MACOSX
-  // returning "mach time units" per second. the current number of elapsed
-  // mach time units can be found by calling uint64 mach_absolute_time();
-  // while not as precise as actual CPU cycles, it is accurate in the face
-  // of CPU frequency scaling and multi-cpu/core machines.
-  // Our mac users have these types of machines, and accuracy
-  // (i.e. correctness) trumps precision.
-  // See cycleclock.h: CycleClock::Now(), which returns number of mach time
-  // units on Mac OS X.
-  mach_timebase_info_data_t timebase_info;
-  mach_timebase_info(&timebase_info);
-  double mach_time_units_per_nanosecond =
-      static_cast<double>(timebase_info.denom) /
-      static_cast<double>(timebase_info.numer);
-  cpuinfo_cycles_per_second = mach_time_units_per_nanosecond * 1e9;
-
-  int num_cpus = 0;
-  size_t size = sizeof(num_cpus);
-  int numcpus_name[] = {CTL_HW, HW_NCPU};
-  if (::sysctl(numcpus_name, arraysize(numcpus_name), &num_cpus, &size, nullptr, 0) ==
-          0 &&
-      (size == sizeof(num_cpus)))
-    cpuinfo_num_cpus = num_cpus;
-
-#else
-  // Generic cycles per second counter
-  cpuinfo_cycles_per_second = static_cast<double>(EstimateCyclesPerSecond());
-#endif
-}
-}  // end namespace
-
-// getrusage() based implementation of MyCPUUsage
-static double MyCPUUsageRUsage() {
-#ifndef BENCHMARK_OS_WINDOWS
-  struct rusage ru;
-  if (getrusage(RUSAGE_SELF, &ru) == 0) {
-    return (static_cast<double>(ru.ru_utime.tv_sec) +
-            static_cast<double>(ru.ru_utime.tv_usec) * 1e-6 +
-            static_cast<double>(ru.ru_stime.tv_sec) +
-            static_cast<double>(ru.ru_stime.tv_usec) * 1e-6);
-  } else {
-    return 0.0;
+    return static_cast<double>((int64_t)data *
+                               (int64_t)(1000 * 1000));  // was mhz
+#elif defined (BENCHMARK_OS_SOLARIS)
+  kstat_ctl_t *kc = kstat_open();
+  if (!kc) {
+    std::cerr << "failed to open /dev/kstat\n";
+    return -1;
   }
-#else
-  HANDLE proc = GetCurrentProcess();
-  FILETIME creation_time;
-  FILETIME exit_time;
-  FILETIME kernel_time;
-  FILETIME user_time;
-  ULARGE_INTEGER kernel;
-  ULARGE_INTEGER user;
-  GetProcessTimes(proc, &creation_time, &exit_time, &kernel_time, &user_time);
-  kernel.HighPart = kernel_time.dwHighDateTime;
-  kernel.LowPart = kernel_time.dwLowDateTime;
-  user.HighPart = user_time.dwHighDateTime;
-  user.LowPart = user_time.dwLowDateTime;
-  return (static_cast<double>(kernel.QuadPart) +
-          static_cast<double>(user.QuadPart)) * 1e-7;
-#endif  // OS_WINDOWS
-}
-
-#ifndef BENCHMARK_OS_WINDOWS
-static bool MyCPUUsageCPUTimeNsLocked(double* cputime) {
-  static int cputime_fd = -1;
-  if (cputime_fd == -1) {
-    cputime_fd = open("/proc/self/cputime_ns", O_RDONLY);
-    if (cputime_fd < 0) {
-      cputime_fd = -1;
-      return false;
-    }
+  kstat_t *ksp = kstat_lookup(kc, (char*)"cpu_info", -1, (char*)"cpu_info0");
+  if (!ksp) {
+    std::cerr << "failed to lookup in /dev/kstat\n";
+    return -1;
   }
-  char buff[64];
-  memset(buff, 0, sizeof(buff));
-  if (pread(cputime_fd, buff, sizeof(buff) - 1, 0) <= 0) {
-    close(cputime_fd);
-    cputime_fd = -1;
-    return false;
+  if (kstat_read(kc, ksp, NULL) < 0) {
+    std::cerr << "failed to read from /dev/kstat\n";
+    return -1;
   }
-  unsigned long long result = strtoull(buff, nullptr, 0);
-  if (result == (std::numeric_limits<unsigned long long>::max)()) {
-    close(cputime_fd);
-    cputime_fd = -1;
-    return false;
+  kstat_named_t *knp =
+      (kstat_named_t*)kstat_data_lookup(ksp, (char*)"current_clock_Hz");
+  if (!knp) {
+    std::cerr << "failed to lookup data in /dev/kstat\n";
+    return -1;
   }
-  *cputime = static_cast<double>(result) / 1e9;
-  return true;
-}
-#endif  // OS_WINDOWS
-
-double MyCPUUsage() {
-#ifndef BENCHMARK_OS_WINDOWS
-  {
-    std::lock_guard<std::mutex> l(cputimens_mutex);
-    static bool use_cputime_ns = true;
-    if (use_cputime_ns) {
-      double value;
-      if (MyCPUUsageCPUTimeNsLocked(&value)) {
-        return value;
-      }
-      // Once MyCPUUsageCPUTimeNsLocked fails once fall back to getrusage().
-      VLOG(1) << "Reading /proc/self/cputime_ns failed. Using getrusage().\n";
-      use_cputime_ns = false;
-    }
+  if (knp->data_type != KSTAT_DATA_UINT64) {
+    std::cerr << "current_clock_Hz is of unexpected data type: "
+              << knp->data_type << "\n";
+    return -1;
   }
-#endif  // OS_WINDOWS
-  return MyCPUUsageRUsage();
+  double clock_hz = knp->value.ui64;
+  kstat_close(kc);
+  return clock_hz;
+#elif defined (BENCHMARK_OS_QNX)
+  return static_cast<double>((int64_t)(SYSPAGE_ENTRY(cpuinfo)->speed) *
+                             (int64_t)(1000 * 1000));
+#endif
+  // If we've fallen through, attempt to roughly estimate the CPU clock rate.
+  const int estimate_time_ms = 1000;
+  const auto start_ticks = cycleclock::Now();
+  SleepForMilliseconds(estimate_time_ms);
+  return static_cast<double>(cycleclock::Now() - start_ticks);
 }
 
-double ChildrenCPUUsage() {
-#ifndef BENCHMARK_OS_WINDOWS
-  struct rusage ru;
-  if (getrusage(RUSAGE_CHILDREN, &ru) == 0) {
-    return (static_cast<double>(ru.ru_utime.tv_sec) +
-            static_cast<double>(ru.ru_utime.tv_usec) * 1e-6 +
-            static_cast<double>(ru.ru_stime.tv_sec) +
-            static_cast<double>(ru.ru_stime.tv_usec) * 1e-6);
+std::vector<double> GetLoadAvg() {  // empty vector when unavailable
+#if (defined BENCHMARK_OS_FREEBSD || defined(BENCHMARK_OS_LINUX) ||     \
+     defined BENCHMARK_OS_MACOSX || defined BENCHMARK_OS_NETBSD ||      \
+     defined BENCHMARK_OS_OPENBSD || defined BENCHMARK_OS_DRAGONFLY) && \
+    !defined(__ANDROID__)
+  constexpr int kMaxSamples = 3;  // getloadavg(): 1-, 5-, 15-minute averages
+  std::vector<double> res(kMaxSamples, 0.0);
+  const int nelem = getloadavg(res.data(), kMaxSamples);  // count, or -1
+  if (nelem < 1) {
+    res.clear();  // nothing retrieved: report no samples
   } else {
-    return 0.0;
+    res.resize(nelem);  // keep only the samples actually filled in
   }
+  return res;
 #else
-  // TODO: Not sure what this even means on Windows
-  return 0.0;
-#endif  // OS_WINDOWS
+  return {};  // platform not covered by the #if above
+#endif
 }
 
-double CyclesPerSecond(void) {
-  std::call_once(cpuinfo_init, InitializeSystemInfo);
-  return cpuinfo_cycles_per_second;
-}
+}  // end namespace
 
-int NumCPUs(void) {
-  std::call_once(cpuinfo_init, InitializeSystemInfo);
-  return cpuinfo_num_cpus;
+const CPUInfo& CPUInfo::Get() {
+  static const CPUInfo* const instance = new CPUInfo();  // built on first call
+  return *instance;
 }
 
-// The ""'s catch people who don't pass in a literal for "str"
-#define strliterallen(str) (sizeof("" str "") - 1)
+CPUInfo::CPUInfo()  // gathers every field once, at construction
+    : num_cpus(GetNumCPUs()),
+      cycles_per_second(GetCPUCyclesPerSecond()),
+      caches(GetCacheSizes()),
+      scaling(CpuScaling(num_cpus)),  // reads num_cpus: verify decl order
+      load_avg(GetLoadAvg()) {}
 
-// Must use a string literal for prefix.
-#define memprefix(str, len, prefix)                       \
-  ((((len) >= strliterallen(prefix)) &&                   \
-    std::memcmp(str, prefix, strliterallen(prefix)) == 0) \
-       ? str + strliterallen(prefix)                      \
-       : nullptr)
 
-bool CpuScalingEnabled() {
-#ifndef BENCHMARK_OS_WINDOWS
-  // On Linux, the CPUfreq subsystem exposes CPU information as files on the
-  // local file system. If reading the exported files fails, then we may not be
-  // running on Linux, so we silently ignore all the read errors.
-  for (int cpu = 0, num_cpus = NumCPUs(); cpu < num_cpus; ++cpu) {
-    std::string governor_file = StrCat("/sys/devices/system/cpu/cpu", cpu,
-                                       "/cpufreq/scaling_governor");
-    FILE* file = fopen(governor_file.c_str(), "r");
-    if (!file) break;
-    char buff[16];
-    size_t bytes_read = fread(buff, 1, sizeof(buff), file);
-    fclose(file);
-    if (memprefix(buff, bytes_read, "performance") == nullptr) return true;
-  }
-#endif
-  return false;
+const SystemInfo& SystemInfo::Get() {
+  static const SystemInfo* const instance = new SystemInfo();  // made once
+  return *instance;
 }
 
+SystemInfo::SystemInfo() : name(GetSystemName()) {}  // name queried once
 }  // end namespace benchmark
diff --git a/src/sysinfo.h b/src/sysinfo.h
deleted file mode 100644
index eaf77e0..0000000
--- a/src/sysinfo.h
+++ /dev/null
@@ -1,12 +0,0 @@
-#ifndef BENCHMARK_SYSINFO_H_
-#define BENCHMARK_SYSINFO_H_
-
-namespace benchmark {
-double MyCPUUsage();
-double ChildrenCPUUsage();
-int NumCPUs();
-double CyclesPerSecond();
-bool CpuScalingEnabled();
-}  // end namespace benchmark
-
-#endif  // BENCHMARK_SYSINFO_H_
diff --git a/src/thread_manager.h b/src/thread_manager.h
new file mode 100644
index 0000000..28e2dd5
--- /dev/null
+++ b/src/thread_manager.h
@@ -0,0 +1,64 @@
+#ifndef BENCHMARK_THREAD_MANAGER_H
+#define BENCHMARK_THREAD_MANAGER_H
+
+#include <atomic>
+
+#include "benchmark/benchmark.h"
+#include "mutex.h"
+
+namespace benchmark {
+namespace internal {
+
+class ThreadManager {  // tracks the live benchmark threads and their results
+ public:
+  explicit ThreadManager(int num_threads)
+      : alive_threads_(num_threads), start_stop_barrier_(num_threads) {}
+  // Returns the mutex that guards `results`.
+  Mutex& GetBenchmarkMutex() const RETURN_CAPABILITY(benchmark_mutex_) {
+    return benchmark_mutex_;
+  }
+  // Waits on the start/stop barrier and forwards the barrier's return value.
+  bool StartStopBarrier() EXCLUDES(end_cond_mutex_) {
+    return start_stop_barrier_.wait();
+  }
+  // Leaves the barrier; the thread that drops the count to 0 wakes waiters.
+  void NotifyThreadComplete() EXCLUDES(end_cond_mutex_) {
+    start_stop_barrier_.removeThread();
+    if (--alive_threads_ == 0) {
+      MutexLock lock(end_cond_mutex_);
+      end_condition_.notify_all();
+    }
+  }
+  // Blocks until the count of alive threads has dropped to zero.
+  void WaitForAllThreads() EXCLUDES(end_cond_mutex_) {
+    MutexLock lock(end_cond_mutex_);
+    end_condition_.wait(lock.native_handle(),
+                        [this]() { return alive_threads_ == 0; });
+  }
+  // Measurement totals; touch `results` only under GetBenchmarkMutex().
+ public:
+  struct Result {
+    IterationCount iterations = 0;
+    double real_time_used = 0;
+    double cpu_time_used = 0;
+    double manual_time_used = 0;
+    int64_t complexity_n = 0;
+    std::string report_label_;
+    std::string error_message_;
+    bool has_error_ = false;
+    UserCounters counters;
+  };
+  GUARDED_BY(GetBenchmarkMutex()) Result results;
+  // Synchronization state; end_cond_mutex_ pairs with end_condition_.
+ private:
+  mutable Mutex benchmark_mutex_;
+  std::atomic<int> alive_threads_;
+  Barrier start_stop_barrier_;
+  Mutex end_cond_mutex_;
+  Condition end_condition_;
+};
+
+}  // namespace internal
+}  // namespace benchmark
+
+#endif  // BENCHMARK_THREAD_MANAGER_H
diff --git a/src/thread_timer.h b/src/thread_timer.h
new file mode 100644
index 0000000..1703ca0
--- /dev/null
+++ b/src/thread_timer.h
@@ -0,0 +1,86 @@
+#ifndef BENCHMARK_THREAD_TIMER_H
+#define BENCHMARK_THREAD_TIMER_H
+
+#include "check.h"
+#include "timers.h"
+
+namespace benchmark {
+namespace internal {
+
+class ThreadTimer {
+  // Private: instances are obtained through the Create*() factories below.
+  explicit ThreadTimer(bool measure_process_cpu_time_)
+      : measure_process_cpu_time(measure_process_cpu_time_) {}
+
+ public:
+  // Timer whose CPU readings come from ThreadCPUUsage().
+  static ThreadTimer Create() { return ThreadTimer(false); }
+
+  // Timer whose CPU readings come from ProcessCPUUsage().
+  static ThreadTimer CreateProcessCpuTime() { return ThreadTimer(true); }
+
+  // Called by each thread: marks the timer running and records both clocks.
+  void StartTimer() {
+    running_ = true;
+    start_real_time_ = ChronoClockNow();
+    start_cpu_time_ = ReadCpuTimerOfChoice();
+  }
+
+  // Called by each thread: adds the slice since StartTimer() to the totals.
+  void StopTimer() {
+    CHECK(running_);
+    running_ = false;
+    const double real_elapsed = ChronoClockNow() - start_real_time_;
+    const double cpu_elapsed = ReadCpuTimerOfChoice() - start_cpu_time_;
+    real_time_used_ += real_elapsed;
+    // Floating point error can make the CPU slice negative; clamp it at zero.
+    cpu_time_used_ += (cpu_elapsed < 0) ? 0 : cpu_elapsed;
+  }
+
+  // Called by each thread: accumulates a user-supplied iteration time.
+  void SetIterationTime(double seconds) { manual_time_used_ += seconds; }
+
+  // True between StartTimer() and StopTimer().
+  bool running() const { return running_; }
+
+  // REQUIRES: timer is not running
+  double real_time_used() const {
+    CHECK(!running_);
+    return real_time_used_;
+  }
+
+  // REQUIRES: timer is not running
+  double cpu_time_used() const {
+    CHECK(!running_);
+    return cpu_time_used_;
+  }
+
+  // REQUIRES: timer is not running
+  double manual_time_used() const {
+    CHECK(!running_);
+    return manual_time_used_;
+  }
+
+ private:
+  double ReadCpuTimerOfChoice() const {
+    return measure_process_cpu_time ? ProcessCPUUsage() : ThreadCPUUsage();
+  }
+
+  // Selects the CPU clock: process-wide when true, per-thread when false.
+  const bool measure_process_cpu_time;
+
+  bool running_ = false;        // Set by StartTimer(), cleared by StopTimer()
+  double start_real_time_ = 0;  // Meaningful only while running_
+  double start_cpu_time_ = 0;   // Meaningful only while running_
+
+  // Totals over all finished slices (the slice in progress is not included).
+  double real_time_used_ = 0;
+  double cpu_time_used_ = 0;
+  // Sum of the values passed to SetIterationTime(seconds).
+  double manual_time_used_ = 0;
+};
+
+}  // namespace internal
+}  // namespace benchmark
+
+#endif  // BENCHMARK_THREAD_TIMER_H
diff --git a/src/timers.cc b/src/timers.cc
new file mode 100644
index 0000000..1d3ab9a
--- /dev/null
+++ b/src/timers.cc
@@ -0,0 +1,245 @@
+// Copyright 2015 Google Inc. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "timers.h"
+#include "internal_macros.h"
+
+#ifdef BENCHMARK_OS_WINDOWS
+#include <shlwapi.h>
+#undef StrCat  // Don't let StrCat in string_util.h be renamed to lstrcatA
+#include <versionhelpers.h>
+#include <windows.h>
+#else
+#include <fcntl.h>
+#ifndef BENCHMARK_OS_FUCHSIA
+#include <sys/resource.h>
+#endif
+#include <sys/time.h>
+#include <sys/types.h>  // this header must be included before 'sys/sysctl.h' to avoid compilation error on FreeBSD
+#include <unistd.h>
+#if defined BENCHMARK_OS_FREEBSD || defined BENCHMARK_OS_DRAGONFLY || \
+    defined BENCHMARK_OS_MACOSX
+#include <sys/sysctl.h>
+#endif
+#if defined(BENCHMARK_OS_MACOSX)
+#include <mach/mach_init.h>
+#include <mach/mach_port.h>
+#include <mach/thread_act.h>
+#endif
+#endif
+
+#ifdef BENCHMARK_OS_EMSCRIPTEN
+#include <emscripten.h>
+#endif
+
+#include <cerrno>
+#include <cstdint>
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include <ctime>
+#include <iostream>
+#include <limits>
+#include <mutex>
+
+#include "check.h"
+#include "log.h"
+#include "sleep.h"
+#include "string_util.h"
+
+namespace benchmark {
+
+// Suppress unused warnings on helper functions.
+#if defined(__GNUC__)
+#pragma GCC diagnostic ignored "-Wunused-function"
+#endif
+
+namespace {  // file-local helpers: struct -> seconds converters, fatal exit
+#if defined(BENCHMARK_OS_WINDOWS)
+double MakeTime(FILETIME const& kernel_time, FILETIME const& user_time) {
+  ULARGE_INTEGER kernel;  // 64-bit view assembled from the two 32-bit halves
+  ULARGE_INTEGER user;
+  kernel.HighPart = kernel_time.dwHighDateTime;
+  kernel.LowPart = kernel_time.dwLowDateTime;
+  user.HighPart = user_time.dwHighDateTime;
+  user.LowPart = user_time.dwLowDateTime;
+  return (static_cast<double>(kernel.QuadPart) +
+          static_cast<double>(user.QuadPart)) *
+         1e-7;  // presumably 100 ns ticks -> seconds
+}
+#elif !defined(BENCHMARK_OS_FUCHSIA)
+double MakeTime(struct rusage const& ru) {  // user + system time, seconds
+  return (static_cast<double>(ru.ru_utime.tv_sec) +
+          static_cast<double>(ru.ru_utime.tv_usec) * 1e-6 +
+          static_cast<double>(ru.ru_stime.tv_sec) +
+          static_cast<double>(ru.ru_stime.tv_usec) * 1e-6);
+}
+#endif
+#if defined(BENCHMARK_OS_MACOSX)
+double MakeTime(thread_basic_info_data_t const& info) {  // user + system, s
+  return (static_cast<double>(info.user_time.seconds) +
+          static_cast<double>(info.user_time.microseconds) * 1e-6 +
+          static_cast<double>(info.system_time.seconds) +
+          static_cast<double>(info.system_time.microseconds) * 1e-6);
+}
+#endif
+#if defined(CLOCK_PROCESS_CPUTIME_ID) || defined(CLOCK_THREAD_CPUTIME_ID)
+double MakeTime(struct timespec const& ts) {  // tv_sec + tv_nsec, as seconds
+  return ts.tv_sec + (static_cast<double>(ts.tv_nsec) * 1e-9);
+}
+#endif
+// Prints "ERROR: <msg>" to std::cerr, then exits with EXIT_FAILURE.
+BENCHMARK_NORETURN static void DiagnoseAndExit(const char* msg) {
+  std::cerr << "ERROR: " << msg << std::endl;
+  std::exit(EXIT_FAILURE);
+}
+
+}  // end namespace
+
+double ProcessCPUUsage() {
+  // CPU time consumed so far by the whole process, in seconds. Branches that
+  // query the OS call DiagnoseAndExit() when the query fails.
+#if defined(BENCHMARK_OS_WINDOWS)
+  HANDLE proc = GetCurrentProcess();
+  FILETIME creation_time, exit_time, kernel_time, user_time;
+  if (GetProcessTimes(proc, &creation_time, &exit_time, &kernel_time,
+                      &user_time))
+    return MakeTime(kernel_time, user_time);
+  DiagnoseAndExit("GetProcessTimes() failed");
+#elif defined(BENCHMARK_OS_EMSCRIPTEN)
+  // clock_gettime(CLOCK_PROCESS_CPUTIME_ID, ...) returns 0 on Emscripten.
+  // Use Emscripten-specific API. Reported CPU time would be exactly the
+  // same as total time, but this is ok because there aren't long-latency
+  // synchronous system calls in Emscripten.
+  return emscripten_get_now() * 1e-3;
+#elif defined(CLOCK_PROCESS_CPUTIME_ID) && !defined(BENCHMARK_OS_MACOSX)
+  // macOS is excluded on purpose: clock_gettime is not available in
+  // MacOS 10.11. See https://github.com/google/benchmark/pull/292
+  struct timespec spec;
+  if (clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &spec) == 0)
+    return MakeTime(spec);
+  DiagnoseAndExit("clock_gettime(CLOCK_PROCESS_CPUTIME_ID, ...) failed");
+#else
+  // Fallback when the process CPU-time clock above is not used.
+  struct rusage ru;
+  if (getrusage(RUSAGE_SELF, &ru) == 0) return MakeTime(ru);
+  DiagnoseAndExit("getrusage(RUSAGE_SELF, ...) failed");
+#endif
+}
+
+double ThreadCPUUsage() {
+  // CPU time consumed so far by the calling thread, in seconds. Branches that
+  // query the OS call DiagnoseAndExit() when the query fails.
+#if defined(BENCHMARK_OS_WINDOWS)
+  HANDLE this_thread = GetCurrentThread();
+  FILETIME creation_time, exit_time, kernel_time, user_time;
+  // The FILETIME locals start uninitialized: convert them only on success.
+  if (GetThreadTimes(this_thread, &creation_time, &exit_time, &kernel_time,
+                     &user_time))
+    return MakeTime(kernel_time, user_time);
+  DiagnoseAndExit("GetThreadTimes() failed");
+#elif defined(BENCHMARK_OS_MACOSX)
+  // FIXME We want to use clock_gettime, but it's not available in MacOS 10.11.
+  // See https://github.com/google/benchmark/pull/292
+  mach_msg_type_number_t count = THREAD_BASIC_INFO_COUNT;
+  thread_basic_info_data_t info;
+  mach_port_t thread = pthread_mach_thread_np(pthread_self());
+  if (thread_info(thread, THREAD_BASIC_INFO, (thread_info_t)&info, &count) ==
+      KERN_SUCCESS)
+    return MakeTime(info);
+  DiagnoseAndExit("ThreadCPUUsage() failed when evaluating thread_info");
+#elif defined(BENCHMARK_OS_EMSCRIPTEN)
+  // Emscripten doesn't support traditional threads
+  return ProcessCPUUsage();
+#elif defined(BENCHMARK_OS_RTEMS)
+  // RTEMS doesn't support CLOCK_THREAD_CPUTIME_ID. See
+  // https://github.com/RTEMS/rtems/blob/master/cpukit/posix/src/clockgettime.c
+  return ProcessCPUUsage();
+#elif defined(BENCHMARK_OS_SOLARIS)
+  struct rusage ru;
+  if (getrusage(RUSAGE_LWP, &ru) == 0) return MakeTime(ru);
+  DiagnoseAndExit("getrusage(RUSAGE_LWP, ...) failed");
+#elif defined(CLOCK_THREAD_CPUTIME_ID)
+  struct timespec ts;
+  if (clock_gettime(CLOCK_THREAD_CPUTIME_ID, &ts) == 0) return MakeTime(ts);
+  DiagnoseAndExit("clock_gettime(CLOCK_THREAD_CPUTIME_ID, ...) failed");
+#else
+#error Per-thread timing is not available on your system.
+#endif
+}
+
+std::string LocalDateTimeString() {
+  // Returns the local time in RFC3339 format yyyy-mm-ddTHH:MM:SS+/-HH:MM.
+  typedef std::chrono::system_clock Clock;
+  std::time_t now = Clock::to_time_t(Clock::now());
+  const std::size_t kTzOffsetLen = 6;    // strlen("+HH:MM")
+  const std::size_t kTimestampLen = 19;  // strlen("yyyy-mm-ddTHH:MM:SS")
+
+  std::size_t tz_len;
+  std::size_t timestamp_len;
+  long int offset_minutes;  // holds the parsed decimal HHMM, not minutes
+  char tz_offset_sign = '+';
+  // Long enough buffers to avoid format-overflow warnings
+  char tz_offset[128];
+  char storage[128];
+
+#if defined(BENCHMARK_OS_WINDOWS)
+  std::tm *timeinfo_p = ::localtime(&now);  // NOTE(review): not null-checked
+#else
+  std::tm timeinfo;
+  std::tm *timeinfo_p = &timeinfo;
+  ::localtime_r(&now, &timeinfo);
+#endif
+  // "%z" is expected to produce +HHMM or -HHMM (5 characters).
+  tz_len = std::strftime(tz_offset, sizeof(tz_offset), "%z", timeinfo_p);
+
+  if (tz_len < kTzOffsetLen && tz_len > 1) {
+    // Timezone offset was written. strftime writes offset as +HHMM or -HHMM,
+    // RFC3339 specifies an offset as +HH:MM or -HH:MM. To convert, we parse
+    // the offset as an integer, then reprint it to a string.
+
+    offset_minutes = ::strtol(tz_offset, NULL, 10);
+    if (offset_minutes < 0) {
+      offset_minutes *= -1;
+      tz_offset_sign = '-';
+    }
+    // Split the decimal HHMM value into hours (/100) and minutes (%100).
+    tz_len = ::snprintf(tz_offset, sizeof(tz_offset), "%c%02li:%02li",
+        tz_offset_sign, offset_minutes / 100, offset_minutes % 100);
+    CHECK(tz_len == kTzOffsetLen);
+    ((void)tz_len); // Prevent unused variable warning in optimized build.
+  } else {
+    // Unknown offset. RFC3339 specifies that unknown local offsets should be
+    // written as UTC time with -00:00 timezone.
+#if defined(BENCHMARK_OS_WINDOWS)
+    // Potential race condition if another thread calls localtime or gmtime.
+    timeinfo_p = ::gmtime(&now);
+#else
+    ::gmtime_r(&now, &timeinfo);
+#endif
+
+    strncpy(tz_offset, "-00:00", kTzOffsetLen + 1);
+  }
+
+  timestamp_len = std::strftime(storage, sizeof(storage), "%Y-%m-%dT%H:%M:%S",
+      timeinfo_p);
+  CHECK(timestamp_len == kTimestampLen);
+  // Prevent unused variable warning in optimized build.
+  ((void)kTimestampLen);
+  // Append the offset; the bound leaves room for the terminating NUL.
+  std::strncat(storage, tz_offset, sizeof(storage) - timestamp_len - 1);
+  return std::string(storage);
+}
+
+}  // end namespace benchmark
diff --git a/src/timers.h b/src/timers.h
new file mode 100644
index 0000000..65606cc
--- /dev/null
+++ b/src/timers.h
@@ -0,0 +1,48 @@
+#ifndef BENCHMARK_TIMERS_H
+#define BENCHMARK_TIMERS_H
+
+#include <chrono>
+#include <string>
+
+namespace benchmark {
+
+// Return the CPU usage of the current process
+double ProcessCPUUsage();
+
+// Return the CPU usage of the children of the current process
+double ChildrenCPUUsage();
+
+// Return the CPU usage of the current thread
+double ThreadCPUUsage();
+
+#if defined(HAVE_STEADY_CLOCK)
+template <bool HighResIsSteady = std::chrono::high_resolution_clock::is_steady>
+struct ChooseSteadyClock {  // primary: high_resolution_clock is steady
+  using type = std::chrono::high_resolution_clock;
+};
+
+template <>
+struct ChooseSteadyClock<false> {  // otherwise fall back to steady_clock
+  using type = std::chrono::steady_clock;
+};
+#endif
+
+struct ChooseClockType {
+#if defined(HAVE_STEADY_CLOCK)
+  using type = ChooseSteadyClock<>::type;  // steady variant when available
+#else
+  using type = std::chrono::high_resolution_clock;
+#endif
+};
+
+inline double ChronoClockNow() {
+  // Time since the selected clock's epoch, expressed in (fractional) seconds.
+  const auto since_epoch = ChooseClockType::type::now().time_since_epoch();
+  return std::chrono::duration<double>(since_epoch).count();
+}
+
+std::string LocalDateTimeString();
+
+}  // end namespace benchmark
+
+#endif  // BENCHMARK_TIMERS_H
diff --git a/src/walltime.cc b/src/walltime.cc
deleted file mode 100644
index 4bdbaa5..0000000
--- a/src/walltime.cc
+++ /dev/null
@@ -1,263 +0,0 @@
-// Copyright 2015 Google Inc. All rights reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "benchmark/macros.h"
-#include "internal_macros.h"
-#include "walltime.h"
-
-#if defined(BENCHMARK_OS_WINDOWS)
-#include <time.h>
-#include <winsock.h> // for timeval
-#else
-#include <sys/time.h>
-#endif
-
-#include <cstdio>
-#include <cstdint>
-#include <cstring>
-#include <ctime>
-
-#include <atomic>
-#include <chrono>
-#include <limits>
-
-#include "arraysize.h"
-#include "check.h"
-#include "cycleclock.h"
-#include "log.h"
-#include "sysinfo.h"
-
-namespace benchmark {
-namespace walltime {
-
-namespace {
-
-#if defined(HAVE_STEADY_CLOCK)
-template <bool HighResIsSteady = std::chrono::high_resolution_clock::is_steady>
-struct ChooseSteadyClock {
-    typedef std::chrono::high_resolution_clock type;
-};
-
-template <>
-struct ChooseSteadyClock<false> {
-    typedef std::chrono::steady_clock type;
-};
-#endif
-
-struct ChooseClockType {
-#if defined(HAVE_STEADY_CLOCK)
-  typedef ChooseSteadyClock<>::type type;
-#else
-  typedef std::chrono::high_resolution_clock type;
-#endif
-};
-
-class WallTimeImp
-{
-public:
-  WallTime Now();
-
-  static WallTimeImp& GetWallTimeImp() {
-    static WallTimeImp* imp = new WallTimeImp();
-    return *imp;
-  }
-
-private:
-  WallTimeImp();
-  // Helper routines to load/store a float from an AtomicWord. Required because
-  // g++ < 4.7 doesn't support std::atomic<float> correctly. I cannot wait to
-  // get rid of this horror show.
-  void SetDrift(float f) {
-    int32_t w;
-    memcpy(&w, &f, sizeof(f));
-    std::atomic_store(&drift_adjust_, w);
-  }
-
-  float GetDrift() const {
-    float f;
-    int32_t w = std::atomic_load(&drift_adjust_);
-    memcpy(&f, &w, sizeof(f));
-    return f;
-  }
-
-  WallTime Slow() const {
-    struct timeval tv;
-#if defined(BENCHMARK_OS_WINDOWS)
-    FILETIME    file_time;
-    SYSTEMTIME  system_time;
-    ULARGE_INTEGER ularge;
-    const unsigned __int64 epoch = 116444736000000000LL;
-
-    GetSystemTime(&system_time);
-    SystemTimeToFileTime(&system_time, &file_time);
-    ularge.LowPart = file_time.dwLowDateTime;
-    ularge.HighPart = file_time.dwHighDateTime;
-
-    tv.tv_sec = (long)((ularge.QuadPart - epoch) / (10L * 1000 * 1000));
-    tv.tv_usec = (long)(system_time.wMilliseconds * 1000);
-#else
-    gettimeofday(&tv, nullptr);
-#endif
-    return tv.tv_sec + tv.tv_usec * 1e-6;
-  }
-
-private:
-  static_assert(sizeof(float) <= sizeof(int32_t),
-               "type sizes don't allow the drift_adjust hack");
-
-  WallTime base_walltime_;
-  int64_t base_cycletime_;
-  int64_t cycles_per_second_;
-  double seconds_per_cycle_;
-  uint32_t last_adjust_time_;
-  std::atomic<int32_t> drift_adjust_;
-  int64_t max_interval_cycles_;
-
-  BENCHMARK_DISALLOW_COPY_AND_ASSIGN(WallTimeImp);
-};
-
-
-WallTime WallTimeImp::Now() {
-  WallTime now = 0.0;
-  WallTime result = 0.0;
-  int64_t ct = 0;
-  uint32_t top_bits = 0;
-  do {
-    ct = cycleclock::Now();
-    int64_t cycle_delta = ct - base_cycletime_;
-    result = base_walltime_ + cycle_delta * seconds_per_cycle_;
-
-    top_bits = static_cast<uint32_t>(uint64_t(ct) >> 32);
-    // Recompute drift no more often than every 2^32 cycles.
-    // I.e., @2GHz, ~ every two seconds
-    if (top_bits == last_adjust_time_) {  // don't need to recompute drift
-      return result + GetDrift();
-    }
-
-    now = Slow();
-  } while (cycleclock::Now() - ct > max_interval_cycles_);
-  // We are now sure that "now" and "result" were produced within
-  // kMaxErrorInterval of one another.
-
-  SetDrift(static_cast<float>(now - result));
-  last_adjust_time_ = top_bits;
-  return now;
-}
-
-
-WallTimeImp::WallTimeImp()
-    : base_walltime_(0.0), base_cycletime_(0),
-      cycles_per_second_(0), seconds_per_cycle_(0.0),
-      last_adjust_time_(0), drift_adjust_(0),
-      max_interval_cycles_(0) {
-  const double kMaxErrorInterval = 100e-6;
-  cycles_per_second_ = static_cast<int64_t>(CyclesPerSecond());
-  CHECK(cycles_per_second_ != 0);
-  seconds_per_cycle_ = 1.0 / cycles_per_second_;
-  max_interval_cycles_ =
-      static_cast<int64_t>(cycles_per_second_ * kMaxErrorInterval);
-  do {
-    base_cycletime_ = cycleclock::Now();
-    base_walltime_ = Slow();
-  } while (cycleclock::Now() - base_cycletime_ > max_interval_cycles_);
-  // We are now sure that "base_walltime" and "base_cycletime" were produced
-  // within kMaxErrorInterval of one another.
-
-  SetDrift(0.0);
-  last_adjust_time_ = static_cast<uint32_t>(uint64_t(base_cycletime_) >> 32);
-}
-
-WallTime CPUWalltimeNow() {
-  static WallTimeImp& imp = WallTimeImp::GetWallTimeImp();
-  return imp.Now();
-}
-
-WallTime ChronoWalltimeNow() {
-  typedef ChooseClockType::type Clock;
-  typedef std::chrono::duration<WallTime, std::chrono::seconds::period>
-          FPSeconds;
-  static_assert(std::chrono::treat_as_floating_point<WallTime>::value,
-                "This type must be treated as a floating point type.");
-  auto now = Clock::now().time_since_epoch();
-  return std::chrono::duration_cast<FPSeconds>(now).count();
-}
-
-bool UseCpuCycleClock() {
-    bool useWallTime = !CpuScalingEnabled();
-    if (useWallTime) {
-        VLOG(1) << "Using the CPU cycle clock to provide walltime::Now().\n";
-    } else {
-        VLOG(1) << "Using std::chrono to provide walltime::Now().\n";
-    }
-    return useWallTime;
-}
-
-
-} // end anonymous namespace
-
-// WallTimeImp doesn't work when CPU Scaling is enabled. If CPU Scaling is
-// enabled at the start of the program then std::chrono::system_clock is used
-// instead.
-WallTime Now()
-{
-  static bool useCPUClock = UseCpuCycleClock();
-  if (useCPUClock) {
-    return CPUWalltimeNow();
-  } else {
-    return ChronoWalltimeNow();
-  }
-}
-
-}  // end namespace walltime
-
-
-namespace {
-
-std::string DateTimeString(bool local) {
-  typedef std::chrono::system_clock Clock;
-  std::time_t now = Clock::to_time_t(Clock::now());
-  char storage[128];
-  std::size_t written;
-
-  if (local) {
-#if defined(BENCHMARK_OS_WINDOWS)
-    written = std::strftime(storage, sizeof(storage), "%x %X", ::localtime(&now));
-#else
-    std::tm timeinfo;
-    std::memset(&timeinfo, 0, sizeof(std::tm));
-    ::localtime_r(&now, &timeinfo);
-    written = std::strftime(storage, sizeof(storage), "%F %T", &timeinfo);
-#endif
-  } else {
-#if defined(BENCHMARK_OS_WINDOWS)
-    written = std::strftime(storage, sizeof(storage), "%x %X", ::gmtime(&now));
-#else
-    std::tm timeinfo;
-    std::memset(&timeinfo, 0, sizeof(std::tm));
-    ::gmtime_r(&now, &timeinfo);
-    written = std::strftime(storage, sizeof(storage), "%F %T", &timeinfo);
-#endif
-  }
-  CHECK(written < arraysize(storage));
-  ((void)written); // prevent unused variable in optimized mode.
-  return std::string(storage);
-}
-
-} // end namespace
-
-std::string LocalDateTimeString() {
-  return DateTimeString(true);
-}
-
-}  // end namespace benchmark
diff --git a/src/walltime.h b/src/walltime.h
deleted file mode 100644
index 38c26f3..0000000
--- a/src/walltime.h
+++ /dev/null
@@ -1,17 +0,0 @@
-#ifndef BENCHMARK_WALLTIME_H_
-#define BENCHMARK_WALLTIME_H_
-
-#include <string>
-
-namespace benchmark {
-typedef double WallTime;
-
-namespace walltime {
-WallTime Now();
-}  // end namespace walltime
-
-std::string LocalDateTimeString();
-
-}  // end namespace benchmark
-
-#endif  // BENCHMARK_WALLTIME_H_
diff --git a/test/AssemblyTests.cmake b/test/AssemblyTests.cmake
new file mode 100644
index 0000000..3d07858
--- /dev/null
+++ b/test/AssemblyTests.cmake
@@ -0,0 +1,61 @@
+# Support for assembly tests: a test source is compiled with -S and the
+# resulting assembly is checked against the source with LLVM FileCheck.
+
+include(split_list)
+
+# Optional flags for compiling the assembly tests; each one is appended only
+# if the C++ compiler accepts it.
+set(ASM_TEST_FLAGS "")
+check_cxx_compiler_flag(-O3 BENCHMARK_HAS_O3_FLAG)
+if (BENCHMARK_HAS_O3_FLAG)
+  list(APPEND ASM_TEST_FLAGS -O3)
+endif()
+
+check_cxx_compiler_flag(-g0 BENCHMARK_HAS_G0_FLAG)
+if (BENCHMARK_HAS_G0_FLAG)
+  list(APPEND ASM_TEST_FLAGS -g0)
+endif()
+
+check_cxx_compiler_flag(-fno-stack-protector BENCHMARK_HAS_FNO_STACK_PROTECTOR_FLAG)
+if (BENCHMARK_HAS_FNO_STACK_PROTECTOR_FLAG)
+  list(APPEND ASM_TEST_FLAGS -fno-stack-protector)
+endif()
+
+# NOTE(review): split_list is defined elsewhere; presumably it flattens the
+# list into the space-separated string COMPILE_FLAGS expects -- confirm.
+split_list(ASM_TEST_FLAGS)
+# Upper-cased compiler id, used to form compiler-specific check prefixes.
+string(TOUPPER "${CMAKE_CXX_COMPILER_ID}" ASM_TEST_COMPILER)
+
+# add_filecheck_test(<name> [CHECK_PREFIXES <prefix>...])
+#
+# Builds <name>.cc as an OBJECT library compiled with -S, runs the output
+# through tools/strip_asm.py into <name>.s in the current binary directory, and
+# registers one test named run_<name>_<prefix> per check prefix (default:
+# CHECK). Each test runs FileCheck with <prefix> and <prefix>-<COMPILER_ID>.
+macro(add_filecheck_test name)
+  cmake_parse_arguments(ARG "" "" "CHECK_PREFIXES" ${ARGV})
+  add_library(${name} OBJECT ${name}.cc)
+  set_target_properties(${name} PROPERTIES COMPILE_FLAGS "-S ${ASM_TEST_FLAGS}")
+  set(ASM_OUTPUT_FILE "${CMAKE_CURRENT_BINARY_DIR}/${name}.s")
+  add_custom_target(copy_${name} ALL
+      COMMAND ${PROJECT_SOURCE_DIR}/tools/strip_asm.py
+        $<TARGET_OBJECTS:${name}>
+        ${ASM_OUTPUT_FILE}
+      BYPRODUCTS ${ASM_OUTPUT_FILE})
+  add_dependencies(copy_${name} ${name})
+  if (NOT ARG_CHECK_PREFIXES)
+    set(ARG_CHECK_PREFIXES "CHECK")
+  endif()
+  foreach(prefix ${ARG_CHECK_PREFIXES})
+    # Use the loop's prefix so every registered test checks its own prefix
+    # instead of repeating the default CHECK run under a different name.
+    add_test(NAME run_${name}_${prefix}
+        COMMAND
+          ${LLVM_FILECHECK_EXE} ${name}.cc
+          --input-file=${ASM_OUTPUT_FILE}
+          --check-prefixes=${prefix},${prefix}-${ASM_TEST_COMPILER}
+        WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR})
+  endforeach()
+endmacro()
+
diff --git a/test/BUILD b/test/BUILD
new file mode 100644
index 0000000..9bb8cb0
--- /dev/null
+++ b/test/BUILD
@@ -0,0 +1,81 @@
+load("@rules_cc//cc:defs.bzl", "cc_library", "cc_test")
+
+# Compiler options applied to every test target in this package.
+TEST_COPTS = [
+    "-pedantic",
+    "-pedantic-errors",
+    "-std=c++11",
+    "-Wall",
+    "-Wextra",
+    "-Wshadow",
+    #    "-Wshorten-64-to-32",
+    "-Wfloat-equal",
+    "-fstrict-aliasing",
+]
+
+# Extra compiler options for individual sources, appended after TEST_COPTS.
+PER_SRC_COPTS = {
+    "cxx03_test.cc": ["-std=c++03"],
+    # Some of the issues with DoNotOptimize only occur when optimization is enabled
+    "donotoptimize_test.cc": ["-O3"],
+}
+
+# Command-line arguments passed to every generated test.
+TEST_ARGS = ["--benchmark_min_time=0.01"]
+
+# Extra command-line arguments for individual sources, appended after TEST_ARGS.
+PER_SRC_TEST_ARGS = {
+    "user_counters_tabular_test.cc": ["--benchmark_counters_tabular=true"],
+}
+
+# Helper library shared by the generated tests.
+cc_library(
+    name = "output_test_helper",
+    testonly = True,
+    srcs = ["output_test_helper.cc"],
+    hdrs = ["output_test.h"],
+    copts = TEST_COPTS,
+    deps = [
+        "//:benchmark",
+        "//:benchmark_internal_headers",
+    ],
+)
+
+# One cc_test per "*test.cc" source, named after the source without ".cc".
+# Sources ending in "gtest.cc" additionally link gtest_main.
+[
+    cc_test(
+        name = test_src[:-len(".cc")],
+        size = "small",
+        srcs = [test_src],
+        args = TEST_ARGS + PER_SRC_TEST_ARGS.get(test_src, []),
+        copts = TEST_COPTS + PER_SRC_COPTS.get(test_src, []),
+        deps = [
+            ":output_test_helper",
+            "//:benchmark",
+            "//:benchmark_internal_headers",
+            "@com_google_googletest//:gtest",
+        ] + (
+            ["@com_google_googletest//:gtest_main"] if test_src.endswith("gtest.cc") else []
+        ),
+        # FIXME: Add support for assembly tests to bazel.
+        # See Issue #556
+        # https://github.com/google/benchmark/issues/556
+    )
+    for test_src in glob(
+        ["*test.cc"],
+        exclude = [
+            "*_assembly_test.cc",
+            "link_main_test.cc",
+        ],
+    )
+]
+
+# link_main_test is built separately: it depends only on //:benchmark_main.
+cc_test(
+    name = "link_main_test",
+    size = "small",
+    srcs = ["link_main_test.cc"],
+    copts = TEST_COPTS,
+    deps = ["//:benchmark_main"],
+)
diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt
index aeb720a..c1a3a3f 100644
--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@@ -1,70 +1,222 @@
 # Enable the tests
 
 find_package(Threads REQUIRED)
+include(CheckCXXCompilerFlag)
+
+# NOTE: Some tests use `<cassert>` to perform the test. Therefore we must
+# strip -DNDEBUG from the default CMake flags in DEBUG mode.
+string(TOUPPER "${CMAKE_BUILD_TYPE}" uppercase_CMAKE_BUILD_TYPE)
+if( NOT uppercase_CMAKE_BUILD_TYPE STREQUAL "DEBUG" )
+  add_definitions( -UNDEBUG )
+  add_definitions(-DTEST_BENCHMARK_LIBRARY_HAS_NO_ASSERTIONS)
+  # Also remove /D NDEBUG to avoid MSVC warnings about conflicting defines.
+  foreach (flags_var_to_scrub
+      CMAKE_CXX_FLAGS_RELEASE
+      CMAKE_CXX_FLAGS_RELWITHDEBINFO
+      CMAKE_CXX_FLAGS_MINSIZEREL
+      CMAKE_C_FLAGS_RELEASE
+      CMAKE_C_FLAGS_RELWITHDEBINFO
+      CMAKE_C_FLAGS_MINSIZEREL)
+    string (REGEX REPLACE "(^| )[/-]D *NDEBUG($| )" " "
+      "${flags_var_to_scrub}" "${${flags_var_to_scrub}}")
+  endforeach()
+endif()
+
+check_cxx_compiler_flag(-O3 BENCHMARK_HAS_O3_FLAG)
+set(BENCHMARK_O3_FLAG "")
+if (BENCHMARK_HAS_O3_FLAG)
+  set(BENCHMARK_O3_FLAG "-O3")
+endif()
+
+# NOTE: Add these after find_package(Threads REQUIRED) or they break its check.
+# CMAKE_EXE_LINKER_FLAGS is a space-separated string, so join with spaces.
+if (DEFINED BENCHMARK_CXX_LINKER_FLAGS)
+  string(REPLACE ";" " " CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS};${BENCHMARK_CXX_LINKER_FLAGS}")
+endif()
+
+add_library(output_test_helper STATIC output_test_helper.cc output_test.h)
 
 macro(compile_benchmark_test name)
   add_executable(${name} "${name}.cc")
-  target_link_libraries(${name} benchmark ${CMAKE_THREAD_LIBS_INIT})
+  target_link_libraries(${name} benchmark::benchmark ${CMAKE_THREAD_LIBS_INIT})
 endmacro(compile_benchmark_test)
 
+macro(compile_benchmark_test_with_main name)
+  add_executable(${name} "${name}.cc")
+  target_link_libraries(${name} benchmark::benchmark_main)
+endmacro(compile_benchmark_test_with_main)
+
+macro(compile_output_test name)
+  add_executable(${name} "${name}.cc" output_test.h)
+  target_link_libraries(${name} output_test_helper benchmark::benchmark
+          ${BENCHMARK_CXX_LIBRARIES} ${CMAKE_THREAD_LIBS_INIT})
+endmacro(compile_output_test)
+
 # Demonstration executable
 compile_benchmark_test(benchmark_test)
-add_test(benchmark benchmark_test --benchmark_min_time=0.01)
+add_test(NAME benchmark COMMAND benchmark_test --benchmark_min_time=0.01)
 
 compile_benchmark_test(filter_test)
 macro(add_filter_test name filter expect)
-  add_test(${name} filter_test --benchmark_min_time=0.01 --benchmark_filter=${filter} ${expect})
-  add_test(${name}_list_only filter_test --benchmark_list_tests --benchmark_filter=${filter} ${expect})
+  add_test(NAME ${name} COMMAND filter_test --benchmark_min_time=0.01 --benchmark_filter=${filter} ${expect})
+  add_test(NAME ${name}_list_only COMMAND filter_test --benchmark_list_tests --benchmark_filter=${filter} ${expect})
 endmacro(add_filter_test)
 
 add_filter_test(filter_simple "Foo" 3)
+add_filter_test(filter_simple_negative "-Foo" 2)
 add_filter_test(filter_suffix "BM_.*" 4)
+add_filter_test(filter_suffix_negative "-BM_.*" 1)
 add_filter_test(filter_regex_all ".*" 5)
+add_filter_test(filter_regex_all_negative "-.*" 0)
 add_filter_test(filter_regex_blank "" 5)
+add_filter_test(filter_regex_blank_negative "-" 0)
 add_filter_test(filter_regex_none "monkey" 0)
+add_filter_test(filter_regex_none_negative "-monkey" 5)
 add_filter_test(filter_regex_wildcard ".*Foo.*" 3)
+add_filter_test(filter_regex_wildcard_negative "-.*Foo.*" 2)
 add_filter_test(filter_regex_begin "^BM_.*" 4)
+add_filter_test(filter_regex_begin_negative "-^BM_.*" 1)
 add_filter_test(filter_regex_begin2 "^N" 1)
+add_filter_test(filter_regex_begin2_negative "-^N" 4)
 add_filter_test(filter_regex_end ".*Ba$" 1)
+add_filter_test(filter_regex_end_negative "-.*Ba$" 4)
 
 compile_benchmark_test(options_test)
-add_test(options_benchmarks options_test --benchmark_min_time=0.01)
+add_test(NAME options_benchmarks COMMAND options_test --benchmark_min_time=0.01)
 
 compile_benchmark_test(basic_test)
-add_test(basic_benchmark basic_test --benchmark_min_time=0.01)
+add_test(NAME basic_benchmark COMMAND basic_test --benchmark_min_time=0.01)
 
 compile_benchmark_test(diagnostics_test)
-add_test(diagnostics_test diagnostics_test --benchmark_min_time=0.01)
+add_test(NAME diagnostics_test COMMAND diagnostics_test --benchmark_min_time=0.01)
 
 compile_benchmark_test(skip_with_error_test)
-add_test(skip_with_error_test skip_with_error_test --benchmark_min_time=0.01)
+add_test(NAME skip_with_error_test COMMAND skip_with_error_test --benchmark_min_time=0.01)
 
 compile_benchmark_test(donotoptimize_test)
-add_test(donotoptimize_test donotoptimize_test --benchmark_min_time=0.01)
+# Some of the issues with DoNotOptimize only occur when optimization is enabled
+check_cxx_compiler_flag(-O3 BENCHMARK_HAS_O3_FLAG)
+if (BENCHMARK_HAS_O3_FLAG)
+  set_target_properties(donotoptimize_test PROPERTIES COMPILE_FLAGS "-O3")
+endif()
+add_test(NAME donotoptimize_test COMMAND donotoptimize_test --benchmark_min_time=0.01)
 
 compile_benchmark_test(fixture_test)
-add_test(fixture_test fixture_test --benchmark_min_time=0.01)
+add_test(NAME fixture_test COMMAND fixture_test --benchmark_min_time=0.01)
+
+compile_benchmark_test(register_benchmark_test)
+add_test(NAME register_benchmark_test COMMAND register_benchmark_test --benchmark_min_time=0.01)
 
 compile_benchmark_test(map_test)
-add_test(map_test map_test --benchmark_min_time=0.01)
+add_test(NAME map_test COMMAND map_test --benchmark_min_time=0.01)
+
+compile_benchmark_test(multiple_ranges_test)
+add_test(NAME multiple_ranges_test COMMAND multiple_ranges_test --benchmark_min_time=0.01)
+
+compile_benchmark_test(args_product_test)
+add_test(NAME args_product_test COMMAND args_product_test --benchmark_min_time=0.01)
+
+compile_benchmark_test_with_main(link_main_test)
+add_test(NAME link_main_test COMMAND link_main_test --benchmark_min_time=0.01)
+
+compile_output_test(reporter_output_test)
+add_test(NAME reporter_output_test COMMAND reporter_output_test --benchmark_min_time=0.01)
+
+compile_output_test(templated_fixture_test)
+add_test(NAME templated_fixture_test COMMAND templated_fixture_test --benchmark_min_time=0.01)
+
+compile_output_test(user_counters_test)
+add_test(NAME user_counters_test COMMAND user_counters_test --benchmark_min_time=0.01)
+
+compile_output_test(internal_threading_test)
+add_test(NAME internal_threading_test COMMAND internal_threading_test --benchmark_min_time=0.01)
 
-compile_benchmark_test(reporter_output_test)
-add_test(reporter_output_test reporter_output_test --benchmark_min_time=0.01)
+compile_output_test(report_aggregates_only_test)
+add_test(NAME report_aggregates_only_test COMMAND report_aggregates_only_test --benchmark_min_time=0.01)
+
+compile_output_test(display_aggregates_only_test)
+add_test(NAME display_aggregates_only_test COMMAND display_aggregates_only_test --benchmark_min_time=0.01)
+
+compile_output_test(user_counters_tabular_test)
+add_test(NAME user_counters_tabular_test COMMAND user_counters_tabular_test --benchmark_counters_tabular=true --benchmark_min_time=0.01)
+
+compile_output_test(user_counters_thousands_test)
+add_test(NAME user_counters_thousands_test COMMAND user_counters_thousands_test --benchmark_min_time=0.01)
+
+compile_output_test(memory_manager_test)
+add_test(NAME memory_manager_test COMMAND memory_manager_test --benchmark_min_time=0.01)
 
 check_cxx_compiler_flag(-std=c++03 BENCHMARK_HAS_CXX03_FLAG)
 if (BENCHMARK_HAS_CXX03_FLAG)
-  set(CXX03_FLAGS "${CMAKE_CXX_FLAGS}")
-  string(REPLACE "-std=c++11" "-std=c++03" CXX03_FLAGS "${CXX03_FLAGS}")
-  string(REPLACE "-std=c++0x" "-std=c++03" CXX03_FLAGS "${CXX03_FLAGS}")
-
   compile_benchmark_test(cxx03_test)
   set_target_properties(cxx03_test
-      PROPERTIES COMPILE_FLAGS "${CXX03_FLAGS}")
-  add_test(cxx03 cxx03_test --benchmark_min_time=0.01)
+      PROPERTIES
+      CXX_STANDARD 98
+      CXX_STANDARD_REQUIRED YES)
+  # libstdc++ provides different definitions within <map> between dialects. When
+  # LTO is enabled and -Werror is specified GCC diagnoses this ODR violation
+  # causing the test to fail to compile. To prevent this we explicitly disable
+  # the warning.
+  check_cxx_compiler_flag(-Wno-odr BENCHMARK_HAS_WNO_ODR)
+  if (BENCHMARK_ENABLE_LTO AND BENCHMARK_HAS_WNO_ODR)
+    set_target_properties(cxx03_test
+        PROPERTIES
+        LINK_FLAGS "-Wno-odr")
+  endif()
+  add_test(NAME cxx03 COMMAND cxx03_test --benchmark_min_time=0.01)
+endif()
+
+# Attempt to work around flaky test failures when running on Appveyor servers.
+if (DEFINED ENV{APPVEYOR})
+  set(COMPLEXITY_MIN_TIME "0.5")
+else()
+  set(COMPLEXITY_MIN_TIME "0.01")
+endif()
+compile_output_test(complexity_test)
+add_test(NAME complexity_benchmark COMMAND complexity_test --benchmark_min_time=${COMPLEXITY_MIN_TIME})
+
+###############################################################################
+# GoogleTest Unit Tests
+###############################################################################
+
+if (BENCHMARK_ENABLE_GTEST_TESTS)
+  macro(compile_gtest name)
+    add_executable(${name} "${name}.cc")
+    target_link_libraries(${name} benchmark::benchmark
+        gmock_main ${CMAKE_THREAD_LIBS_INIT})
+  endmacro(compile_gtest)
+
+  macro(add_gtest name)
+    compile_gtest(${name})
+    add_test(NAME ${name} COMMAND ${name})
+  endmacro()
+
+  add_gtest(benchmark_gtest)
+  add_gtest(benchmark_name_gtest)
+  add_gtest(commandlineflags_gtest)
+  add_gtest(statistics_gtest)
+  add_gtest(string_util_gtest)
+endif(BENCHMARK_ENABLE_GTEST_TESTS)
+
+###############################################################################
+# Assembly Unit Tests
+###############################################################################
+
+if (BENCHMARK_ENABLE_ASSEMBLY_TESTS)
+  if (NOT LLVM_FILECHECK_EXE)
+    message(FATAL_ERROR "LLVM FileCheck is required when including this file")
+  endif()
+  include(AssemblyTests.cmake)
+  add_filecheck_test(donotoptimize_assembly_test)
+  add_filecheck_test(state_assembly_test)
+  add_filecheck_test(clobber_memory_assembly_test)
 endif()
 
-compile_benchmark_test(complexity_test)
-add_test(complexity_benchmark complexity_test --benchmark_min_time=0.01)
+
+
+###############################################################################
+# Code Coverage Configuration
+###############################################################################
 
 # Add the coverage command(s)
 if(CMAKE_BUILD_TYPE)
diff --git a/test/args_product_test.cc b/test/args_product_test.cc
new file mode 100644
index 0000000..8a859f8
--- /dev/null
+++ b/test/args_product_test.cc
@@ -0,0 +1,83 @@
+#include "benchmark/benchmark.h"
+
+#include <cassert>
+#include <cstdint>
+#include <iostream>
+#include <set>
+#include <vector>
+
+// Fixture that records the argument tuple of every run and compares the
+// recorded set with the tuples the registration below is expected to produce.
+class ArgsProductFixture : public ::benchmark::Fixture {
+ public:
+  ArgsProductFixture()
+      : expectedValues({{0, 100, 2000, 30000},
+                        {1, 15, 3, 8},
+                        {1, 15, 3, 9},
+                        {1, 15, 7, 8},
+                        {1, 15, 7, 9},
+                        {1, 15, 10, 8},
+                        {1, 15, 10, 9},
+                        {2, 15, 3, 8},
+                        {2, 15, 3, 9},
+                        {2, 15, 7, 8},
+                        {2, 15, 7, 9},
+                        {2, 15, 10, 8},
+                        {2, 15, 10, 9},
+                        {4, 5, 6, 11}}) {}
+
+  // Asserts that this run's four range values form an expected tuple and
+  // records the tuple as seen.
+  void SetUp(const ::benchmark::State& state) {
+    std::vector<int64_t> ranges = {state.range(0), state.range(1),
+                                   state.range(2), state.range(3)};
+
+    assert(expectedValues.find(ranges) != expectedValues.end());
+
+    actualValues.insert(ranges);
+  }
+
+  // NOTE: This is not TearDown as we want to check after _all_ runs are
+  // complete.
+  virtual ~ArgsProductFixture() {
+    if (actualValues != expectedValues) {
+      PrintValues("EXPECTED", expectedValues);
+      PrintValues("ACTUAL", actualValues);
+    }
+  }
+
+  std::set<std::vector<int64_t>> expectedValues;
+  std::set<std::vector<int64_t>> actualValues;
+
+ private:
+  // Prints `title` followed by one "{a, b, c, d, }" line per tuple.
+  static void PrintValues(const char* title,
+                          const std::set<std::vector<int64_t>>& values) {
+    std::cout << title << "\n";
+    for (const auto& v : values) {  // by reference: no per-tuple vector copy
+      std::cout << "{";
+      for (int64_t iv : v) {
+        std::cout << iv << ", ";
+      }
+      std::cout << "}\n";
+    }
+  }
+};
+
+// Each benchmark iteration spins for the product of the four range values.
+BENCHMARK_DEFINE_F(ArgsProductFixture, Empty)(benchmark::State& state) {
+  for (auto _ : state) {
+    int64_t product =
+        state.range(0) * state.range(1) * state.range(2) * state.range(3);
+    for (int64_t x = 0; x < product; x++) {
+      benchmark::DoNotOptimize(x);
+    }
+  }
+}
+
+BENCHMARK_REGISTER_F(ArgsProductFixture, Empty)
+    ->Args({0, 100, 2000, 30000})
+    ->ArgsProduct({{1, 2}, {15}, {3, 7, 10}, {8, 9}})
+    ->Args({4, 5, 6, 11});
+
+BENCHMARK_MAIN();
diff --git a/test/basic_test.cc b/test/basic_test.cc
index 3435415..5f3dd1a 100644
--- a/test/basic_test.cc
+++ b/test/basic_test.cc
@@ -1,11 +1,10 @@
 
-#include "benchmark/benchmark_api.h"
+#include "benchmark/benchmark.h"
 
-#define BASIC_BENCHMARK_TEST(x) \
-    BENCHMARK(x)->Arg(8)->Arg(512)->Arg(8192)
+#define BASIC_BENCHMARK_TEST(x) BENCHMARK(x)->Arg(8)->Arg(512)->Arg(8192)
 
 void BM_empty(benchmark::State& state) {
-  while (state.KeepRunning()) {
+  for (auto _ : state) {
     benchmark::DoNotOptimize(state.iterations());
   }
 }
@@ -13,8 +12,8 @@ BENCHMARK(BM_empty);
 BENCHMARK(BM_empty)->ThreadPerCpu();
 
 void BM_spin_empty(benchmark::State& state) {
-  while (state.KeepRunning()) {
-    for (int x = 0; x < state.range_x(); ++x) {
+  for (auto _ : state) {
+    for (int x = 0; x < state.range(0); ++x) {
       benchmark::DoNotOptimize(x);
     }
   }
@@ -23,11 +22,11 @@ BASIC_BENCHMARK_TEST(BM_spin_empty);
 BASIC_BENCHMARK_TEST(BM_spin_empty)->ThreadPerCpu();
 
 void BM_spin_pause_before(benchmark::State& state) {
-  for (int i = 0; i < state.range_x(); ++i) {
+  for (int i = 0; i < state.range(0); ++i) {
     benchmark::DoNotOptimize(i);
   }
-  while(state.KeepRunning()) {
-    for (int i = 0; i < state.range_x(); ++i) {
+  for (auto _ : state) {
+    for (int i = 0; i < state.range(0); ++i) {
       benchmark::DoNotOptimize(i);
     }
   }
@@ -35,15 +34,14 @@ void BM_spin_pause_before(benchmark::State& state) {
 BASIC_BENCHMARK_TEST(BM_spin_pause_before);
 BASIC_BENCHMARK_TEST(BM_spin_pause_before)->ThreadPerCpu();
 
-
 void BM_spin_pause_during(benchmark::State& state) {
-  while(state.KeepRunning()) {
+  for (auto _ : state) {
     state.PauseTiming();
-    for (int i = 0; i < state.range_x(); ++i) {
+    for (int i = 0; i < state.range(0); ++i) {
       benchmark::DoNotOptimize(i);
     }
     state.ResumeTiming();
-    for (int i = 0; i < state.range_x(); ++i) {
+    for (int i = 0; i < state.range(0); ++i) {
       benchmark::DoNotOptimize(i);
     }
   }
@@ -52,7 +50,7 @@ BASIC_BENCHMARK_TEST(BM_spin_pause_during);
 BASIC_BENCHMARK_TEST(BM_spin_pause_during)->ThreadPerCpu();
 
 void BM_pause_during(benchmark::State& state) {
-  while(state.KeepRunning()) {
+  for (auto _ : state) {
     state.PauseTiming();
     state.ResumeTiming();
   }
@@ -63,40 +61,76 @@ BENCHMARK(BM_pause_during)->UseRealTime();
 BENCHMARK(BM_pause_during)->UseRealTime()->ThreadPerCpu();
 
 void BM_spin_pause_after(benchmark::State& state) {
-  while(state.KeepRunning()) {
-    for (int i = 0; i < state.range_x(); ++i) {
+  for (auto _ : state) {
+    for (int i = 0; i < state.range(0); ++i) {
       benchmark::DoNotOptimize(i);
     }
   }
-  for (int i = 0; i < state.range_x(); ++i) {
+  for (int i = 0; i < state.range(0); ++i) {
     benchmark::DoNotOptimize(i);
   }
 }
 BASIC_BENCHMARK_TEST(BM_spin_pause_after);
 BASIC_BENCHMARK_TEST(BM_spin_pause_after)->ThreadPerCpu();
 
-
 void BM_spin_pause_before_and_after(benchmark::State& state) {
-  for (int i = 0; i < state.range_x(); ++i) {
+  for (int i = 0; i < state.range(0); ++i) {
     benchmark::DoNotOptimize(i);
   }
-  while(state.KeepRunning()) {
-    for (int i = 0; i < state.range_x(); ++i) {
+  for (auto _ : state) {
+    for (int i = 0; i < state.range(0); ++i) {
       benchmark::DoNotOptimize(i);
     }
   }
-  for (int i = 0; i < state.range_x(); ++i) {
+  for (int i = 0; i < state.range(0); ++i) {
     benchmark::DoNotOptimize(i);
   }
 }
 BASIC_BENCHMARK_TEST(BM_spin_pause_before_and_after);
 BASIC_BENCHMARK_TEST(BM_spin_pause_before_and_after)->ThreadPerCpu();
 
-
 void BM_empty_stop_start(benchmark::State& state) {
-  while (state.KeepRunning()) { }
+  for (auto _ : state) {
+  }
 }
 BENCHMARK(BM_empty_stop_start);
 BENCHMARK(BM_empty_stop_start)->ThreadPerCpu();
 
-BENCHMARK_MAIN()
+
+void BM_KeepRunning(benchmark::State& state) {
+  benchmark::IterationCount iter_count = 0;
+  assert(iter_count == state.iterations());
+  while (state.KeepRunning()) {
+    ++iter_count;
+  }
+  assert(iter_count == state.iterations());
+}
+BENCHMARK(BM_KeepRunning);
+
+void BM_KeepRunningBatch(benchmark::State& state) {
+  // Choose a prime batch size to avoid evenly dividing max_iterations.
+  const benchmark::IterationCount batch_size = 101;
+  benchmark::IterationCount iter_count = 0;
+  while (state.KeepRunningBatch(batch_size)) {
+    iter_count += batch_size;
+  }
+  assert(state.iterations() == iter_count);
+}
+BENCHMARK(BM_KeepRunningBatch);
+
+void BM_RangedFor(benchmark::State& state) {
+  benchmark::IterationCount iter_count = 0;
+  for (auto _ : state) {
+    ++iter_count;
+  }
+  assert(iter_count == state.max_iterations);
+}
+BENCHMARK(BM_RangedFor);
+
+// Ensure that StateIterator provides all the necessary typedefs required to
+// instantiate std::iterator_traits.
+static_assert(std::is_same<
+  typename std::iterator_traits<benchmark::State::StateIterator>::value_type,
+  typename benchmark::State::StateIterator::value_type>::value, "");
+
+BENCHMARK_MAIN();
diff --git a/test/benchmark_gtest.cc b/test/benchmark_gtest.cc
new file mode 100644
index 0000000..6dbf7a5
--- /dev/null
+++ b/test/benchmark_gtest.cc
@@ -0,0 +1,138 @@
+#include <cstdint>
+#include <limits>
+#include <vector>
+
+#include "../src/benchmark_register.h"
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+
+namespace benchmark {
+namespace internal {
+namespace {
+
+// Tests for internal::AddRange(dst, lo, hi, mult); each case pins the exact
+// sequence appended to dst.
+TEST(AddRangeTest, Simple) {
+  std::vector<int> dst;
+  AddRange(&dst, 1, 2, 2);
+  EXPECT_THAT(dst, testing::ElementsAre(1, 2));
+}
+
+TEST(AddRangeTest, Simple64) {
+  std::vector<int64_t> dst;
+  AddRange(&dst, static_cast<int64_t>(1), static_cast<int64_t>(2), 2);
+  EXPECT_THAT(dst, testing::ElementsAre(1, 2));
+}
+
+TEST(AddRangeTest, Advanced) {
+  std::vector<int> dst;
+  AddRange(&dst, 5, 15, 2);
+  EXPECT_THAT(dst, testing::ElementsAre(5, 8, 15));
+}
+
+TEST(AddRangeTest, Advanced64) {
+  std::vector<int64_t> dst;
+  AddRange(&dst, static_cast<int64_t>(5), static_cast<int64_t>(15), 2);
+  EXPECT_THAT(dst, testing::ElementsAre(5, 8, 15));
+}
+
+TEST(AddRangeTest, FullRange8) {
+  std::vector<int8_t> dst;
+  AddRange(&dst, int8_t{1}, std::numeric_limits<int8_t>::max(), 8);
+  EXPECT_THAT(dst, testing::ElementsAre(1, 8, 64, 127));
+}
+
+TEST(AddRangeTest, FullRange64) {
+  std::vector<int64_t> dst;
+  AddRange(&dst, int64_t{1}, std::numeric_limits<int64_t>::max(), 1024);
+  EXPECT_THAT(
+      dst, testing::ElementsAre(1LL, 1024LL, 1048576LL, 1073741824LL,
+                                1099511627776LL, 1125899906842624LL,
+                                1152921504606846976LL, 9223372036854775807LL));
+}
+
+TEST(AddRangeTest, NegativeRanges) {
+  std::vector<int> dst;
+  AddRange(&dst, -8, 0, 2);
+  EXPECT_THAT(dst, testing::ElementsAre(-8, -4, -2, -1, 0));
+}
+
+TEST(AddRangeTest, StrictlyNegative) {
+  std::vector<int> dst;
+  AddRange(&dst, -8, -1, 2);
+  EXPECT_THAT(dst, testing::ElementsAre(-8, -4, -2, -1));
+}
+
+TEST(AddRangeTest, SymmetricNegativeRanges) {
+  std::vector<int> dst;
+  AddRange(&dst, -8, 8, 2);
+  EXPECT_THAT(dst, testing::ElementsAre(-8, -4, -2, -1, 0, 1, 2, 4, 8));
+}
+
+TEST(AddRangeTest, SymmetricNegativeRangesOddMult) {
+  std::vector<int> dst;
+  AddRange(&dst, -30, 32, 5);
+  EXPECT_THAT(dst, testing::ElementsAre(-30, -25, -5, -1, 0, 1, 5, 25, 32));
+}
+
+TEST(AddRangeTest, NegativeRangesAsymmetric) {
+  std::vector<int> dst;
+  AddRange(&dst, -3, 5, 2);
+  EXPECT_THAT(dst, testing::ElementsAre(-3, -2, -1, 0, 1, 2, 4, 5));
+}
+
+TEST(AddRangeTest, NegativeRangesLargeStep) {
+  // Always include -1, 0, 1 when crossing zero.
+  std::vector<int> dst;
+  AddRange(&dst, -8, 8, 10);
+  EXPECT_THAT(dst, testing::ElementsAre(-8, -1, 0, 1, 8));
+}
+
+TEST(AddRangeTest, ZeroOnlyRange) {
+  std::vector<int> dst;
+  AddRange(&dst, 0, 0, 2);
+  EXPECT_THAT(dst, testing::ElementsAre(0));
+}
+
+TEST(AddRangeTest, ZeroStartingRange) {
+  std::vector<int> dst;
+  AddRange(&dst, 0, 2, 2);
+  EXPECT_THAT(dst, testing::ElementsAre(0, 1, 2));
+}
+
+TEST(AddRangeTest, NegativeRange64) {
+  std::vector<int64_t> dst;
+  AddRange<int64_t>(&dst, -4, 4, 2);
+  EXPECT_THAT(dst, testing::ElementsAre(-4, -2, -1, 0, 1, 2, 4));
+}
+
+TEST(AddRangeTest, NegativeRangePreservesExistingOrder) {
+  // If elements already exist in the range, ensure we don't change
+  // their ordering by adding negative values.
+  std::vector<int64_t> dst = {1, 2, 3};
+  AddRange<int64_t>(&dst, -2, 2, 2);
+  EXPECT_THAT(dst, testing::ElementsAre(1, 2, 3, -2, -1, 0, 1, 2));
+}
+
+TEST(AddRangeTest, FullNegativeRange64) {
+  std::vector<int64_t> dst;
+  const auto min = std::numeric_limits<int64_t>::min();
+  const auto max = std::numeric_limits<int64_t>::max();
+  AddRange(&dst, min, max, 1024);
+  EXPECT_THAT(
+      dst, testing::ElementsAreArray(std::vector<int64_t>{
+               min, -1152921504606846976LL, -1125899906842624LL,
+               -1099511627776LL, -1073741824LL, -1048576LL, -1024LL, -1LL, 0LL,
+               1LL, 1024LL, 1048576LL, 1073741824LL, 1099511627776LL,
+               1125899906842624LL, 1152921504606846976LL, max}));
+}
+
+TEST(AddRangeTest, Simple8) {
+  std::vector<int8_t> dst;
+  AddRange<int8_t>(&dst, 1, 8, 2);
+  EXPECT_THAT(dst, testing::ElementsAre(1, 2, 4, 8));
+}
+
+}  // namespace
+}  // namespace internal
+}  // namespace benchmark
diff --git a/test/benchmark_name_gtest.cc b/test/benchmark_name_gtest.cc
new file mode 100644
index 0000000..afb401c
--- /dev/null
+++ b/test/benchmark_name_gtest.cc
@@ -0,0 +1,78 @@
+#include <string>
+
+#include "benchmark/benchmark.h"
+#include "gtest/gtest.h"
+
+namespace {
+
+using namespace benchmark;
+using namespace benchmark::internal;
+
+// Tests for BenchmarkName::str(): the expected strings show the non-empty
+// fields joined with '/', with empty fields left out.
+TEST(BenchmarkNameTest, Empty) {
+  const auto name = BenchmarkName();
+  EXPECT_EQ(name.str(), std::string());
+}
+
+TEST(BenchmarkNameTest, FunctionName) {
+  auto name = BenchmarkName();
+  name.function_name = "function_name";
+  EXPECT_EQ(name.str(), "function_name");
+}
+
+TEST(BenchmarkNameTest, FunctionNameAndArgs) {
+  auto name = BenchmarkName();
+  name.function_name = "function_name";
+  name.args = "some_args:3/4/5";
+  EXPECT_EQ(name.str(), "function_name/some_args:3/4/5");
+}
+
+TEST(BenchmarkNameTest, MinTime) {
+  auto name = BenchmarkName();
+  name.function_name = "function_name";
+  name.args = "some_args:3/4";
+  name.min_time = "min_time:3.4s";
+  EXPECT_EQ(name.str(), "function_name/some_args:3/4/min_time:3.4s");
+}
+
+TEST(BenchmarkNameTest, Iterations) {
+  auto name = BenchmarkName();
+  name.function_name = "function_name";
+  name.min_time = "min_time:3.4s";
+  name.iterations = "iterations:42";
+  EXPECT_EQ(name.str(), "function_name/min_time:3.4s/iterations:42");
+}
+
+TEST(BenchmarkNameTest, Repetitions) {
+  auto name = BenchmarkName();
+  name.function_name = "function_name";
+  name.min_time = "min_time:3.4s";
+  name.repetitions = "repetitions:24";
+  EXPECT_EQ(name.str(), "function_name/min_time:3.4s/repetitions:24");
+}
+
+TEST(BenchmarkNameTest, TimeType) {
+  auto name = BenchmarkName();
+  name.function_name = "function_name";
+  name.min_time = "min_time:3.4s";
+  name.time_type = "hammer_time";
+  EXPECT_EQ(name.str(), "function_name/min_time:3.4s/hammer_time");
+}
+
+TEST(BenchmarkNameTest, Threads) {
+  auto name = BenchmarkName();
+  name.function_name = "function_name";
+  name.min_time = "min_time:3.4s";
+  name.threads = "threads:256";
+  EXPECT_EQ(name.str(), "function_name/min_time:3.4s/threads:256");
+}
+
+TEST(BenchmarkNameTest, TestEmptyFunctionName) {
+  auto name = BenchmarkName();
+  name.args = "first:3/second:4";
+  name.threads = "threads:22";
+  EXPECT_EQ(name.str(), "first:3/second:4/threads:22");
+}
+
+}  // end namespace
diff --git a/test/benchmark_test.cc b/test/benchmark_test.cc
index 66f5956..3cd4f55 100644
--- a/test/benchmark_test.cc
+++ b/test/benchmark_test.cc
@@ -4,6 +4,7 @@
 #include <math.h>
 #include <stdint.h>
 
+#include <chrono>
 #include <cstdlib>
 #include <iostream>
 #include <limits>
@@ -13,15 +14,14 @@
 #include <set>
 #include <sstream>
 #include <string>
-#include <vector>
-#include <chrono>
 #include <thread>
 #include <utility>
+#include <vector>
 
 #if defined(__GNUC__)
-# define BENCHMARK_NOINLINE __attribute__((noinline))
+#define BENCHMARK_NOINLINE __attribute__((noinline))
 #else
-# define BENCHMARK_NOINLINE
+#define BENCHMARK_NOINLINE
 #endif
 
 namespace {
@@ -40,10 +40,9 @@ double CalculatePi(int depth) {
   return (pi - 1.0) * 4;
 }
 
-std::set<int> ConstructRandomSet(int size) {
-  std::set<int> s;
-  for (int i = 0; i < size; ++i)
-    s.insert(i);
+std::set<int64_t> ConstructRandomSet(int64_t size) {
+  std::set<int64_t> s;
+  for (int i = 0; i < size; ++i) s.insert(s.end(), i);
   return s;
 }
 
@@ -54,8 +53,7 @@ std::vector<int>* test_vector = nullptr;
 
 static void BM_Factorial(benchmark::State& state) {
   int fac_42 = 0;
-  while (state.KeepRunning())
-    fac_42 = Factorial(8);
+  for (auto _ : state) fac_42 = Factorial(8);
   // Prevent compiler optimizations
   std::stringstream ss;
   ss << fac_42;
@@ -66,8 +64,7 @@ BENCHMARK(BM_Factorial)->UseRealTime();
 
 static void BM_CalculatePiRange(benchmark::State& state) {
   double pi = 0.0;
-  while (state.KeepRunning())
-    pi = CalculatePi(state.range_x());
+  for (auto _ : state) pi = CalculatePi(static_cast<int>(state.range(0)));
   std::stringstream ss;
   ss << pi;
   state.SetLabel(ss.str());
@@ -76,8 +73,8 @@ BENCHMARK_RANGE(BM_CalculatePiRange, 1, 1024 * 1024);
 
 static void BM_CalculatePi(benchmark::State& state) {
   static const int depth = 1024;
-  while (state.KeepRunning()) {
-    benchmark::DoNotOptimize(CalculatePi(depth));
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(CalculatePi(static_cast<int>(depth)));
   }
 }
 BENCHMARK(BM_CalculatePi)->Threads(8);
@@ -85,44 +82,48 @@ BENCHMARK(BM_CalculatePi)->ThreadRange(1, 32);
 BENCHMARK(BM_CalculatePi)->ThreadPerCpu();
 
 static void BM_SetInsert(benchmark::State& state) {
-  while (state.KeepRunning()) {
+  std::set<int64_t> data;
+  for (auto _ : state) {
     state.PauseTiming();
-    std::set<int> data = ConstructRandomSet(state.range_x());
+    data = ConstructRandomSet(state.range(0));
     state.ResumeTiming();
-    for (int j = 0; j < state.range_y(); ++j)
-      data.insert(rand());
+    for (int j = 0; j < state.range(1); ++j) data.insert(rand());
   }
-  state.SetItemsProcessed(state.iterations() * state.range_y());
-  state.SetBytesProcessed(state.iterations() * state.range_y() * sizeof(int));
+  state.SetItemsProcessed(state.iterations() * state.range(1));
+  state.SetBytesProcessed(state.iterations() * state.range(1) * sizeof(int));
 }
-BENCHMARK(BM_SetInsert)->RangePair(1<<10,8<<10, 1,10);
 
-template<typename Container, typename ValueType = typename Container::value_type>
+// Test many inserts at once to reduce the total iterations needed. Otherwise, the slower,
+// non-timed part of each iteration will make the benchmark take forever.
+BENCHMARK(BM_SetInsert)->Ranges({{1 << 10, 8 << 10}, {128, 512}});
+
+template <typename Container,
+          typename ValueType = typename Container::value_type>
 static void BM_Sequential(benchmark::State& state) {
   ValueType v = 42;
-  while (state.KeepRunning()) {
+  for (auto _ : state) {
     Container c;
-    for (int i = state.range_x(); --i; )
-      c.push_back(v);
+    for (int64_t i = state.range(0); --i;) c.push_back(v);
   }
-  const size_t items_processed = state.iterations() * state.range_x();
+  const int64_t items_processed = state.iterations() * state.range(0);
   state.SetItemsProcessed(items_processed);
   state.SetBytesProcessed(items_processed * sizeof(v));
 }
-BENCHMARK_TEMPLATE2(BM_Sequential, std::vector<int>, int)->Range(1 << 0, 1 << 10);
+BENCHMARK_TEMPLATE2(BM_Sequential, std::vector<int>, int)
+    ->Range(1 << 0, 1 << 10);
 BENCHMARK_TEMPLATE(BM_Sequential, std::list<int>)->Range(1 << 0, 1 << 10);
 // Test the variadic version of BENCHMARK_TEMPLATE in C++11 and beyond.
-#if __cplusplus >= 201103L
+#ifdef BENCHMARK_HAS_CXX11
 BENCHMARK_TEMPLATE(BM_Sequential, std::vector<int>, int)->Arg(512);
 #endif
 
 static void BM_StringCompare(benchmark::State& state) {
-  std::string s1(state.range_x(), '-');
-  std::string s2(state.range_x(), '-');
-  while (state.KeepRunning())
-    benchmark::DoNotOptimize(s1.compare(s2));
+  size_t len = static_cast<size_t>(state.range(0));
+  std::string s1(len, '-');
+  std::string s2(len, '-');
+  for (auto _ : state) benchmark::DoNotOptimize(s1.compare(s2));
 }
-BENCHMARK(BM_StringCompare)->Range(1, 1<<20);
+BENCHMARK(BM_StringCompare)->Range(1, 1 << 20);
 
 static void BM_SetupTeardown(benchmark::State& state) {
   if (state.thread_index == 0) {
@@ -130,9 +131,9 @@ static void BM_SetupTeardown(benchmark::State& state) {
     test_vector = new std::vector<int>();
   }
   int i = 0;
-  while (state.KeepRunning()) {
+  for (auto _ : state) {
     std::lock_guard<std::mutex> l(test_vector_mu);
-    if (i%2 == 0)
+    if (i % 2 == 0)
       test_vector->push_back(i);
     else
       test_vector->pop_back();
@@ -146,24 +147,24 @@ BENCHMARK(BM_SetupTeardown)->ThreadPerCpu();
 
 static void BM_LongTest(benchmark::State& state) {
   double tracker = 0.0;
-  while (state.KeepRunning()) {
-    for (int i = 0; i < state.range_x(); ++i)
+  for (auto _ : state) {
+    for (int i = 0; i < state.range(0); ++i)
       benchmark::DoNotOptimize(tracker += i);
   }
 }
-BENCHMARK(BM_LongTest)->Range(1<<16,1<<28);
+BENCHMARK(BM_LongTest)->Range(1 << 16, 1 << 28);
 
 static void BM_ParallelMemset(benchmark::State& state) {
-  int size = state.range_x() / sizeof(int);
-  int thread_size = size / state.threads;
+  int64_t size = state.range(0) / static_cast<int64_t>(sizeof(int));
+  int thread_size = static_cast<int>(size) / state.threads;
   int from = thread_size * state.thread_index;
   int to = from + thread_size;
 
   if (state.thread_index == 0) {
-    test_vector = new std::vector<int>(size);
+    test_vector = new std::vector<int>(static_cast<size_t>(size));
   }
 
-  while (state.KeepRunning()) {
+  for (auto _ : state) {
     for (int i = from; i < to; i++) {
       // No need to lock test_vector_mu as ranges
       // do not overlap between threads.
@@ -178,22 +179,20 @@ static void BM_ParallelMemset(benchmark::State& state) {
 BENCHMARK(BM_ParallelMemset)->Arg(10 << 20)->ThreadRange(1, 4);
 
 static void BM_ManualTiming(benchmark::State& state) {
-  size_t slept_for = 0;
-  int microseconds = state.range_x();
-  std::chrono::duration<double, std::micro> sleep_duration {
-    static_cast<double>(microseconds)
-  };
-
-  while (state.KeepRunning()) {
-    auto start   = std::chrono::high_resolution_clock::now();
+  int64_t slept_for = 0;
+  int64_t microseconds = state.range(0);
+  std::chrono::duration<double, std::micro> sleep_duration{
+      static_cast<double>(microseconds)};
+
+  for (auto _ : state) {
+    auto start = std::chrono::high_resolution_clock::now();
     // Simulate some useful workload with a sleep
-    std::this_thread::sleep_for(std::chrono::duration_cast<
-      std::chrono::nanoseconds>(sleep_duration));
-    auto end     = std::chrono::high_resolution_clock::now();
+    std::this_thread::sleep_for(
+        std::chrono::duration_cast<std::chrono::nanoseconds>(sleep_duration));
+    auto end = std::chrono::high_resolution_clock::now();
 
     auto elapsed =
-      std::chrono::duration_cast<std::chrono::duration<double>>(
-        end - start);
+        std::chrono::duration_cast<std::chrono::duration<double>>(end - start);
 
     state.SetIterationTime(elapsed.count());
     slept_for += microseconds;
@@ -203,22 +202,44 @@ static void BM_ManualTiming(benchmark::State& state) {
 BENCHMARK(BM_ManualTiming)->Range(1, 1 << 14)->UseRealTime();
 BENCHMARK(BM_ManualTiming)->Range(1, 1 << 14)->UseManualTime();
 
-#if __cplusplus >= 201103L
+#ifdef BENCHMARK_HAS_CXX11
 
-template <class ...Args>
+template <class... Args>
 void BM_with_args(benchmark::State& state, Args&&...) {
-  while (state.KeepRunning()) {}
+  for (auto _ : state) {
+  }
 }
 BENCHMARK_CAPTURE(BM_with_args, int_test, 42, 43, 44);
-BENCHMARK_CAPTURE(BM_with_args, string_and_pair_test,
-                  std::string("abc"), std::pair<int, double>(42, 3.8));
+BENCHMARK_CAPTURE(BM_with_args, string_and_pair_test, std::string("abc"),
+                  std::pair<int, double>(42, 3.8));
 
 void BM_non_template_args(benchmark::State& state, int, double) {
   while(state.KeepRunning()) {}
 }
 BENCHMARK_CAPTURE(BM_non_template_args, basic_test, 0, 0);
 
-#endif // __cplusplus >= 201103L
-
-BENCHMARK_MAIN()
+#endif  // BENCHMARK_HAS_CXX11
+
+static void BM_DenseThreadRanges(benchmark::State& st) {
+  switch (st.range(0)) {
+    case 1:
+      assert(st.threads == 1 || st.threads == 2 || st.threads == 3);
+      break;
+    case 2:
+      assert(st.threads == 1 || st.threads == 3 || st.threads == 4);
+      break;
+    case 3:
+      assert(st.threads == 5 || st.threads == 8 || st.threads == 11 ||
+             st.threads == 14);
+      break;
+    default:
+      assert(false && "Invalid test case number");
+  }
+  while (st.KeepRunning()) {
+  }
+}
+BENCHMARK(BM_DenseThreadRanges)->Arg(1)->DenseThreadRange(1, 3);
+BENCHMARK(BM_DenseThreadRanges)->Arg(2)->DenseThreadRange(1, 4, 2);
+BENCHMARK(BM_DenseThreadRanges)->Arg(3)->DenseThreadRange(5, 14, 3);
 
+BENCHMARK_MAIN();
diff --git a/test/clobber_memory_assembly_test.cc b/test/clobber_memory_assembly_test.cc
new file mode 100644
index 0000000..f41911a
--- /dev/null
+++ b/test/clobber_memory_assembly_test.cc
@@ -0,0 +1,64 @@
+#include <benchmark/benchmark.h>
+
+#ifdef __clang__
+#pragma clang diagnostic ignored "-Wreturn-type"
+#endif
+
+extern "C" {
+
+extern int ExternInt;
+extern int ExternInt2;
+extern int ExternInt3;
+
+}
+
+// CHECK-LABEL: test_basic:
+extern "C" void test_basic() {
+  int x;
+  benchmark::DoNotOptimize(&x);
+  x = 101;
+  benchmark::ClobberMemory();
+  // CHECK: leaq [[DEST:[^,]+]], %rax
+  // CHECK: movl $101, [[DEST]]
+  // CHECK: ret
+}
+
+// CHECK-LABEL: test_redundant_store:
+extern "C" void test_redundant_store() {
+  ExternInt = 3;
+  benchmark::ClobberMemory();
+  ExternInt = 51;
+  // CHECK-DAG: ExternInt
+  // CHECK-DAG: movl $3
+  // CHECK: movl $51
+}
+
+// CHECK-LABEL: test_redundant_read:
+extern "C" void test_redundant_read() {
+  int x;
+  benchmark::DoNotOptimize(&x);
+  x = ExternInt;
+  benchmark::ClobberMemory();
+  x = ExternInt2;
+  // CHECK: leaq [[DEST:[^,]+]], %rax
+  // CHECK: ExternInt(%rip)
+  // CHECK: movl %eax, [[DEST]]
+  // CHECK-NOT: ExternInt2
+  // CHECK: ret
+}
+
+// CHECK-LABEL: test_redundant_read2:
+extern "C" void test_redundant_read2() {
+  int x;
+  benchmark::DoNotOptimize(&x);
+  x = ExternInt;
+  benchmark::ClobberMemory();
+  x = ExternInt2;
+  benchmark::ClobberMemory();
+  // CHECK: leaq [[DEST:[^,]+]], %rax
+  // CHECK: ExternInt(%rip)
+  // CHECK: movl %eax, [[DEST]]
+  // CHECK: ExternInt2(%rip)
+  // CHECK: movl %eax, [[DEST]]
+  // CHECK: ret
+}
diff --git a/test/commandlineflags_gtest.cc b/test/commandlineflags_gtest.cc
new file mode 100644
index 0000000..656020f
--- /dev/null
+++ b/test/commandlineflags_gtest.cc
@@ -0,0 +1,201 @@
+#include <cstdlib>
+
+#include "../src/commandlineflags.h"
+#include "../src/internal_macros.h"
+#include "gtest/gtest.h"
+
+namespace benchmark {
+namespace {
+
+#if defined(BENCHMARK_OS_WINDOWS)
+int setenv(const char* name, const char* value, int overwrite) {
+  if (!overwrite) {
+    // NOTE: getenv_s is far superior but not available under mingw.
+    char* env_value = getenv(name);
+    if (env_value == nullptr) {
+      return -1;
+    }
+  }
+  return _putenv_s(name, value);
+}
+
+int unsetenv(const char* name) {
+  return _putenv_s(name, "");  // An empty value removes the variable (CRT).
+}
+
+#endif  // BENCHMARK_OS_WINDOWS
+
+TEST(BoolFromEnv, Default) {
+  ASSERT_EQ(unsetenv("NOT_IN_ENV"), 0);
+  EXPECT_EQ(BoolFromEnv("not_in_env", true), true);
+}
+
+TEST(BoolFromEnv, False) {
+  ASSERT_EQ(setenv("IN_ENV", "0", 1), 0);
+  EXPECT_EQ(BoolFromEnv("in_env", true), false);
+  unsetenv("IN_ENV");
+
+  ASSERT_EQ(setenv("IN_ENV", "N", 1), 0);
+  EXPECT_EQ(BoolFromEnv("in_env", true), false);
+  unsetenv("IN_ENV");
+
+  ASSERT_EQ(setenv("IN_ENV", "n", 1), 0);
+  EXPECT_EQ(BoolFromEnv("in_env", true), false);
+  unsetenv("IN_ENV");
+
+  ASSERT_EQ(setenv("IN_ENV", "NO", 1), 0);
+  EXPECT_EQ(BoolFromEnv("in_env", true), false);
+  unsetenv("IN_ENV");
+
+  ASSERT_EQ(setenv("IN_ENV", "No", 1), 0);
+  EXPECT_EQ(BoolFromEnv("in_env", true), false);
+  unsetenv("IN_ENV");
+
+  ASSERT_EQ(setenv("IN_ENV", "no", 1), 0);
+  EXPECT_EQ(BoolFromEnv("in_env", true), false);
+  unsetenv("IN_ENV");
+
+  ASSERT_EQ(setenv("IN_ENV", "F", 1), 0);
+  EXPECT_EQ(BoolFromEnv("in_env", true), false);
+  unsetenv("IN_ENV");
+
+  ASSERT_EQ(setenv("IN_ENV", "f", 1), 0);
+  EXPECT_EQ(BoolFromEnv("in_env", true), false);
+  unsetenv("IN_ENV");
+
+  ASSERT_EQ(setenv("IN_ENV", "FALSE", 1), 0);
+  EXPECT_EQ(BoolFromEnv("in_env", true), false);
+  unsetenv("IN_ENV");
+
+  ASSERT_EQ(setenv("IN_ENV", "False", 1), 0);
+  EXPECT_EQ(BoolFromEnv("in_env", true), false);
+  unsetenv("IN_ENV");
+
+  ASSERT_EQ(setenv("IN_ENV", "false", 1), 0);
+  EXPECT_EQ(BoolFromEnv("in_env", true), false);
+  unsetenv("IN_ENV");
+
+  ASSERT_EQ(setenv("IN_ENV", "OFF", 1), 0);
+  EXPECT_EQ(BoolFromEnv("in_env", true), false);
+  unsetenv("IN_ENV");
+
+  ASSERT_EQ(setenv("IN_ENV", "Off", 1), 0);
+  EXPECT_EQ(BoolFromEnv("in_env", true), false);
+  unsetenv("IN_ENV");
+
+  ASSERT_EQ(setenv("IN_ENV", "off", 1), 0);
+  EXPECT_EQ(BoolFromEnv("in_env", true), false);
+  unsetenv("IN_ENV");
+}
+
+TEST(BoolFromEnv, True) {
+  ASSERT_EQ(setenv("IN_ENV", "1", 1), 0);
+  EXPECT_EQ(BoolFromEnv("in_env", false), true);
+  unsetenv("IN_ENV");
+
+  ASSERT_EQ(setenv("IN_ENV", "Y", 1), 0);
+  EXPECT_EQ(BoolFromEnv("in_env", false), true);
+  unsetenv("IN_ENV");
+
+  ASSERT_EQ(setenv("IN_ENV", "y", 1), 0);
+  EXPECT_EQ(BoolFromEnv("in_env", false), true);
+  unsetenv("IN_ENV");
+
+  ASSERT_EQ(setenv("IN_ENV", "YES", 1), 0);
+  EXPECT_EQ(BoolFromEnv("in_env", false), true);
+  unsetenv("IN_ENV");
+
+  ASSERT_EQ(setenv("IN_ENV", "Yes", 1), 0);
+  EXPECT_EQ(BoolFromEnv("in_env", false), true);
+  unsetenv("IN_ENV");
+
+  ASSERT_EQ(setenv("IN_ENV", "yes", 1), 0);
+  EXPECT_EQ(BoolFromEnv("in_env", false), true);
+  unsetenv("IN_ENV");
+
+  ASSERT_EQ(setenv("IN_ENV", "T", 1), 0);
+  EXPECT_EQ(BoolFromEnv("in_env", false), true);
+  unsetenv("IN_ENV");
+
+  ASSERT_EQ(setenv("IN_ENV", "t", 1), 0);
+  EXPECT_EQ(BoolFromEnv("in_env", false), true);
+  unsetenv("IN_ENV");
+
+  ASSERT_EQ(setenv("IN_ENV", "TRUE", 1), 0);
+  EXPECT_EQ(BoolFromEnv("in_env", false), true);
+  unsetenv("IN_ENV");
+
+  ASSERT_EQ(setenv("IN_ENV", "True", 1), 0);
+  EXPECT_EQ(BoolFromEnv("in_env", false), true);
+  unsetenv("IN_ENV");
+
+  ASSERT_EQ(setenv("IN_ENV", "true", 1), 0);
+  EXPECT_EQ(BoolFromEnv("in_env", false), true);
+  unsetenv("IN_ENV");
+
+  ASSERT_EQ(setenv("IN_ENV", "ON", 1), 0);
+  EXPECT_EQ(BoolFromEnv("in_env", false), true);
+  unsetenv("IN_ENV");
+
+  ASSERT_EQ(setenv("IN_ENV", "On", 1), 0);
+  EXPECT_EQ(BoolFromEnv("in_env", false), true);
+  unsetenv("IN_ENV");
+
+  ASSERT_EQ(setenv("IN_ENV", "on", 1), 0);
+  EXPECT_EQ(BoolFromEnv("in_env", false), true);
+  unsetenv("IN_ENV");
+
+#ifndef BENCHMARK_OS_WINDOWS
+  ASSERT_EQ(setenv("IN_ENV", "", 1), 0);
+  EXPECT_EQ(BoolFromEnv("in_env", false), true);
+  unsetenv("IN_ENV");
+#endif
+}
+
+TEST(Int32FromEnv, NotInEnv) {
+  ASSERT_EQ(unsetenv("NOT_IN_ENV"), 0);
+  EXPECT_EQ(Int32FromEnv("not_in_env", 42), 42);
+}
+
+TEST(Int32FromEnv, InvalidInteger) {
+  ASSERT_EQ(setenv("IN_ENV", "foo", 1), 0);
+  EXPECT_EQ(Int32FromEnv("in_env", 42), 42);
+  unsetenv("IN_ENV");
+}
+
+TEST(Int32FromEnv, ValidInteger) {
+  ASSERT_EQ(setenv("IN_ENV", "42", 1), 0);
+  EXPECT_EQ(Int32FromEnv("in_env", 64), 42);
+  unsetenv("IN_ENV");
+}
+
+TEST(DoubleFromEnv, NotInEnv) {
+  ASSERT_EQ(unsetenv("NOT_IN_ENV"), 0);
+  EXPECT_EQ(DoubleFromEnv("not_in_env", 0.51), 0.51);
+}
+
+TEST(DoubleFromEnv, InvalidReal) {
+  ASSERT_EQ(setenv("IN_ENV", "foo", 1), 0);
+  EXPECT_EQ(DoubleFromEnv("in_env", 0.51), 0.51);
+  unsetenv("IN_ENV");
+}
+
+TEST(DoubleFromEnv, ValidReal) {
+  ASSERT_EQ(setenv("IN_ENV", "0.51", 1), 0);
+  EXPECT_EQ(DoubleFromEnv("in_env", 0.71), 0.51);
+  unsetenv("IN_ENV");
+}
+
+TEST(StringFromEnv, Default) {
+  ASSERT_EQ(unsetenv("NOT_IN_ENV"), 0);
+  EXPECT_STREQ(StringFromEnv("not_in_env", "foo"), "foo");
+}
+
+TEST(StringFromEnv, Valid) {
+  ASSERT_EQ(setenv("IN_ENV", "foo", 1), 0);
+  EXPECT_STREQ(StringFromEnv("in_env", "bar"), "foo");
+  unsetenv("IN_ENV");
+}
+
+}  // namespace
+}  // namespace benchmark
diff --git a/test/complexity_test.cc b/test/complexity_test.cc
index 8ab88f9..5681fdc 100644
--- a/test/complexity_test.cc
+++ b/test/complexity_test.cc
@@ -1,145 +1,52 @@
-
 #undef NDEBUG
-#include "benchmark/benchmark.h"
-#include "../src/check.h" // NOTE: check.h is for internal use only!
-#include "../src/re.h"    // NOTE: re.h is for internal use only
-#include <cassert>
-#include <cstring>
-#include <iostream>
-#include <sstream>
-#include <vector>
-#include <utility>
 #include <algorithm>
+#include <cassert>
 #include <cmath>
+#include <cstdlib>
+#include <vector>
+#include "benchmark/benchmark.h"
+#include "output_test.h"
 
 namespace {
 
-// ========================================================================= //
-// -------------------------- Testing Case --------------------------------- //
-// ========================================================================= //
-
-enum MatchRules {
-  MR_Default, // Skip non-matching lines until a match is found.
-  MR_Next    // Match must occur on the next line.
-};
-
-struct TestCase {
-  std::string regex;
-  int match_rule;
-
-  TestCase(std::string re, int rule = MR_Default) : regex(re), match_rule(rule) {}
-
-  void Check(std::stringstream& remaining_output) const {
-    benchmark::Regex r;
-    std::string err_str;
-    r.Init(regex, &err_str);
-    CHECK(err_str.empty()) << "Could not construct regex \"" << regex << "\""
-                           << " got Error: " << err_str;
-
-    std::string line;
-    while (remaining_output.eof() == false) {
-        CHECK(remaining_output.good());
-        std::getline(remaining_output, line);
-        if (r.Match(line)) return;
-        CHECK(match_rule != MR_Next) << "Expected line \"" << line
-                                     << "\" to match regex \"" << regex << "\"";
-    }
-
-    CHECK(remaining_output.eof() == false)
-        << "End of output reached before match for regex \"" << regex
-        << "\" was found";
-  }
-};
-
-std::vector<TestCase> ConsoleOutputTests;
-std::vector<TestCase> JSONOutputTests;
-std::vector<TestCase> CSVOutputTests;
-
-// ========================================================================= //
-// -------------------------- Test Helpers --------------------------------- //
-// ========================================================================= //
-
-class TestReporter : public benchmark::BenchmarkReporter {
-public:
-  TestReporter(std::vector<benchmark::BenchmarkReporter*> reps)
-      : reporters_(reps)  {}
-
-  virtual bool ReportContext(const Context& context) {
-    bool last_ret = false;
-    bool first = true;
-    for (auto rep : reporters_) {
-      bool new_ret = rep->ReportContext(context);
-      CHECK(first || new_ret == last_ret)
-          << "Reports return different values for ReportContext";
-      first = false;
-      last_ret = new_ret;
-    }
-    return last_ret;
-  }
-
-  virtual void ReportRuns(const std::vector<Run>& report) {
-    for (auto rep : reporters_)
-      rep->ReportRuns(report);
-  }
-
-  virtual void Finalize() {
-      for (auto rep : reporters_)
-        rep->Finalize();
-  }
-
-private:
-  std::vector<benchmark::BenchmarkReporter*> reporters_;
-};
-
-
-#define CONCAT2(x, y) x##y
-#define CONCAT(x, y) CONCAT2(x, y)
-
-#define ADD_CASES(...) \
-    int CONCAT(dummy, __LINE__) = AddCases(__VA_ARGS__)
-
-int AddCases(std::vector<TestCase>* out, std::initializer_list<TestCase> const& v) {
-  for (auto const& TC : v)
-    out->push_back(TC);
-  return 0;
-}
-
-template <class First>
-std::string join(First f) { return f; }
-
-template <class First, class ...Args>
-std::string join(First f, Args&&... args) {
-    return std::string(std::move(f)) + "[ ]+" + join(std::forward<Args>(args)...);
-}
-
-std::string dec_re = "[0-9]+\\.[0-9]+";
-
 #define ADD_COMPLEXITY_CASES(...) \
-    int CONCAT(dummy, __LINE__) = AddComplexityTest(__VA_ARGS__)
+  int CONCAT(dummy, __LINE__) = AddComplexityTest(__VA_ARGS__)
 
-int AddComplexityTest(std::vector<TestCase>* console_out, std::vector<TestCase>* json_out,
-                      std::vector<TestCase>* csv_out, std::string big_o_test_name, 
+int AddComplexityTest(std::string test_name, std::string big_o_test_name,
                       std::string rms_test_name, std::string big_o) {
-  std::string big_o_str = dec_re + " " + big_o;
-  AddCases(console_out, {
-    {join("^" + big_o_test_name + "", big_o_str, big_o_str) + "[ ]*$"},
-    {join("^" + rms_test_name + "", "[0-9]+ %", "[0-9]+ %") + "[ ]*$"}
-  });
-  AddCases(json_out, {
-    {"\"name\": \"" + big_o_test_name + "\",$"},
-    {"\"cpu_coefficient\": [0-9]+,$", MR_Next},
-    {"\"real_coefficient\": [0-9]{1,5},$", MR_Next},
-    {"\"big_o\": \"" + big_o + "\",$", MR_Next},
-    {"\"time_unit\": \"ns\"$", MR_Next},
-    {"}", MR_Next},
-    {"\"name\": \"" + rms_test_name + "\",$"},
-    {"\"rms\": [0-9]+%$", MR_Next},
-    {"}", MR_Next}
-  });
-  AddCases(csv_out, {
-    {"^\"" + big_o_test_name + "\",," + dec_re + "," + dec_re + "," + big_o + ",,,,,$"},
-    {"^\"" + rms_test_name + "\",," + dec_re + "," + dec_re + ",,,,,,$"}
-  });
+  SetSubstitutions({{"%name", test_name},
+                    {"%bigo_name", big_o_test_name},
+                    {"%rms_name", rms_test_name},
+                    {"%bigo_str", "[ ]* %float " + big_o},
+                    {"%bigo", big_o},
+                    {"%rms", "[ ]*[0-9]+ %"}});
+  AddCases(
+      TC_ConsoleOut,
+      {{"^%bigo_name %bigo_str %bigo_str[ ]*$"},
+       {"^%bigo_name", MR_Not},  // Assert we didn't only match a name.
+       {"^%rms_name %rms %rms[ ]*$", MR_Next}});
+  AddCases(TC_JSONOut, {{"\"name\": \"%bigo_name\",$"},
+                        {"\"run_name\": \"%name\",$", MR_Next},
+                        {"\"run_type\": \"aggregate\",$", MR_Next},
+                        {"\"repetitions\": %int,$", MR_Next},
+                        {"\"threads\": 1,$", MR_Next},
+                        {"\"aggregate_name\": \"BigO\",$", MR_Next},
+                        {"\"cpu_coefficient\": %float,$", MR_Next},
+                        {"\"real_coefficient\": %float,$", MR_Next},
+                        {"\"big_o\": \"%bigo\",$", MR_Next},
+                        {"\"time_unit\": \"ns\"$", MR_Next},
+                        {"}", MR_Next},
+                        {"\"name\": \"%rms_name\",$"},
+                        {"\"run_name\": \"%name\",$", MR_Next},
+                        {"\"run_type\": \"aggregate\",$", MR_Next},
+                        {"\"repetitions\": %int,$", MR_Next},
+                        {"\"threads\": 1,$", MR_Next},
+                        {"\"aggregate_name\": \"RMS\",$", MR_Next},
+                        {"\"rms\": %float$", MR_Next},
+                        {"}", MR_Next}});
+  AddCases(TC_CSVOut, {{"^\"%bigo_name\",,%float,%float,%bigo,,,,,$"},
+                       {"^\"%bigo_name\"", MR_Not},
+                       {"^\"%rms_name\",,%float,%float,,,,,,$", MR_Next}});
   return 0;
 }
 
@@ -150,148 +57,157 @@ int AddComplexityTest(std::vector<TestCase>* console_out, std::vector<TestCase>*
 // ========================================================================= //
 
 void BM_Complexity_O1(benchmark::State& state) {
-  while (state.KeepRunning()) {
+  for (auto _ : state) {
+    for (int i = 0; i < 1024; ++i) {
+      benchmark::DoNotOptimize(&i);
+    }
   }
-  state.SetComplexityN(state.range_x());
+  state.SetComplexityN(state.range(0));
 }
-BENCHMARK(BM_Complexity_O1) -> Range(1, 1<<18) -> Complexity(benchmark::o1);
-BENCHMARK(BM_Complexity_O1) -> Range(1, 1<<18) -> Complexity([](int){return 1.0; });
-BENCHMARK(BM_Complexity_O1) -> Range(1, 1<<18) -> Complexity();
-
-const char* big_o_1_test_name = "BM_Complexity_O1_BigO";
-const char* rms_o_1_test_name = "BM_Complexity_O1_RMS";
-const char* enum_auto_big_o_1 = "\\([0-9]+\\)";
-const char* lambda_big_o_1 = "f\\(N\\)";
+BENCHMARK(BM_Complexity_O1)->Range(1, 1 << 18)->Complexity(benchmark::o1);
+BENCHMARK(BM_Complexity_O1)->Range(1, 1 << 18)->Complexity();
+BENCHMARK(BM_Complexity_O1)
+    ->Range(1, 1 << 18)
+    ->Complexity([](benchmark::IterationCount) { return 1.0; });
+
+const char *one_test_name = "BM_Complexity_O1";
+const char *big_o_1_test_name = "BM_Complexity_O1_BigO";
+const char *rms_o_1_test_name = "BM_Complexity_O1_RMS";
+const char *enum_big_o_1 = "\\([0-9]+\\)";
+// FIXME: Tolerate both '(1)' and 'lgN' as output when the complexity is auto
+// deduced.
+// See https://github.com/google/benchmark/issues/272
+const char *auto_big_o_1 = "(\\([0-9]+\\))|(lgN)";
+const char *lambda_big_o_1 = "f\\(N\\)";
 
 // Add enum tests
-ADD_COMPLEXITY_CASES(&ConsoleOutputTests, &JSONOutputTests, &CSVOutputTests, 
-                     big_o_1_test_name, rms_o_1_test_name, enum_auto_big_o_1);
+ADD_COMPLEXITY_CASES(one_test_name, big_o_1_test_name, rms_o_1_test_name,
+                     enum_big_o_1);
+
+// Add auto enum tests
+ADD_COMPLEXITY_CASES(one_test_name, big_o_1_test_name, rms_o_1_test_name,
+                     auto_big_o_1);
 
 // Add lambda tests
-ADD_COMPLEXITY_CASES(&ConsoleOutputTests, &JSONOutputTests, &CSVOutputTests, 
-                     big_o_1_test_name, rms_o_1_test_name, lambda_big_o_1);
+ADD_COMPLEXITY_CASES(one_test_name, big_o_1_test_name, rms_o_1_test_name,
+                     lambda_big_o_1);
 
 // ========================================================================= //
 // --------------------------- Testing BigO O(N) --------------------------- //
 // ========================================================================= //
 
-std::vector<int> ConstructRandomVector(int size) {
+std::vector<int> ConstructRandomVector(int64_t size) {
   std::vector<int> v;
-  v.reserve(size);
+  v.reserve(static_cast<int>(size));
   for (int i = 0; i < size; ++i) {
-    v.push_back(rand() % size);
+    v.push_back(static_cast<int>(std::rand() % size));
   }
   return v;
 }
 
 void BM_Complexity_O_N(benchmark::State& state) {
-  auto v = ConstructRandomVector(state.range_x());
-  const int item_not_in_vector = state.range_x()*2; // Test worst case scenario (item not in vector)
-  while (state.KeepRunning()) {
-      benchmark::DoNotOptimize(std::find(v.begin(), v.end(), item_not_in_vector));
+  auto v = ConstructRandomVector(state.range(0));
+  // Test worst case scenario (item not in vector)
+  const int64_t item_not_in_vector = state.range(0) * 2;
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(std::find(v.begin(), v.end(), item_not_in_vector));
   }
-  state.SetComplexityN(state.range_x());
+  state.SetComplexityN(state.range(0));
 }
-BENCHMARK(BM_Complexity_O_N) -> RangeMultiplier(2) -> Range(1<<10, 1<<16) -> Complexity(benchmark::oN);
-BENCHMARK(BM_Complexity_O_N) -> RangeMultiplier(2) -> Range(1<<10, 1<<16) -> Complexity([](int n) -> double{return n; });
-BENCHMARK(BM_Complexity_O_N) -> RangeMultiplier(2) -> Range(1<<10, 1<<16) -> Complexity();
-
-const char* big_o_n_test_name = "BM_Complexity_O_N_BigO";
-const char* rms_o_n_test_name = "BM_Complexity_O_N_RMS";
-const char* enum_auto_big_o_n = "N";
-const char* lambda_big_o_n = "f\\(N\\)";
+BENCHMARK(BM_Complexity_O_N)
+    ->RangeMultiplier(2)
+    ->Range(1 << 10, 1 << 16)
+    ->Complexity(benchmark::oN);
+BENCHMARK(BM_Complexity_O_N)
+    ->RangeMultiplier(2)
+    ->Range(1 << 10, 1 << 16)
+    ->Complexity([](benchmark::IterationCount n) -> double {
+      return static_cast<double>(n);
+    });
+BENCHMARK(BM_Complexity_O_N)
+    ->RangeMultiplier(2)
+    ->Range(1 << 10, 1 << 16)
+    ->Complexity();
+
+const char *n_test_name = "BM_Complexity_O_N";
+const char *big_o_n_test_name = "BM_Complexity_O_N_BigO";
+const char *rms_o_n_test_name = "BM_Complexity_O_N_RMS";
+const char *enum_auto_big_o_n = "N";
+const char *lambda_big_o_n = "f\\(N\\)";
 
 // Add enum tests
-ADD_COMPLEXITY_CASES(&ConsoleOutputTests, &JSONOutputTests, &CSVOutputTests, 
-                     big_o_n_test_name, rms_o_n_test_name, enum_auto_big_o_n);
+ADD_COMPLEXITY_CASES(n_test_name, big_o_n_test_name, rms_o_n_test_name,
+                     enum_auto_big_o_n);
 
 // Add lambda tests
-ADD_COMPLEXITY_CASES(&ConsoleOutputTests, &JSONOutputTests, &CSVOutputTests, 
-                     big_o_n_test_name, rms_o_n_test_name, lambda_big_o_n);
+ADD_COMPLEXITY_CASES(n_test_name, big_o_n_test_name, rms_o_n_test_name,
+                     lambda_big_o_n);
 
 // ========================================================================= //
 // ------------------------- Testing BigO O(N*lgN) ------------------------- //
 // ========================================================================= //
 
 static void BM_Complexity_O_N_log_N(benchmark::State& state) {
-  auto v = ConstructRandomVector(state.range_x());
-  while (state.KeepRunning()) {
-      std::sort(v.begin(), v.end());
+  auto v = ConstructRandomVector(state.range(0));
+  for (auto _ : state) {
+    std::sort(v.begin(), v.end());
   }
-  state.SetComplexityN(state.range_x());
+  state.SetComplexityN(state.range(0));
 }
-BENCHMARK(BM_Complexity_O_N_log_N) -> RangeMultiplier(2) -> Range(1<<10, 1<<16) -> Complexity(benchmark::oNLogN);
-BENCHMARK(BM_Complexity_O_N_log_N) -> RangeMultiplier(2) -> Range(1<<10, 1<<16) -> Complexity([](int n) {return n * std::log2(n); });
-BENCHMARK(BM_Complexity_O_N_log_N) -> RangeMultiplier(2) -> Range(1<<10, 1<<16) -> Complexity();
-
-const char* big_o_n_lg_n_test_name = "BM_Complexity_O_N_log_N_BigO";
-const char* rms_o_n_lg_n_test_name = "BM_Complexity_O_N_log_N_RMS";
-const char* enum_auto_big_o_n_lg_n = "NlgN";
-const char* lambda_big_o_n_lg_n = "f\\(N\\)";
+static const double kLog2E = 1.44269504088896340736;
+BENCHMARK(BM_Complexity_O_N_log_N)
+    ->RangeMultiplier(2)
+    ->Range(1 << 10, 1 << 16)
+    ->Complexity(benchmark::oNLogN);
+BENCHMARK(BM_Complexity_O_N_log_N)
+    ->RangeMultiplier(2)
+    ->Range(1 << 10, 1 << 16)
+    ->Complexity([](benchmark::IterationCount n) {
+      return kLog2E * n * log(static_cast<double>(n));
+    });
+BENCHMARK(BM_Complexity_O_N_log_N)
+    ->RangeMultiplier(2)
+    ->Range(1 << 10, 1 << 16)
+    ->Complexity();
+
+const char *n_lg_n_test_name = "BM_Complexity_O_N_log_N";
+const char *big_o_n_lg_n_test_name = "BM_Complexity_O_N_log_N_BigO";
+const char *rms_o_n_lg_n_test_name = "BM_Complexity_O_N_log_N_RMS";
+const char *enum_auto_big_o_n_lg_n = "NlgN";
+const char *lambda_big_o_n_lg_n = "f\\(N\\)";
 
 // Add enum tests
-ADD_COMPLEXITY_CASES(&ConsoleOutputTests, &JSONOutputTests, &CSVOutputTests, 
-                     big_o_n_lg_n_test_name, rms_o_n_lg_n_test_name, enum_auto_big_o_n_lg_n);
+ADD_COMPLEXITY_CASES(n_lg_n_test_name, big_o_n_lg_n_test_name,
+                     rms_o_n_lg_n_test_name, enum_auto_big_o_n_lg_n);
 
 // Add lambda tests
-ADD_COMPLEXITY_CASES(&ConsoleOutputTests, &JSONOutputTests, &CSVOutputTests, 
-                     big_o_n_lg_n_test_name, rms_o_n_lg_n_test_name, lambda_big_o_n_lg_n);
-
+ADD_COMPLEXITY_CASES(n_lg_n_test_name, big_o_n_lg_n_test_name,
+                     rms_o_n_lg_n_test_name, lambda_big_o_n_lg_n);
 
 // ========================================================================= //
-// --------------------------- TEST CASES END ------------------------------ //
+// -------- Testing formatting of Complexity with captured args ------------ //
 // ========================================================================= //
 
+void BM_ComplexityCaptureArgs(benchmark::State& state, int n) {
+  for (auto _ : state) {
+    // This test requires a non-zero CPU time to avoid divide-by-zero
+    benchmark::DoNotOptimize(state.iterations());
+  }
+  state.SetComplexityN(n);
+}
 
-int main(int argc, char* argv[]) {
-  // Add --color_print=false to argv since we don't want to match color codes.
-  char new_arg[64];
-  char* new_argv[64];
-  std::copy(argv, argv + argc, new_argv);
-  new_argv[argc++] = std::strcpy(new_arg, "--color_print=false");
-  benchmark::Initialize(&argc, new_argv);
-
-  benchmark::ConsoleReporter CR;
-  benchmark::JSONReporter JR;
-  benchmark::CSVReporter CSVR;
-  struct ReporterTest {
-    const char* name;
-    std::vector<TestCase>& output_cases;
-    benchmark::BenchmarkReporter& reporter;
-    std::stringstream out_stream;
-    std::stringstream err_stream;
-
-    ReporterTest(const char* n,
-                 std::vector<TestCase>& out_tc,
-                 benchmark::BenchmarkReporter& br)
-        : name(n), output_cases(out_tc), reporter(br) {
-        reporter.SetOutputStream(&out_stream);
-        reporter.SetErrorStream(&err_stream);
-    }
-  } TestCases[] = {
-      {"ConsoleReporter", ConsoleOutputTests, CR},
-      {"JSONReporter", JSONOutputTests, JR},
-      {"CSVReporter", CSVOutputTests, CSVR}
-  };
-
-  // Create the test reporter and run the benchmarks.
-  std::cout << "Running benchmarks...\n";
-  TestReporter test_rep({&CR, &JR, &CSVR});
-  benchmark::RunSpecifiedBenchmarks(&test_rep);
-
-  for (auto& rep_test : TestCases) {
-      std::string msg = std::string("\nTesting ") + rep_test.name + " Output\n";
-      std::string banner(msg.size() - 1, '-');
-      std::cout << banner << msg << banner << "\n";
+BENCHMARK_CAPTURE(BM_ComplexityCaptureArgs, capture_test, 100)
+    ->Complexity(benchmark::oN)
+    ->Ranges({{1, 2}, {3, 4}});
 
-      std::cerr << rep_test.err_stream.str();
-      std::cout << rep_test.out_stream.str();
+const std::string complexity_capture_name =
+    "BM_ComplexityCaptureArgs/capture_test";
 
-      for (const auto& TC : rep_test.output_cases)
-        TC.Check(rep_test.out_stream);
+ADD_COMPLEXITY_CASES(complexity_capture_name, complexity_capture_name + "_BigO",
+                     complexity_capture_name + "_RMS", "N");
 
-      std::cout << "\n";
-  }
-  return 0;
-}
+// ========================================================================= //
+// --------------------------- TEST CASES END ------------------------------ //
+// ========================================================================= //
 
+int main(int argc, char *argv[]) { RunOutputTests(argc, argv); }
diff --git a/test/cxx03_test.cc b/test/cxx03_test.cc
index 56779d6..c4c9a52 100644
--- a/test/cxx03_test.cc
+++ b/test/cxx03_test.cc
@@ -1,4 +1,5 @@
-
+#undef NDEBUG
+#include <cassert>
 #include <cstddef>
 
 #include "benchmark/benchmark.h"
@@ -7,25 +8,56 @@
 #error C++11 or greater detected. Should be C++03.
 #endif
 
+#ifdef BENCHMARK_HAS_CXX11
+#error C++11 or greater detected by the library. BENCHMARK_HAS_CXX11 is defined.
+#endif
+
 void BM_empty(benchmark::State& state) {
-    while (state.KeepRunning()) {
-        volatile std::size_t x = state.iterations();
-        ((void)x);
-    }
+  while (state.KeepRunning()) {
+    volatile benchmark::IterationCount x = state.iterations();
+    ((void)x);
+  }
 }
 BENCHMARK(BM_empty);
 
+// The new C++11 interface for args/ranges requires initializer list support.
+// Therefore we provide the old interface to support C++03.
+void BM_old_arg_range_interface(benchmark::State& state) {
+  assert((state.range(0) == 1 && state.range(1) == 2) ||
+         (state.range(0) == 5 && state.range(1) == 6));
+  while (state.KeepRunning()) {
+  }
+}
+BENCHMARK(BM_old_arg_range_interface)->ArgPair(1, 2)->RangePair(5, 5, 6, 6);
+
 template <class T, class U>
 void BM_template2(benchmark::State& state) {
-    BM_empty(state);
+  BM_empty(state);
 }
 BENCHMARK_TEMPLATE2(BM_template2, int, long);
 
 template <class T>
 void BM_template1(benchmark::State& state) {
-    BM_empty(state);
+  BM_empty(state);
 }
 BENCHMARK_TEMPLATE(BM_template1, long);
 BENCHMARK_TEMPLATE1(BM_template1, int);
 
-BENCHMARK_MAIN()
+template <class T>
+struct BM_Fixture : public ::benchmark::Fixture {
+};
+
+BENCHMARK_TEMPLATE_F(BM_Fixture, BM_template1, long)(benchmark::State& state) {
+  BM_empty(state);
+}
+BENCHMARK_TEMPLATE1_F(BM_Fixture, BM_template2, int)(benchmark::State& state) {
+  BM_empty(state);
+}
+
+void BM_counters(benchmark::State& state) {
+  BM_empty(state);  // Run the shared empty measurement loop first.
+  state.counters["Foo"] = 2;  // Then publish a user counter named "Foo".
+}
+BENCHMARK(BM_counters);
+
+BENCHMARK_MAIN();
diff --git a/test/diagnostics_test.cc b/test/diagnostics_test.cc
index 60fa3b1..dd64a33 100644
--- a/test/diagnostics_test.cc
+++ b/test/diagnostics_test.cc
@@ -7,10 +7,11 @@
 // NOTE: Users should NOT include or use src/check.h. This is only done in
 // order to test library internals.
 
-#include "benchmark/benchmark_api.h"
-#include "../src/check.h"
-#include <stdexcept>
 #include <cstdlib>
+#include <stdexcept>
+
+#include "../src/check.h"
+#include "benchmark/benchmark.h"
 
 #if defined(__GNUC__) && !defined(__EXCEPTIONS)
 #define TEST_HAS_NO_EXCEPTIONS
@@ -25,17 +26,19 @@ void TestHandler() {
 }
 
 void try_invalid_pause_resume(benchmark::State& state) {
-#if !defined(NDEBUG) && !defined(TEST_HAS_NO_EXCEPTIONS)
+#if !defined(TEST_BENCHMARK_LIBRARY_HAS_NO_ASSERTIONS) && !defined(TEST_HAS_NO_EXCEPTIONS)
   try {
     state.PauseTiming();
     std::abort();
-  } catch (std::logic_error const&) {}
+  } catch (std::logic_error const&) {
+  }
   try {
     state.ResumeTiming();
     std::abort();
-  } catch (std::logic_error const&) {}
+  } catch (std::logic_error const&) {
+  }
 #else
-  (void)state; // avoid unused warning
+  (void)state;  // avoid unused warning
 #endif
 }
 
@@ -44,7 +47,7 @@ void BM_diagnostic_test(benchmark::State& state) {
 
   if (called_once == false) try_invalid_pause_resume(state);
 
-  while (state.KeepRunning()) {
+  for (auto _ : state) {
     benchmark::DoNotOptimize(state.iterations());
   }
 
@@ -54,7 +57,23 @@ void BM_diagnostic_test(benchmark::State& state) {
 }
 BENCHMARK(BM_diagnostic_test);
 
-int main(int argc, char** argv) {
+
+void BM_diagnostic_test_keep_running(benchmark::State& state) {
+  static bool called_once = false;  // probe pause/resume on the first run only
+
+  if (called_once == false) try_invalid_pause_resume(state);  // before the loop
+
+  while (state.KeepRunning()) {
+    benchmark::DoNotOptimize(state.iterations());
+  }
+
+  if (called_once == false) try_invalid_pause_resume(state);  // after the loop
+
+  called_once = true;
+}
+BENCHMARK(BM_diagnostic_test_keep_running);
+
+int main(int argc, char* argv[]) {
   benchmark::internal::GetAbortHandler() = &TestHandler;
   benchmark::Initialize(&argc, argv);
   benchmark::RunSpecifiedBenchmarks();
diff --git a/test/display_aggregates_only_test.cc b/test/display_aggregates_only_test.cc
new file mode 100644
index 0000000..3c36d3f
--- /dev/null
+++ b/test/display_aggregates_only_test.cc
@@ -0,0 +1,43 @@
+
+#undef NDEBUG
+#include <cstdio>
+#include <string>
+
+#include "benchmark/benchmark.h"
+#include "output_test.h"
+
+// Ok this test is super ugly. We want to check what happens with the file
+// reporter in the presence of DisplayAggregatesOnly().
+// We do not care about console output, the normal tests check that already.
+
+void BM_SummaryRepeat(benchmark::State& state) {
+  for (auto _ : state) {
+  }
+}
+BENCHMARK(BM_SummaryRepeat)->Repetitions(3)->DisplayAggregatesOnly();
+
+int main(int argc, char* argv[]) {
+  const std::string output = GetFileReporterOutput(argc, argv);
+
+  if (SubstrCnt(output, "\"name\": \"BM_SummaryRepeat/repeats:3") != 6 ||
+      SubstrCnt(output, "\"name\": \"BM_SummaryRepeat/repeats:3\"") != 3 ||
+      SubstrCnt(output, "\"name\": \"BM_SummaryRepeat/repeats:3_mean\"") != 1 ||
+      SubstrCnt(output, "\"name\": \"BM_SummaryRepeat/repeats:3_median\"") !=
+          1 ||
+      SubstrCnt(output, "\"name\": \"BM_SummaryRepeat/repeats:3_stddev\"") !=
+          1) {
+    std::cout << "Precondition mismatch. Expected to only find 6 "
+                 "occurrences of \"BM_SummaryRepeat/repeats:3\" substring:\n"
+                 "\"name\": \"BM_SummaryRepeat/repeats:3\", "
+                 "\"name\": \"BM_SummaryRepeat/repeats:3\", "
+                 "\"name\": \"BM_SummaryRepeat/repeats:3\", "
+                 "\"name\": \"BM_SummaryRepeat/repeats:3_mean\", "
+                 "\"name\": \"BM_SummaryRepeat/repeats:3_median\", "
+                 "\"name\": \"BM_SummaryRepeat/repeats:3_stddev\"\nThe entire "
+                 "output:\n";
+    std::cout << output;
+    return 1;
+  }
+
+  return 0;
+}
diff --git a/test/donotoptimize_assembly_test.cc b/test/donotoptimize_assembly_test.cc
new file mode 100644
index 0000000..d4b0bab
--- /dev/null
+++ b/test/donotoptimize_assembly_test.cc
@@ -0,0 +1,163 @@
+#include <benchmark/benchmark.h>
+
+#ifdef __clang__
+#pragma clang diagnostic ignored "-Wreturn-type"
+#endif
+
+extern "C" {
+
+extern int ExternInt;
+extern int ExternInt2;
+extern int ExternInt3;
+
+inline int Add42(int x) { return x + 42; }
+
+struct NotTriviallyCopyable {
+  NotTriviallyCopyable();
+  explicit NotTriviallyCopyable(int x) : value(x) {}
+  NotTriviallyCopyable(NotTriviallyCopyable const&);
+  int value;
+};
+
+struct Large {
+  int value;
+  int data[2];
+};
+
+}
+// CHECK-LABEL: test_with_rvalue:
+extern "C" void test_with_rvalue() {
+  benchmark::DoNotOptimize(Add42(0));
+  // CHECK: movl $42, %eax
+  // CHECK: ret
+}
+
+// CHECK-LABEL: test_with_large_rvalue:
+extern "C" void test_with_large_rvalue() {
+  benchmark::DoNotOptimize(Large{ExternInt, {ExternInt, ExternInt}});
+  // CHECK: ExternInt(%rip)
+  // CHECK: movl %eax, -{{[0-9]+}}(%[[REG:[a-z]+]])
+  // CHECK: movl %eax, -{{[0-9]+}}(%[[REG]])
+  // CHECK: movl %eax, -{{[0-9]+}}(%[[REG]])
+  // CHECK: ret
+}
+
+// CHECK-LABEL: test_with_non_trivial_rvalue:
+extern "C" void test_with_non_trivial_rvalue() {
+  benchmark::DoNotOptimize(NotTriviallyCopyable(ExternInt));
+  // CHECK: mov{{l|q}} ExternInt(%rip)
+  // CHECK: ret
+}
+
+// CHECK-LABEL: test_with_lvalue:
+extern "C" void test_with_lvalue() {
+  int x = 101;
+  benchmark::DoNotOptimize(x);
+  // CHECK-GNU: movl $101, %eax
+  // CHECK-CLANG: movl $101, -{{[0-9]+}}(%[[REG:[a-z]+]])
+  // CHECK: ret
+}
+
+// CHECK-LABEL: test_with_large_lvalue:
+extern "C" void test_with_large_lvalue() {
+  Large L{ExternInt, {ExternInt, ExternInt}};
+  benchmark::DoNotOptimize(L);
+  // CHECK: ExternInt(%rip)
+  // CHECK: movl %eax, -{{[0-9]+}}(%[[REG:[a-z]+]])
+  // CHECK: movl %eax, -{{[0-9]+}}(%[[REG]])
+  // CHECK: movl %eax, -{{[0-9]+}}(%[[REG]])
+  // CHECK: ret
+}
+
+// CHECK-LABEL: test_with_non_trivial_lvalue:
+extern "C" void test_with_non_trivial_lvalue() {
+  NotTriviallyCopyable NTC(ExternInt);
+  benchmark::DoNotOptimize(NTC);
+  // CHECK: ExternInt(%rip)
+  // CHECK: movl %eax, -{{[0-9]+}}(%[[REG:[a-z]+]])
+  // CHECK: ret
+}
+
+// CHECK-LABEL: test_with_const_lvalue:
+extern "C" void test_with_const_lvalue() {
+  const int x = 123;
+  benchmark::DoNotOptimize(x);
+  // CHECK: movl $123, %eax
+  // CHECK: ret
+}
+
+// CHECK-LABEL: test_with_large_const_lvalue:
+extern "C" void test_with_large_const_lvalue() {
+  const Large L{ExternInt, {ExternInt, ExternInt}};
+  benchmark::DoNotOptimize(L);
+  // CHECK: ExternInt(%rip)
+  // CHECK: movl %eax, -{{[0-9]+}}(%[[REG:[a-z]+]])
+  // CHECK: movl %eax, -{{[0-9]+}}(%[[REG]])
+  // CHECK: movl %eax, -{{[0-9]+}}(%[[REG]])
+  // CHECK: ret
+}
+
+// CHECK-LABEL: test_with_non_trivial_const_lvalue:
+extern "C" void test_with_non_trivial_const_lvalue() {
+  const NotTriviallyCopyable Obj(ExternInt);
+  benchmark::DoNotOptimize(Obj);
+  // CHECK: mov{{q|l}} ExternInt(%rip)
+  // CHECK: ret
+}
+
+// CHECK-LABEL: test_div_by_two:
+extern "C" int test_div_by_two(int input) {
+  int divisor = 2;
+  benchmark::DoNotOptimize(divisor);
+  return input / divisor;
+  // CHECK: movl $2, [[DEST:.*]]
+  // CHECK: idivl [[DEST]]
+  // CHECK: ret
+}
+
+// CHECK-LABEL: test_inc_integer:
+extern "C" int test_inc_integer() {
+  int x = 0;
+  for (int i=0; i < 5; ++i)
+    benchmark::DoNotOptimize(++x);
+  // CHECK: movl $1, [[DEST:.*]]
+  // CHECK: {{(addl \$1,|incl)}} [[DEST]]
+  // CHECK: {{(addl \$1,|incl)}} [[DEST]]
+  // CHECK: {{(addl \$1,|incl)}} [[DEST]]
+  // CHECK: {{(addl \$1,|incl)}} [[DEST]]
+  // CHECK-CLANG: movl [[DEST]], %eax
+  // CHECK: ret
+  return x;
+}
+
+// CHECK-LABEL: test_pointer_rvalue:
+extern "C" void test_pointer_rvalue() {
+  // CHECK: movl $42, [[DEST:.*]]
+  // CHECK: leaq [[DEST]], %rax
+  // CHECK-CLANG: movq %rax, -{{[0-9]+}}(%[[REG:[a-z]+]])
+  // CHECK: ret
+  int x = 42;
+  benchmark::DoNotOptimize(&x);
+}
+
+// CHECK-LABEL: test_pointer_const_lvalue:
+extern "C" void test_pointer_const_lvalue() {
+  // CHECK: movl $42, [[DEST:.*]]
+  // CHECK: leaq [[DEST]], %rax
+  // CHECK-CLANG: movq %rax, -{{[0-9]+}}(%[[REG:[a-z]+]])
+  // CHECK: ret
+  int x = 42;
+  int * const xp = &x;
+  benchmark::DoNotOptimize(xp);
+}
+
+// CHECK-LABEL: test_pointer_lvalue:
+extern "C" void test_pointer_lvalue() {
+  // CHECK: movl $42, [[DEST:.*]]
+  // CHECK: leaq [[DEST]], %rax
+  // CHECK-CLANG: movq %rax, -{{[0-9]+}}(%[[REG:[a-z]+]])
+  // CHECK: ret
+  int x = 42;
+  int *xp = &x;
+  benchmark::DoNotOptimize(xp);
+}
diff --git a/test/donotoptimize_test.cc b/test/donotoptimize_test.cc
index e4453fb..2ce92d1 100644
--- a/test/donotoptimize_test.cc
+++ b/test/donotoptimize_test.cc
@@ -4,24 +4,37 @@
 
 namespace {
 #if defined(__GNUC__)
-  std::uint64_t double_up(const std::uint64_t x) __attribute__ ((const));
+std::uint64_t double_up(const std::uint64_t x) __attribute__((const));
 #endif
-  std::uint64_t double_up(const std::uint64_t x) {
-    return x * 2;
-  }
+std::uint64_t double_up(const std::uint64_t x) { return x * 2; }
 }
 
-int main(int, char*[]) {
+// Using DoNotOptimize on types like BitRef seems to cause a lot of problems
+// with the inline assembly on both GCC and Clang.
+struct BitRef {
+  int index;
+  unsigned char &byte;
+
+public:
+  static BitRef Make() {
+    static unsigned char arr[2] = {};
+    BitRef b(1, arr[0]);
+    return b;
+  }
+private:
+  BitRef(int i, unsigned char& b) : index(i), byte(b) {}
+};
 
+int main(int, char*[]) {
   // this test verifies compilation of DoNotOptimize() for some types
 
-  char buffer8[8];
+  char buffer8[8] = "";
   benchmark::DoNotOptimize(buffer8);
 
-  char buffer20[20];
+  char buffer20[20] = "";
   benchmark::DoNotOptimize(buffer20);
 
-  char buffer1024[1024];
+  char buffer1024[1024] = "";
   benchmark::DoNotOptimize(buffer1024);
   benchmark::DoNotOptimize(&buffer1024[0]);
 
@@ -32,5 +45,8 @@ int main(int, char*[]) {
 
   benchmark::DoNotOptimize(double_up(x));
 
-  return 0;
+  // These tests are to ensure DoNotOptimize compiles for types like BitRef.
+  benchmark::DoNotOptimize(BitRef::Make());
+  BitRef lval = BitRef::Make();
+  benchmark::DoNotOptimize(lval);
 }
diff --git a/test/filter_test.cc b/test/filter_test.cc
index 0ba4071..0e27065 100644
--- a/test/filter_test.cc
+++ b/test/filter_test.cc
@@ -27,9 +27,7 @@ class TestReporter : public benchmark::ConsoleReporter {
 
   virtual ~TestReporter() {}
 
-  size_t GetCount() const {
-    return count_;
-  }
+  size_t GetCount() const { return count_; }
 
  private:
   mutable size_t count_;
@@ -37,46 +35,47 @@ class TestReporter : public benchmark::ConsoleReporter {
 
 }  // end namespace
 
-
 static void NoPrefix(benchmark::State& state) {
-  while (state.KeepRunning()) {}
+  for (auto _ : state) {
+  }
 }
 BENCHMARK(NoPrefix);
 
 static void BM_Foo(benchmark::State& state) {
-  while (state.KeepRunning()) {}
+  for (auto _ : state) {
+  }
 }
 BENCHMARK(BM_Foo);
 
-
 static void BM_Bar(benchmark::State& state) {
-  while (state.KeepRunning()) {}
+  for (auto _ : state) {
+  }
 }
 BENCHMARK(BM_Bar);
 
-
 static void BM_FooBar(benchmark::State& state) {
-  while (state.KeepRunning()) {}
+  for (auto _ : state) {
+  }
 }
 BENCHMARK(BM_FooBar);
 
-
 static void BM_FooBa(benchmark::State& state) {
-  while (state.KeepRunning()) {}
+  for (auto _ : state) {
+  }
 }
 BENCHMARK(BM_FooBa);
 
-
-
-int main(int argc, char** argv) {
+int main(int argc, char **argv) {
   bool list_only = false;
-  for (int i=0; i < argc; ++i)
-    list_only |= std::string(argv[i]).find("--benchmark_list_tests") != std::string::npos;
+  for (int i = 0; i < argc; ++i)
+    list_only |= std::string(argv[i]).find("--benchmark_list_tests") !=
+                 std::string::npos;
 
   benchmark::Initialize(&argc, argv);
 
   TestReporter test_reporter;
-  const size_t returned_count = benchmark::RunSpecifiedBenchmarks(&test_reporter);
+  const size_t returned_count =
+      benchmark::RunSpecifiedBenchmarks(&test_reporter);
 
   if (argc == 2) {
     // Make sure we ran all of the tests
diff --git a/test/fixture_test.cc b/test/fixture_test.cc
index bf800fd..a331c7d 100644
--- a/test/fixture_test.cc
+++ b/test/fixture_test.cc
@@ -4,7 +4,9 @@
 #include <cassert>
 #include <memory>
 
-class MyFixture : public ::benchmark::Fixture {
+#define FIXTURE_BECHMARK_NAME MyFixture
+
+class FIXTURE_BECHMARK_NAME : public ::benchmark::Fixture {
  public:
   void SetUp(const ::benchmark::State& state) {
     if (state.thread_index == 0) {
@@ -20,33 +22,30 @@ class MyFixture : public ::benchmark::Fixture {
     }
   }
 
-  ~MyFixture() {
-    assert(data == nullptr);
-  }
+  ~FIXTURE_BECHMARK_NAME() { assert(data == nullptr); }
 
   std::unique_ptr<int> data;
 };
 
-
-BENCHMARK_F(MyFixture, Foo)(benchmark::State& st) {
+BENCHMARK_F(FIXTURE_BECHMARK_NAME, Foo)(benchmark::State &st) {
   assert(data.get() != nullptr);
   assert(*data == 42);
-  while (st.KeepRunning()) {
+  for (auto _ : st) {
   }
 }
 
-BENCHMARK_DEFINE_F(MyFixture, Bar)(benchmark::State& st) {
+BENCHMARK_DEFINE_F(FIXTURE_BECHMARK_NAME, Bar)(benchmark::State& st) {
   if (st.thread_index == 0) {
     assert(data.get() != nullptr);
     assert(*data == 42);
   }
-  while (st.KeepRunning()) {
+  for (auto _ : st) {
     assert(data.get() != nullptr);
     assert(*data == 42);
   }
-  st.SetItemsProcessed(st.range_x());
+  st.SetItemsProcessed(st.range(0));
 }
-BENCHMARK_REGISTER_F(MyFixture, Bar)->Arg(42);
-BENCHMARK_REGISTER_F(MyFixture, Bar)->Arg(42)->ThreadPerCpu();
+BENCHMARK_REGISTER_F(FIXTURE_BECHMARK_NAME, Bar)->Arg(42);
+BENCHMARK_REGISTER_F(FIXTURE_BECHMARK_NAME, Bar)->Arg(42)->ThreadPerCpu();
 
-BENCHMARK_MAIN()
+BENCHMARK_MAIN();
diff --git a/test/internal_threading_test.cc b/test/internal_threading_test.cc
new file mode 100644
index 0000000..039d7c1
--- /dev/null
+++ b/test/internal_threading_test.cc
@@ -0,0 +1,184 @@
+
+#undef NDEBUG
+
+#include <chrono>
+#include <thread>
+#include "../src/timers.h"
+#include "benchmark/benchmark.h"
+#include "output_test.h"
+
+static const std::chrono::duration<double, std::milli> time_frame(50);
+static const double time_frame_in_sec(
+    std::chrono::duration_cast<std::chrono::duration<double, std::ratio<1, 1>>>(
+        time_frame)
+        .count());
+
+void MyBusySpinwait() {
+  const auto start = benchmark::ChronoClockNow();
+
+  while (true) {
+    const auto now = benchmark::ChronoClockNow();
+    const auto elapsed = now - start;
+
+    if (std::chrono::duration<double, std::chrono::seconds::period>(elapsed) >=
+        time_frame)
+      return;
+  }
+}
+
+// ========================================================================= //
+// --------------------------- TEST CASES BEGIN ---------------------------- //
+// ========================================================================= //
+
+// ========================================================================= //
+// BM_MainThread
+
+void BM_MainThread(benchmark::State& state) {
+  for (auto _ : state) {
+    MyBusySpinwait();
+    state.SetIterationTime(time_frame_in_sec);
+  }
+  state.counters["invtime"] =
+      benchmark::Counter{1, benchmark::Counter::kIsRate};
+}
+
+BENCHMARK(BM_MainThread)->Iterations(1)->Threads(1);
+BENCHMARK(BM_MainThread)->Iterations(1)->Threads(1)->UseRealTime();
+BENCHMARK(BM_MainThread)->Iterations(1)->Threads(1)->UseManualTime();
+BENCHMARK(BM_MainThread)->Iterations(1)->Threads(1)->MeasureProcessCPUTime();
+BENCHMARK(BM_MainThread)
+    ->Iterations(1)
+    ->Threads(1)
+    ->MeasureProcessCPUTime()
+    ->UseRealTime();
+BENCHMARK(BM_MainThread)
+    ->Iterations(1)
+    ->Threads(1)
+    ->MeasureProcessCPUTime()
+    ->UseManualTime();
+
+BENCHMARK(BM_MainThread)->Iterations(1)->Threads(2);
+BENCHMARK(BM_MainThread)->Iterations(1)->Threads(2)->UseRealTime();
+BENCHMARK(BM_MainThread)->Iterations(1)->Threads(2)->UseManualTime();
+BENCHMARK(BM_MainThread)->Iterations(1)->Threads(2)->MeasureProcessCPUTime();
+BENCHMARK(BM_MainThread)
+    ->Iterations(1)
+    ->Threads(2)
+    ->MeasureProcessCPUTime()
+    ->UseRealTime();
+BENCHMARK(BM_MainThread)
+    ->Iterations(1)
+    ->Threads(2)
+    ->MeasureProcessCPUTime()
+    ->UseManualTime();
+
+// ========================================================================= //
+// BM_WorkerThread
+
+void BM_WorkerThread(benchmark::State& state) {
+  for (auto _ : state) {
+    std::thread Worker(&MyBusySpinwait);
+    Worker.join();
+    state.SetIterationTime(time_frame_in_sec);
+  }
+  state.counters["invtime"] =
+      benchmark::Counter{1, benchmark::Counter::kIsRate};
+}
+
+BENCHMARK(BM_WorkerThread)->Iterations(1)->Threads(1);
+BENCHMARK(BM_WorkerThread)->Iterations(1)->Threads(1)->UseRealTime();
+BENCHMARK(BM_WorkerThread)->Iterations(1)->Threads(1)->UseManualTime();
+BENCHMARK(BM_WorkerThread)->Iterations(1)->Threads(1)->MeasureProcessCPUTime();
+BENCHMARK(BM_WorkerThread)
+    ->Iterations(1)
+    ->Threads(1)
+    ->MeasureProcessCPUTime()
+    ->UseRealTime();
+BENCHMARK(BM_WorkerThread)
+    ->Iterations(1)
+    ->Threads(1)
+    ->MeasureProcessCPUTime()
+    ->UseManualTime();
+
+BENCHMARK(BM_WorkerThread)->Iterations(1)->Threads(2);
+BENCHMARK(BM_WorkerThread)->Iterations(1)->Threads(2)->UseRealTime();
+BENCHMARK(BM_WorkerThread)->Iterations(1)->Threads(2)->UseManualTime();
+BENCHMARK(BM_WorkerThread)->Iterations(1)->Threads(2)->MeasureProcessCPUTime();
+BENCHMARK(BM_WorkerThread)
+    ->Iterations(1)
+    ->Threads(2)
+    ->MeasureProcessCPUTime()
+    ->UseRealTime();
+BENCHMARK(BM_WorkerThread)
+    ->Iterations(1)
+    ->Threads(2)
+    ->MeasureProcessCPUTime()
+    ->UseManualTime();
+
+// ========================================================================= //
+// BM_MainThreadAndWorkerThread
+
+void BM_MainThreadAndWorkerThread(benchmark::State& state) {
+  for (auto _ : state) {
+    std::thread Worker(&MyBusySpinwait);
+    MyBusySpinwait();
+    Worker.join();
+    state.SetIterationTime(time_frame_in_sec);
+  }
+  state.counters["invtime"] =
+      benchmark::Counter{1, benchmark::Counter::kIsRate};
+}
+
+BENCHMARK(BM_MainThreadAndWorkerThread)->Iterations(1)->Threads(1);
+BENCHMARK(BM_MainThreadAndWorkerThread)
+    ->Iterations(1)
+    ->Threads(1)
+    ->UseRealTime();
+BENCHMARK(BM_MainThreadAndWorkerThread)
+    ->Iterations(1)
+    ->Threads(1)
+    ->UseManualTime();
+BENCHMARK(BM_MainThreadAndWorkerThread)
+    ->Iterations(1)
+    ->Threads(1)
+    ->MeasureProcessCPUTime();
+BENCHMARK(BM_MainThreadAndWorkerThread)
+    ->Iterations(1)
+    ->Threads(1)
+    ->MeasureProcessCPUTime()
+    ->UseRealTime();
+BENCHMARK(BM_MainThreadAndWorkerThread)
+    ->Iterations(1)
+    ->Threads(1)
+    ->MeasureProcessCPUTime()
+    ->UseManualTime();
+
+BENCHMARK(BM_MainThreadAndWorkerThread)->Iterations(1)->Threads(2);
+BENCHMARK(BM_MainThreadAndWorkerThread)
+    ->Iterations(1)
+    ->Threads(2)
+    ->UseRealTime();
+BENCHMARK(BM_MainThreadAndWorkerThread)
+    ->Iterations(1)
+    ->Threads(2)
+    ->UseManualTime();
+BENCHMARK(BM_MainThreadAndWorkerThread)
+    ->Iterations(1)
+    ->Threads(2)
+    ->MeasureProcessCPUTime();
+BENCHMARK(BM_MainThreadAndWorkerThread)
+    ->Iterations(1)
+    ->Threads(2)
+    ->MeasureProcessCPUTime()
+    ->UseRealTime();
+BENCHMARK(BM_MainThreadAndWorkerThread)
+    ->Iterations(1)
+    ->Threads(2)
+    ->MeasureProcessCPUTime()
+    ->UseManualTime();
+
+// ========================================================================= //
+// ---------------------------- TEST CASES END ----------------------------- //
+// ========================================================================= //
+
+int main(int argc, char* argv[]) { RunOutputTests(argc, argv); }
diff --git a/test/link_main_test.cc b/test/link_main_test.cc
new file mode 100644
index 0000000..241ad5c
--- /dev/null
+++ b/test/link_main_test.cc
@@ -0,0 +1,8 @@
+#include "benchmark/benchmark.h"
+
+void BM_empty(benchmark::State& state) {
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(state.iterations());
+  }
+}
+BENCHMARK(BM_empty);
diff --git a/test/map_test.cc b/test/map_test.cc
index 5eccf8d..dbf7982 100644
--- a/test/map_test.cc
+++ b/test/map_test.cc
@@ -8,7 +8,7 @@ namespace {
 std::map<int, int> ConstructRandomMap(int size) {
   std::map<int, int> m;
   for (int i = 0; i < size; ++i) {
-    m.insert(std::make_pair(rand() % size, rand() % size));
+    m.insert(std::make_pair(std::rand() % size, std::rand() % size));
   }
   return m;
 }
@@ -17,13 +17,14 @@ std::map<int, int> ConstructRandomMap(int size) {
 
 // Basic version.
 static void BM_MapLookup(benchmark::State& state) {
-  const int size = state.range_x();
-  while (state.KeepRunning()) {
+  const int size = static_cast<int>(state.range(0));
+  std::map<int, int> m;
+  for (auto _ : state) {
     state.PauseTiming();
-    std::map<int, int> m = ConstructRandomMap(size);
+    m = ConstructRandomMap(size);
     state.ResumeTiming();
     for (int i = 0; i < size; ++i) {
-      benchmark::DoNotOptimize(m.find(rand() % size));
+      benchmark::DoNotOptimize(m.find(std::rand() % size));
     }
   }
   state.SetItemsProcessed(state.iterations() * size);
@@ -34,25 +35,23 @@ BENCHMARK(BM_MapLookup)->Range(1 << 3, 1 << 12);
 class MapFixture : public ::benchmark::Fixture {
  public:
   void SetUp(const ::benchmark::State& st) {
-    m = ConstructRandomMap(st.range_x());
+    m = ConstructRandomMap(static_cast<int>(st.range(0)));
   }
 
-  void TearDown(const ::benchmark::State&) {
-    m.clear();
-  }
+  void TearDown(const ::benchmark::State&) { m.clear(); }
 
   std::map<int, int> m;
 };
 
 BENCHMARK_DEFINE_F(MapFixture, Lookup)(benchmark::State& state) {
-  const int size = state.range_x();
-  while (state.KeepRunning()) {
+  const int size = static_cast<int>(state.range(0));
+  for (auto _ : state) {
     for (int i = 0; i < size; ++i) {
-      benchmark::DoNotOptimize(m.find(rand() % size));
+      benchmark::DoNotOptimize(m.find(std::rand() % size));
     }
   }
   state.SetItemsProcessed(state.iterations() * size);
 }
-BENCHMARK_REGISTER_F(MapFixture, Lookup)->Range(1<<3, 1<<12);
+BENCHMARK_REGISTER_F(MapFixture, Lookup)->Range(1 << 3, 1 << 12);
 
-BENCHMARK_MAIN()
+BENCHMARK_MAIN();
diff --git a/test/memory_manager_test.cc b/test/memory_manager_test.cc
new file mode 100644
index 0000000..90bed16
--- /dev/null
+++ b/test/memory_manager_test.cc
@@ -0,0 +1,44 @@
+#include <memory>
+
+#include "../src/check.h"
+#include "benchmark/benchmark.h"
+#include "output_test.h"
+
+class TestMemoryManager : public benchmark::MemoryManager {
+  void Start() {}
+  void Stop(Result* result) {
+    result->num_allocs = 42;
+    result->max_bytes_used = 42000;
+  }
+};
+
+void BM_empty(benchmark::State& state) {
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(state.iterations());
+  }
+}
+BENCHMARK(BM_empty);
+
+ADD_CASES(TC_ConsoleOut, {{"^BM_empty %console_report$"}});
+ADD_CASES(TC_JSONOut, {{"\"name\": \"BM_empty\",$"},
+                       {"\"run_name\": \"BM_empty\",$", MR_Next},
+                       {"\"run_type\": \"iteration\",$", MR_Next},
+                       {"\"repetitions\": 0,$", MR_Next},
+                       {"\"repetition_index\": 0,$", MR_Next},
+                       {"\"threads\": 1,$", MR_Next},
+                       {"\"iterations\": %int,$", MR_Next},
+                       {"\"real_time\": %float,$", MR_Next},
+                       {"\"cpu_time\": %float,$", MR_Next},
+                       {"\"time_unit\": \"ns\",$", MR_Next},
+                       {"\"allocs_per_iter\": %float,$", MR_Next},
+                       {"\"max_bytes_used\": 42000$", MR_Next},
+                       {"}", MR_Next}});
+ADD_CASES(TC_CSVOut, {{"^\"BM_empty\",%csv_report$"}});
+
+int main(int argc, char* argv[]) {
+  std::unique_ptr<benchmark::MemoryManager> mm(new TestMemoryManager());
+
+  benchmark::RegisterMemoryManager(mm.get());
+  RunOutputTests(argc, argv);
+  benchmark::RegisterMemoryManager(nullptr);
+}
diff --git a/test/multiple_ranges_test.cc b/test/multiple_ranges_test.cc
new file mode 100644
index 0000000..b25f40e
--- /dev/null
+++ b/test/multiple_ranges_test.cc
@@ -0,0 +1,96 @@
+#include "benchmark/benchmark.h"
+
+#include <cassert>
+#include <iostream>
+#include <set>
+#include <vector>
+
+class MultipleRangesFixture : public ::benchmark::Fixture {
+ public:
+  MultipleRangesFixture()
+      : expectedValues({{1, 3, 5},
+                        {1, 3, 8},
+                        {1, 3, 15},
+                        {2, 3, 5},
+                        {2, 3, 8},
+                        {2, 3, 15},
+                        {1, 4, 5},
+                        {1, 4, 8},
+                        {1, 4, 15},
+                        {2, 4, 5},
+                        {2, 4, 8},
+                        {2, 4, 15},
+                        {1, 7, 5},
+                        {1, 7, 8},
+                        {1, 7, 15},
+                        {2, 7, 5},
+                        {2, 7, 8},
+                        {2, 7, 15},
+                        {7, 6, 3}}) {}
+
+  void SetUp(const ::benchmark::State& state) {
+    std::vector<int64_t> ranges = {state.range(0), state.range(1),
+                                   state.range(2)};
+
+    assert(expectedValues.find(ranges) != expectedValues.end());
+
+    actualValues.insert(ranges);
+  }
+
+  // NOTE: This is not TearDown as we want to check after _all_ runs are
+  // complete.
+  virtual ~MultipleRangesFixture() {
+    if (actualValues != expectedValues) {
+      std::cout << "EXPECTED\n";
+      for (auto v : expectedValues) {
+        std::cout << "{";
+        for (int64_t iv : v) {
+          std::cout << iv << ", ";
+        }
+        std::cout << "}\n";
+      }
+      std::cout << "ACTUAL\n";
+      for (auto v : actualValues) {
+        std::cout << "{";
+        for (int64_t iv : v) {
+          std::cout << iv << ", ";
+        }
+        std::cout << "}\n";
+      }
+    }
+  }
+
+  std::set<std::vector<int64_t>> expectedValues;
+  std::set<std::vector<int64_t>> actualValues;
+};
+
+BENCHMARK_DEFINE_F(MultipleRangesFixture, Empty)(benchmark::State& state) {
+  for (auto _ : state) {
+    int64_t product = state.range(0) * state.range(1) * state.range(2);
+    for (int64_t x = 0; x < product; x++) {
+      benchmark::DoNotOptimize(x);
+    }
+  }
+}
+
+BENCHMARK_REGISTER_F(MultipleRangesFixture, Empty)
+    ->RangeMultiplier(2)
+    ->Ranges({{1, 2}, {3, 7}, {5, 15}})
+    ->Args({7, 6, 3});
+
+void BM_CheckDefaultArgument(benchmark::State& state) {
+  // Test that the 'range()' without an argument is the same as 'range(0)'.
+  assert(state.range() == state.range(0));
+  assert(state.range() != state.range(1));
+  for (auto _ : state) {
+  }
+}
+BENCHMARK(BM_CheckDefaultArgument)->Ranges({{1, 5}, {6, 10}});
+
+static void BM_MultipleRanges(benchmark::State& st) {
+  for (auto _ : st) {
+  }
+}
+BENCHMARK(BM_MultipleRanges)->Ranges({{5, 5}, {6, 6}});
+
+BENCHMARK_MAIN();
diff --git a/test/options_test.cc b/test/options_test.cc
index 78cedae..9f9a786 100644
--- a/test/options_test.cc
+++ b/test/options_test.cc
@@ -1,19 +1,22 @@
-#include "benchmark/benchmark_api.h"
-
+#include "benchmark/benchmark.h"
 #include <chrono>
 #include <thread>
 
+#if defined(NDEBUG)
+#undef NDEBUG
+#endif
+#include <cassert>
+
 void BM_basic(benchmark::State& state) {
-  while (state.KeepRunning()) {
+  for (auto _ : state) {
   }
 }
 
 void BM_basic_slow(benchmark::State& state) {
-  std::chrono::milliseconds sleep_duration(state.range_x());
-  while (state.KeepRunning()) {
+  std::chrono::milliseconds sleep_duration(state.range(0));
+  for (auto _ : state) {
     std::this_thread::sleep_for(
-      std::chrono::duration_cast<std::chrono::nanoseconds>(sleep_duration)
-      );
+        std::chrono::duration_cast<std::chrono::nanoseconds>(sleep_duration));
   }
 }
 
@@ -22,16 +25,27 @@ BENCHMARK(BM_basic)->Arg(42);
 BENCHMARK(BM_basic_slow)->Arg(10)->Unit(benchmark::kNanosecond);
 BENCHMARK(BM_basic_slow)->Arg(100)->Unit(benchmark::kMicrosecond);
 BENCHMARK(BM_basic_slow)->Arg(1000)->Unit(benchmark::kMillisecond);
+BENCHMARK(BM_basic_slow)->Arg(1000)->Unit(benchmark::kSecond);
 BENCHMARK(BM_basic)->Range(1, 8);
 BENCHMARK(BM_basic)->RangeMultiplier(2)->Range(1, 8);
 BENCHMARK(BM_basic)->DenseRange(10, 15);
-BENCHMARK(BM_basic)->ArgPair(42, 42);
-BENCHMARK(BM_basic)->RangePair(64, 512, 64, 512);
+BENCHMARK(BM_basic)->Args({42, 42});
+BENCHMARK(BM_basic)->Ranges({{64, 512}, {64, 512}});
 BENCHMARK(BM_basic)->MinTime(0.7);
 BENCHMARK(BM_basic)->UseRealTime();
 BENCHMARK(BM_basic)->ThreadRange(2, 4);
 BENCHMARK(BM_basic)->ThreadPerCpu();
 BENCHMARK(BM_basic)->Repetitions(3);
+BENCHMARK(BM_basic)
+    ->RangeMultiplier(std::numeric_limits<int>::max())
+    ->Range(std::numeric_limits<int64_t>::min(),
+            std::numeric_limits<int64_t>::max());
+
+// Negative ranges
+BENCHMARK(BM_basic)->Range(-64, -1);
+BENCHMARK(BM_basic)->RangeMultiplier(4)->Range(-8, 8);
+BENCHMARK(BM_basic)->DenseRange(-2, 2, 1);
+BENCHMARK(BM_basic)->Ranges({{-64, 1}, {-8, -1}});
 
 void CustomArgs(benchmark::internal::Benchmark* b) {
   for (int i = 0; i < 10; ++i) {
@@ -41,4 +55,22 @@ void CustomArgs(benchmark::internal::Benchmark* b) {
 
 BENCHMARK(BM_basic)->Apply(CustomArgs);
 
-BENCHMARK_MAIN()
+void BM_explicit_iteration_count(benchmark::State& state) {
+  // Test that benchmarks specified with an explicit iteration count are
+  // only run once.
+  static bool invoked_before = false;
+  assert(!invoked_before);
+  invoked_before = true;
+
+  // Test that the requested iteration count is respected.
+  assert(state.max_iterations == 42);
+  size_t actual_iterations = 0;
+  for (auto _ : state)
+    ++actual_iterations;
+  assert(state.iterations() == state.max_iterations);
+  assert(state.iterations() == 42);
+  assert(actual_iterations == 42);  // the loop body itself ran 42 times
+}
+BENCHMARK(BM_explicit_iteration_count)->Iterations(42);
+
+BENCHMARK_MAIN();
diff --git a/test/output_test.h b/test/output_test.h
new file mode 100644
index 0000000..9385761
--- /dev/null
+++ b/test/output_test.h
@@ -0,0 +1,213 @@
+#ifndef TEST_OUTPUT_TEST_H
+#define TEST_OUTPUT_TEST_H
+
+#undef NDEBUG
+#include <functional>
+#include <initializer_list>
+#include <memory>
+#include <sstream>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "../src/re.h"
+#include "benchmark/benchmark.h"
+
+#define CONCAT2(x, y) x##y
+#define CONCAT(x, y) CONCAT2(x, y)
+
+#define ADD_CASES(...) int CONCAT(dummy, __LINE__) = ::AddCases(__VA_ARGS__)
+
+#define SET_SUBSTITUTIONS(...) \
+  int CONCAT(dummy, __LINE__) = ::SetSubstitutions(__VA_ARGS__)
+
+enum MatchRules {
+  MR_Default,  // Skip non-matching lines until a match is found.
+  MR_Next,     // Match must occur on the next line.
+  MR_Not  // No line between the current position and the next match matches
+          // the regex
+};
+
+struct TestCase {
+  TestCase(std::string re, int rule = MR_Default);
+
+  std::string regex_str;
+  int match_rule;
+  std::string substituted_regex;
+  std::shared_ptr<benchmark::Regex> regex;
+};
+
+enum TestCaseID {
+  TC_ConsoleOut,
+  TC_ConsoleErr,
+  TC_JSONOut,
+  TC_JSONErr,
+  TC_CSVOut,
+  TC_CSVErr,
+
+  TC_NumID  // PRIVATE
+};
+
+// Add a list of test cases to be run against the output specified by
+// 'ID'
+int AddCases(TestCaseID ID, std::initializer_list<TestCase> il);
+
+// Add or set a list of substitutions to be performed on constructed regex's
+// See 'output_test_helper.cc' for a list of default substitutions.
+int SetSubstitutions(
+    std::initializer_list<std::pair<std::string, std::string>> il);
+
+// Run all output tests.
+void RunOutputTests(int argc, char* argv[]);
+
+// Count the number of 'pat' substrings in the 'haystack' string.
+int SubstrCnt(const std::string& haystack, const std::string& pat);
+
+// Run registered benchmarks with file reporter enabled, and return the content
+// outputted by the file reporter.
+std::string GetFileReporterOutput(int argc, char* argv[]);
+
+// ========================================================================= //
+// ------------------------- Results checking ------------------------------ //
+// ========================================================================= //
+
+// Call this macro to register a benchmark for checking its results. This
+// should be all that's needed. It subscribes a function to check the (CSV)
+// results of a benchmark. This is done only after verifying that the output
+// strings are really as expected.
+// bm_name_pattern: a name or a regex pattern which will be matched against
+//                  all the benchmark names. Matching benchmarks
+//                  will be the subject of a call to checker_function
+// checker_function: should be of type ResultsCheckFn (see below)
+#define CHECK_BENCHMARK_RESULTS(bm_name_pattern, checker_function) \
+  size_t CONCAT(dummy, __LINE__) = AddChecker(bm_name_pattern, checker_function)
+
+struct Results;
+typedef std::function<void(Results const&)> ResultsCheckFn;
+
+size_t AddChecker(const char* bm_name_pattern, ResultsCheckFn fn);
+
+// Class holding the results of a benchmark.
+// It is passed in calls to checker functions.
+struct Results {
+  // the benchmark name
+  std::string name;
+  // the benchmark fields
+  std::map<std::string, std::string> values;
+
+  Results(const std::string& n) : name(n) {}
+
+  int NumThreads() const;
+
+  double NumIterations() const;
+
+  typedef enum { kCpuTime, kRealTime } BenchmarkTime;
+
+  // get cpu_time or real_time in seconds
+  double GetTime(BenchmarkTime which) const;
+
+  // get the real_time duration of the benchmark in seconds.
+  // it is better to use fuzzy float checks for this, as the float
+  // ASCII formatting is lossy.
+  double DurationRealTime() const {
+    return NumIterations() * GetTime(kRealTime);
+  }
+  // get the cpu_time duration of the benchmark in seconds
+  double DurationCPUTime() const {
+    return NumIterations() * GetTime(kCpuTime);
+  }
+
+  // get the string for a result by name, or nullptr if the name
+  // is not found
+  const std::string* Get(const char* entry_name) const {
+    auto it = values.find(entry_name);
+    if (it == values.end()) return nullptr;
+    return &it->second;
+  }
+
+  // get a result by name, parsed as a specific type.
+  // NOTE: for counters, use GetCounterAs instead.
+  template <class T>
+  T GetAs(const char* entry_name) const;
+
+  // counters are written as doubles, so they have to be read first
+  // as a double, and only then converted to the asked type.
+  template <class T>
+  T GetCounterAs(const char* entry_name) const {
+    double dval = GetAs<double>(entry_name);
+    T tval = static_cast<T>(dval);
+    return tval;
+  }
+};
+
+template <class T>
+T Results::GetAs(const char* entry_name) const {
+  auto* sv = Get(entry_name);
+  CHECK(sv != nullptr && !sv->empty());
+  std::stringstream ss;
+  ss << *sv;
+  T out;
+  ss >> out;
+  CHECK(!ss.fail());
+  return out;
+}
+
+//----------------------------------
+// Macros to help in result checking. Do not use them with arguments causing
+// side-effects.
+
+// clang-format off
+// Expands to CHECK_<relationship>(entry.getfn<var_type>(var_name), value) plus a streamed message.
+#define _CHECK_RESULT_VALUE(entry, getfn, var_type, var_name, relationship, value) \
+    CONCAT(CHECK_, relationship)                                        \
+    ((entry).getfn< var_type >(var_name), (value)) << "\n"              \
+    << __FILE__ << ":" << __LINE__ << ": " << (entry).name << ":\n"     \
+    << __FILE__ << ":" << __LINE__ << ": "                              \
+    << "expected (" << #var_type << ")" << (var_name)                   \
+    << "=" << (entry).getfn< var_type >(var_name)                       \
+    << " to be " #relationship " to " << (value) << "\n"
+
+// Check with tolerance. eps_factor is the tolerance window, interpreted
+// relative to value (e.g. 0.1 means 10% of value).
+#define _CHECK_FLOAT_RESULT_VALUE(entry, getfn, var_type, var_name, relationship, value, eps_factor) \
+    CONCAT(CHECK_FLOAT_, relationship)                                  \
+    ((entry).getfn< var_type >(var_name), (value), (eps_factor) * (value)) << "\n" \
+    << __FILE__ << ":" << __LINE__ << ": " << (entry).name << ":\n"     \
+    << __FILE__ << ":" << __LINE__ << ": "                              \
+    << "expected (" << #var_type << ")" << (var_name)                   \
+    << "=" << (entry).getfn< var_type >(var_name)                       \
+    << " to be " #relationship " to " << (value) << "\n"                \
+    << __FILE__ << ":" << __LINE__ << ": "                              \
+    << "with tolerance of " << (eps_factor) * (value)                   \
+    << " (" << (eps_factor)*100. << "%), "                              \
+    << "but delta was " << ((entry).getfn< var_type >(var_name) - (value)) \
+    << " (" << (((entry).getfn< var_type >(var_name) - (value))         \
+               /                                                        \
+               ((value) > 1.e-5 || (value) < -1.e-5 ? (value) : 1.e-5)*100.) \
+    << "%)"
+
+#define CHECK_RESULT_VALUE(entry, var_type, var_name, relationship, value) \
+    _CHECK_RESULT_VALUE(entry, GetAs, var_type, var_name, relationship, value)
+
+#define CHECK_COUNTER_VALUE(entry, var_type, var_name, relationship, value) \
+    _CHECK_RESULT_VALUE(entry, GetCounterAs, var_type, var_name, relationship, value)
+
+#define CHECK_FLOAT_RESULT_VALUE(entry, var_name, relationship, value, eps_factor) \
+    _CHECK_FLOAT_RESULT_VALUE(entry, GetAs, double, var_name, relationship, value, eps_factor)
+
+#define CHECK_FLOAT_COUNTER_VALUE(entry, var_name, relationship, value, eps_factor) \
+    _CHECK_FLOAT_RESULT_VALUE(entry, GetCounterAs, double, var_name, relationship, value, eps_factor)
+
+// clang-format on
+
+// ========================================================================= //
+// --------------------------- Misc Utilities ------------------------------ //
+// ========================================================================= //
+
+namespace {
+
+const char* const dec_re = "[0-9]*[.]?[0-9]+([eE][-+][0-9]+)?";
+
+}  //  end namespace
+
+#endif  // TEST_OUTPUT_TEST_H
diff --git a/test/output_test_helper.cc b/test/output_test_helper.cc
new file mode 100644
index 0000000..1aebc55
--- /dev/null
+++ b/test/output_test_helper.cc
@@ -0,0 +1,520 @@
+#include <cstdio>
+#include <cstring>
+#include <fstream>
+#include <iostream>
+#include <map>
+#include <memory>
+#include <random>
+#include <sstream>
+#include <streambuf>
+
+#include "../src/benchmark_api_internal.h"
+#include "../src/check.h"  // NOTE: check.h is for internal use only!
+#include "../src/re.h"     // NOTE: re.h is for internal use only
+#include "output_test.h"
+
+// ========================================================================= //
+// ------------------------------ Internals -------------------------------- //
+// ========================================================================= //
+namespace internal {
+namespace {
+
+using TestCaseList = std::vector<TestCase>;
+
+// Use a vector because the order elements are added matters during iteration.
+// std::map/unordered_map don't guarantee that.
+// For example:
+//  SetSubstitutions({{"%HelloWorld", "Hello"}, {"%Hello", "Hi"}});
+//     Substitute("%HelloWorld") // Always expands to Hello.
+using SubMap = std::vector<std::pair<std::string, std::string>>;
+
+TestCaseList& GetTestCaseList(TestCaseID ID) {
+  // Uses function-local statics to ensure initialization occurs
+  // before first use.
+  static TestCaseList lists[TC_NumID];
+  return lists[ID];
+}
+
+SubMap& GetSubstitutions() {
+  // Don't use 'dec_re' from header because it may not yet be initialized.
+  // clang-format off
+  static std::string safe_dec_re = "[0-9]*[.]?[0-9]+([eE][-+][0-9]+)?";
+  static std::string time_re = "([0-9]+[.])?[0-9]+";
+  static SubMap map = {
+      {"%float", "[0-9]*[.]?[0-9]+([eE][-+][0-9]+)?"},
+      // human-readable float
+      {"%hrfloat", "[0-9]*[.]?[0-9]+([eE][-+][0-9]+)?[kMGTPEZYmunpfazy]?"},
+      {"%int", "[ ]*[0-9]+"},
+      {" %s ", "[ ]+"},
+      {"%time", "[ ]*" + time_re + "[ ]+ns"},
+      {"%console_report", "[ ]*" + time_re + "[ ]+ns [ ]*" + time_re + "[ ]+ns [ ]*[0-9]+"},
+      {"%console_us_report", "[ ]*" + time_re + "[ ]+us [ ]*" + time_re + "[ ]+us [ ]*[0-9]+"},
+      {"%console_ms_report", "[ ]*" + time_re + "[ ]+ms [ ]*" + time_re + "[ ]+ms [ ]*[0-9]+"},
+      {"%console_s_report", "[ ]*" + time_re + "[ ]+s [ ]*" + time_re + "[ ]+s [ ]*[0-9]+"},
+      {"%console_time_only_report", "[ ]*" + time_re + "[ ]+ns [ ]*" + time_re + "[ ]+ns"},
+      {"%console_us_report", "[ ]*" + time_re + "[ ]+us [ ]*" + time_re + "[ ]+us [ ]*[0-9]+"},
+      {"%console_us_time_only_report", "[ ]*" + time_re + "[ ]+us [ ]*" + time_re + "[ ]+us"},
+      {"%csv_header",
+       "name,iterations,real_time,cpu_time,time_unit,bytes_per_second,"
+       "items_per_second,label,error_occurred,error_message"},
+      {"%csv_report", "[0-9]+," + safe_dec_re + "," + safe_dec_re + ",ns,,,,,"},
+      {"%csv_us_report", "[0-9]+," + safe_dec_re + "," + safe_dec_re + ",us,,,,,"},
+      {"%csv_ms_report", "[0-9]+," + safe_dec_re + "," + safe_dec_re + ",ms,,,,,"},
+      {"%csv_s_report", "[0-9]+," + safe_dec_re + "," + safe_dec_re + ",s,,,,,"},
+      {"%csv_bytes_report",
+       "[0-9]+," + safe_dec_re + "," + safe_dec_re + ",ns," + safe_dec_re + ",,,,"},
+      {"%csv_items_report",
+       "[0-9]+," + safe_dec_re + "," + safe_dec_re + ",ns,," + safe_dec_re + ",,,"},
+      {"%csv_bytes_items_report",
+       "[0-9]+," + safe_dec_re + "," + safe_dec_re + ",ns," + safe_dec_re +
+       "," + safe_dec_re + ",,,"},
+      {"%csv_label_report_begin", "[0-9]+," + safe_dec_re + "," + safe_dec_re + ",ns,,,"},
+      {"%csv_label_report_end", ",,"}};
+  // clang-format on
+  return map;
+}
+
+std::string PerformSubstitutions(std::string source) {
+  SubMap const& subs = GetSubstitutions();
+  using SizeT = std::string::size_type;
+  for (auto const& KV : subs) {
+    SizeT pos;
+    SizeT next_start = 0;
+    while ((pos = source.find(KV.first, next_start)) != std::string::npos) {
+      next_start = pos + KV.second.size();
+      source.replace(pos, KV.first.size(), KV.second);
+    }
+  }
+  return source;
+}
+
+void CheckCase(std::stringstream& remaining_output, TestCase const& TC,
+               TestCaseList const& not_checks) {
+  std::string first_line;
+  bool on_first = true;
+  std::string line;
+  while (remaining_output.eof() == false) {
+    CHECK(remaining_output.good());
+    std::getline(remaining_output, line);
+    if (on_first) {
+      first_line = line;
+      on_first = false;
+    }
+    for (const auto& NC : not_checks) {
+      CHECK(!NC.regex->Match(line))
+          << "Unexpected match for line \"" << line << "\" for MR_Not regex \""
+          << NC.regex_str << "\""
+          << "\n    actual regex string \"" << TC.substituted_regex << "\""
+          << "\n    started matching near: " << first_line;
+    }
+    if (TC.regex->Match(line)) return;
+    CHECK(TC.match_rule != MR_Next)
+        << "Expected line \"" << line << "\" to match regex \"" << TC.regex_str
+        << "\""
+        << "\n    actual regex string \"" << TC.substituted_regex << "\""
+        << "\n    started matching near: " << first_line;
+  }
+  CHECK(remaining_output.eof() == false)
+      << "End of output reached before match for regex \"" << TC.regex_str
+      << "\" was found"
+      << "\n    actual regex string \"" << TC.substituted_regex << "\""
+      << "\n    started matching near: " << first_line;
+}
+
+void CheckCases(TestCaseList const& checks, std::stringstream& output) {
+  std::vector<TestCase> not_checks;
+  for (size_t i = 0; i < checks.size(); ++i) {
+    const auto& TC = checks[i];
+    if (TC.match_rule == MR_Not) {
+      not_checks.push_back(TC);
+      continue;
+    }
+    CheckCase(output, TC, not_checks);
+    not_checks.clear();
+  }
+}
+
+class TestReporter : public benchmark::BenchmarkReporter {
+ public:
+  TestReporter(std::vector<benchmark::BenchmarkReporter*> reps)
+      : reporters_(reps) {}
+
+  virtual bool ReportContext(const Context& context) {
+    bool last_ret = false;
+    bool first = true;
+    for (auto rep : reporters_) {
+      bool new_ret = rep->ReportContext(context);
+      CHECK(first || new_ret == last_ret)
+          << "Reports return different values for ReportContext";
+      first = false;
+      last_ret = new_ret;
+    }
+    (void)first;
+    return last_ret;
+  }
+
+  void ReportRuns(const std::vector<Run>& report) {
+    for (auto rep : reporters_) rep->ReportRuns(report);
+  }
+  void Finalize() {
+    for (auto rep : reporters_) rep->Finalize();
+  }
+
+ private:
+  std::vector<benchmark::BenchmarkReporter*> reporters_;
+};
+}  // namespace
+
+}  // end namespace internal
+
+// ========================================================================= //
+// -------------------------- Results checking ----------------------------- //
+// ========================================================================= //
+
+namespace internal {
+
+// Utility class to manage subscribers for checking benchmark results.
+// It works by parsing the CSV output to read the results.
+class ResultsChecker {
+ public:
+  struct PatternAndFn : public TestCase {  // reusing TestCase for its regexes
+    PatternAndFn(const std::string& rx, ResultsCheckFn fn_)
+        : TestCase(rx), fn(fn_) {}
+    ResultsCheckFn fn;
+  };
+
+  std::vector<PatternAndFn> check_patterns;
+  std::vector<Results> results;
+  std::vector<std::string> field_names;
+
+  void Add(const std::string& entry_pattern, ResultsCheckFn fn);
+
+  void CheckResults(std::stringstream& output);
+
+ private:
+  void SetHeader_(const std::string& csv_header);
+  void SetValues_(const std::string& entry_csv_line);
+
+  std::vector<std::string> SplitCsv_(const std::string& line);
+};
+
+// store the static ResultsChecker in a function to prevent initialization
+// order problems
+ResultsChecker& GetResultsChecker() {
+  static ResultsChecker rc;
+  return rc;
+}
+
+// add a results checker for a benchmark
+void ResultsChecker::Add(const std::string& entry_pattern, ResultsCheckFn fn) {
+  check_patterns.emplace_back(entry_pattern, fn);
+}
+
+// check the results of all subscribed benchmarks
+void ResultsChecker::CheckResults(std::stringstream& output) {
+  // first reset the stream to the start
+  {
+    auto start = std::stringstream::pos_type(0);
+    // clear before calling tellg()
+    output.clear();
+    // seek to zero only when needed
+    if (output.tellg() > start) output.seekg(start);
+    // and just in case
+    output.clear();
+  }
+  // now go over every line and publish it to the ResultsChecker
+  std::string line;
+  bool on_first = true;
+  while (output.eof() == false) {
+    CHECK(output.good());
+    std::getline(output, line);
+    if (on_first) {
+      SetHeader_(line);  // this is important
+      on_first = false;
+      continue;
+    }
+    SetValues_(line);
+  }
+  // finally we can call the subscribed check functions
+  for (const auto& p : check_patterns) {
+    VLOG(2) << "--------------------------------\n";
+    VLOG(2) << "checking for benchmarks matching " << p.regex_str << "...\n";
+    for (const auto& r : results) {
+      if (!p.regex->Match(r.name)) {
+        VLOG(2) << p.regex_str << " is not matched by " << r.name << "\n";
+        continue;
+      } else {
+        VLOG(2) << p.regex_str << " is matched by " << r.name << "\n";
+      }
+      VLOG(1) << "Checking results of " << r.name << ": ... \n";
+      p.fn(r);
+      VLOG(1) << "Checking results of " << r.name << ": OK.\n";
+    }
+  }
+}
+
+// prepare for the names in this header
+void ResultsChecker::SetHeader_(const std::string& csv_header) {
+  field_names = SplitCsv_(csv_header);
+}
+
+// set the values for a benchmark
+void ResultsChecker::SetValues_(const std::string& entry_csv_line) {
+  if (entry_csv_line.empty()) return;  // some lines are empty
+  CHECK(!field_names.empty());
+  auto vals = SplitCsv_(entry_csv_line);
+  CHECK_EQ(vals.size(), field_names.size());
+  results.emplace_back(vals[0]);  // vals[0] is the benchmark name
+  auto& entry = results.back();
+  for (size_t i = 1, e = vals.size(); i < e; ++i) {
+    entry.values[field_names[i]] = vals[i];
+  }
+}
+
+// a quick'n'dirty csv splitter (eliminating quotes)
+std::vector<std::string> ResultsChecker::SplitCsv_(const std::string& line) {
+  std::vector<std::string> out;
+  if (line.empty()) return out;
+  if (!field_names.empty()) out.reserve(field_names.size());
+  size_t prev = 0, pos = line.find_first_of(','), curr = pos;
+  while (pos != line.npos) {
+    CHECK(curr > 0);
+    if (line[prev] == '"') ++prev;
+    if (line[curr - 1] == '"') --curr;
+    out.push_back(line.substr(prev, curr - prev));
+    prev = pos + 1;
+    pos = line.find_first_of(',', pos + 1);
+    curr = pos;
+  }
+  curr = line.size();
+  if (line[prev] == '"') ++prev;
+  if (line[curr - 1] == '"') --curr;
+  out.push_back(line.substr(prev, curr - prev));
+  return out;
+}
+
+}  // end namespace internal
+
+size_t AddChecker(const char* bm_name, ResultsCheckFn fn) {
+  auto& rc = internal::GetResultsChecker();
+  rc.Add(bm_name, fn);
+  return rc.results.size();
+}
+
+int Results::NumThreads() const {  // thread count from "/threads:N", else 1
+  auto pos = name.find("/threads:");
+  if (pos == name.npos) return 1;
+  auto end = name.find('/', pos + 9);  // start of the next name component
+  std::stringstream ss;  // substr() takes a length, not an end position
+  ss << name.substr(pos + 9, end == name.npos ? end : end - (pos + 9));
+  int num = 1;
+  ss >> num;
+  CHECK(!ss.fail());
+  return num;
+}
+
+double Results::NumIterations() const {
+  return GetAs<double>("iterations");
+}
+
+double Results::GetTime(BenchmarkTime which) const {
+  CHECK(which == kCpuTime || which == kRealTime);
+  const char* which_str = which == kCpuTime ? "cpu_time" : "real_time";
+  double val = GetAs<double>(which_str);
+  auto unit = Get("time_unit");
+  CHECK(unit);
+  if (*unit == "ns") {
+    return val * 1.e-9;
+  } else if (*unit == "us") {
+    return val * 1.e-6;
+  } else if (*unit == "ms") {
+    return val * 1.e-3;
+  } else if (*unit == "s") {
+    return val;
+  } else {
+    CHECK(1 == 0) << "unknown time unit: " << *unit;
+    return 0;
+  }
+}
+
+// ========================================================================= //
+// -------------------------- Public API Definitions------------------------ //
+// ========================================================================= //
+
+TestCase::TestCase(std::string re, int rule)
+    : regex_str(std::move(re)),
+      match_rule(rule),
+      substituted_regex(internal::PerformSubstitutions(regex_str)),
+      regex(std::make_shared<benchmark::Regex>()) {
+  std::string err_str;
+  regex->Init(substituted_regex, &err_str);
+  CHECK(err_str.empty()) << "Could not construct regex \"" << substituted_regex
+                         << "\""
+                         << "\n    originally \"" << regex_str << "\""
+                         << "\n    got error: " << err_str;
+}
+
+int AddCases(TestCaseID ID, std::initializer_list<TestCase> il) {
+  auto& L = internal::GetTestCaseList(ID);
+  L.insert(L.end(), il);
+  return 0;
+}
+
+int SetSubstitutions(
+    std::initializer_list<std::pair<std::string, std::string>> il) {
+  auto& subs = internal::GetSubstitutions();
+  for (auto KV : il) {
+    bool exists = false;
+    KV.second = internal::PerformSubstitutions(KV.second);
+    for (auto& EKV : subs) {
+      if (EKV.first == KV.first) {
+        EKV.second = std::move(KV.second);
+        exists = true;
+        break;
+      }
+    }
+    if (!exists) subs.push_back(std::move(KV));
+  }
+  return 0;
+}
+
+// Disable deprecated warnings temporarily because we need to reference
+// CSVReporter but don't want to trigger -Werror=-Wdeprecated-declarations
+#ifdef __GNUC__
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wdeprecated-declarations"
+#endif
+void RunOutputTests(int argc, char* argv[]) {
+  using internal::GetTestCaseList;
+  benchmark::Initialize(&argc, argv);
+  auto options = benchmark::internal::GetOutputOptions(/*force_no_color*/ true);
+  benchmark::ConsoleReporter CR(options);
+  benchmark::JSONReporter JR;
+  benchmark::CSVReporter CSVR;
+  struct ReporterTest {
+    const char* name;
+    std::vector<TestCase>& output_cases;
+    std::vector<TestCase>& error_cases;
+    benchmark::BenchmarkReporter& reporter;
+    std::stringstream out_stream;
+    std::stringstream err_stream;
+
+    ReporterTest(const char* n, std::vector<TestCase>& out_tc,
+                 std::vector<TestCase>& err_tc,
+                 benchmark::BenchmarkReporter& br)
+        : name(n), output_cases(out_tc), error_cases(err_tc), reporter(br) {
+      reporter.SetOutputStream(&out_stream);
+      reporter.SetErrorStream(&err_stream);
+    }
+  } TestCases[] = {
+      {"ConsoleReporter", GetTestCaseList(TC_ConsoleOut),
+       GetTestCaseList(TC_ConsoleErr), CR},
+      {"JSONReporter", GetTestCaseList(TC_JSONOut), GetTestCaseList(TC_JSONErr),
+       JR},
+      {"CSVReporter", GetTestCaseList(TC_CSVOut), GetTestCaseList(TC_CSVErr),
+       CSVR},
+  };
+
+  // Create the test reporter and run the benchmarks.
+  std::cout << "Running benchmarks...\n";
+  internal::TestReporter test_rep({&CR, &JR, &CSVR});
+  benchmark::RunSpecifiedBenchmarks(&test_rep);
+
+  for (auto& rep_test : TestCases) {
+    std::string msg = std::string("\nTesting ") + rep_test.name + " Output\n";
+    std::string banner(msg.size() - 1, '-');
+    std::cout << banner << msg << banner << "\n";
+
+    std::cerr << rep_test.err_stream.str();
+    std::cout << rep_test.out_stream.str();
+
+    internal::CheckCases(rep_test.error_cases, rep_test.err_stream);
+    internal::CheckCases(rep_test.output_cases, rep_test.out_stream);
+
+    std::cout << "\n";
+  }
+
+  // now that we know the output is as expected, we can dispatch
+  // the checks to subscribees.
+  auto& csv = TestCases[2];
+  // would use == but gcc spits a warning
+  CHECK(std::strcmp(csv.name, "CSVReporter") == 0);
+  internal::GetResultsChecker().CheckResults(csv.out_stream);
+}
+
+#ifdef __GNUC__
+#pragma GCC diagnostic pop
+#endif
+
+int SubstrCnt(const std::string& haystack, const std::string& pat) {
+  if (pat.length() == 0) return 0;
+  int count = 0;
+  for (size_t offset = haystack.find(pat); offset != std::string::npos;
+       offset = haystack.find(pat, offset + pat.length()))
+    ++count;
+  return count;
+}
+
+static char ToHex(int ch) {
+  return ch < 10 ? static_cast<char>('0' + ch)
+                 : static_cast<char>('a' + (ch - 10));
+}
+
+static char RandomHexChar() {
+  static std::mt19937 rd{std::random_device{}()};
+  static std::uniform_int_distribution<int> mrand{0, 15};
+  return ToHex(mrand(rd));
+}
+
+static std::string GetRandomFileName() {
+  std::string model = "test.%%%%%%";
+  for (auto & ch :  model) {
+    if (ch == '%')
+      ch = RandomHexChar();
+  }
+  return model;
+}
+
+static bool FileExists(std::string const& name) {
+  std::ifstream in(name.c_str());
+  return in.good();
+}
+
+static std::string GetTempFileName() {
+  // Returns a random "test.xxxxxx" name that does not currently exist, trying
+  // up to three candidates before aborting. The existence check and the later
+  // file creation are not atomic, so this still races in the way tmpnam does.
+  int retries = 3;
+  while (retries-- > 0) {
+    std::string name = GetRandomFileName();
+    if (!FileExists(name))
+      return name;
+  }
+  std::cerr << "Failed to create unique temporary file name" << std::endl;
+  std::abort();
+}
+
+std::string GetFileReporterOutput(int argc, char* argv[]) {
+  std::vector<char*> new_argv(argv, argv + argc);
+  assert(static_cast<decltype(new_argv)::size_type>(argc) == new_argv.size());
+
+  std::string tmp_file_name = GetTempFileName();
+  std::cout << "Will be using this as the tmp file: " << tmp_file_name << '\n';
+
+  std::string tmp = "--benchmark_out=";
+  tmp += tmp_file_name;
+  new_argv.emplace_back(const_cast<char*>(tmp.c_str()));
+
+  argc = int(new_argv.size());
+
+  benchmark::Initialize(&argc, new_argv.data());
+  benchmark::RunSpecifiedBenchmarks();
+
+  // Read the output back from the file, and delete the file.
+  std::ifstream tmp_stream(tmp_file_name);
+  std::string output = std::string((std::istreambuf_iterator<char>(tmp_stream)),
+                                   std::istreambuf_iterator<char>());
+  std::remove(tmp_file_name.c_str());
+
+  return output;
+}
diff --git a/test/register_benchmark_test.cc b/test/register_benchmark_test.cc
new file mode 100644
index 0000000..3ac5b21
--- /dev/null
+++ b/test/register_benchmark_test.cc
@@ -0,0 +1,184 @@
+
+#undef NDEBUG
+#include <cassert>
+#include <vector>
+
+#include "../src/check.h"  // NOTE: check.h is for internal use only!
+#include "benchmark/benchmark.h"
+
+namespace {
+
+class TestReporter : public benchmark::ConsoleReporter {
+ public:
+  virtual void ReportRuns(const std::vector<Run>& report) {
+    all_runs_.insert(all_runs_.end(), begin(report), end(report));
+    ConsoleReporter::ReportRuns(report);
+  }
+
+  std::vector<Run> all_runs_;
+};
+
+struct TestCase {
+  std::string name;
+  const char* label;
+  // Note: not explicit as we rely on it being converted through ADD_CASES.
+  TestCase(const char* xname) : TestCase(xname, nullptr) {}
+  TestCase(const char* xname, const char* xlabel)
+      : name(xname), label(xlabel) {}
+
+  typedef benchmark::BenchmarkReporter::Run Run;
+
+  void CheckRun(Run const& run) const {
+    // clang-format off
+    CHECK(name == run.benchmark_name()) << "expected " << name << " got "
+                                      << run.benchmark_name();
+    if (label) {
+      CHECK(run.report_label == label) << "expected " << label << " got "
+                                       << run.report_label;
+    } else {
+      CHECK(run.report_label == "");
+    }
+    // clang-format on
+  }
+};
+
+std::vector<TestCase> ExpectedResults;
+
+int AddCases(std::initializer_list<TestCase> const& v) {
+  for (auto N : v) {
+    ExpectedResults.push_back(N);
+  }
+  return 0;
+}
+
+#define CONCAT(x, y) CONCAT2(x, y)
+#define CONCAT2(x, y) x##y
+#define ADD_CASES(...) int CONCAT(dummy, __LINE__) = AddCases({__VA_ARGS__})
+
+}  // end namespace
+
+typedef benchmark::internal::Benchmark* ReturnVal;
+
+//----------------------------------------------------------------------------//
+// Test RegisterBenchmark with no additional arguments
+//----------------------------------------------------------------------------//
+void BM_function(benchmark::State& state) {
+  for (auto _ : state) {
+  }
+}
+BENCHMARK(BM_function);
+ReturnVal dummy = benchmark::RegisterBenchmark(
+    "BM_function_manual_registration", BM_function);
+ADD_CASES({"BM_function"}, {"BM_function_manual_registration"});
+
+//----------------------------------------------------------------------------//
+// Test RegisterBenchmark with additional arguments
+// Note: GCC <= 4.8 do not support this form of RegisterBenchmark because they
+//       reject the variadic pack expansion of lambda captures.
+//----------------------------------------------------------------------------//
+#ifndef BENCHMARK_HAS_NO_VARIADIC_REGISTER_BENCHMARK
+
+void BM_extra_args(benchmark::State& st, const char* label) {
+  for (auto _ : st) {
+  }
+  st.SetLabel(label);
+}
+int RegisterFromFunction() {
+  std::pair<const char*, const char*> cases[] = {
+      {"test1", "One"}, {"test2", "Two"}, {"test3", "Three"}};
+  for (auto const& c : cases)
+    benchmark::RegisterBenchmark(c.first, &BM_extra_args, c.second);
+  return 0;
+}
+int dummy2 = RegisterFromFunction();
+ADD_CASES({"test1", "One"}, {"test2", "Two"}, {"test3", "Three"});
+
+#endif  // BENCHMARK_HAS_NO_VARIADIC_REGISTER_BENCHMARK
+
+//----------------------------------------------------------------------------//
+// Test RegisterBenchmark with different callable types
+//----------------------------------------------------------------------------//
+
+struct CustomFixture {
+  void operator()(benchmark::State& st) {
+    for (auto _ : st) {
+    }
+  }
+};
+
+void TestRegistrationAtRuntime() {
+#ifdef BENCHMARK_HAS_CXX11
+  {
+    CustomFixture fx;
+    benchmark::RegisterBenchmark("custom_fixture", fx);
+    AddCases({"custom_fixture"});
+  }
+#endif
+#ifndef BENCHMARK_HAS_NO_VARIADIC_REGISTER_BENCHMARK
+  {
+    const char* x = "42";
+    auto capturing_lam = [=](benchmark::State& st) {
+      for (auto _ : st) {
+      }
+      st.SetLabel(x);
+    };
+    benchmark::RegisterBenchmark("lambda_benchmark", capturing_lam);
+    AddCases({{"lambda_benchmark", x}});
+  }
+#endif
+}
+
+// Test that all benchmarks, registered either during static init or at runtime,
+// are run and the results are passed to the reporter.
+void RunTestOne() {
+  TestRegistrationAtRuntime();
+
+  TestReporter test_reporter;
+  benchmark::RunSpecifiedBenchmarks(&test_reporter);
+
+  typedef benchmark::BenchmarkReporter::Run Run;
+  auto EB = ExpectedResults.begin();
+
+  for (Run const& run : test_reporter.all_runs_) {
+    assert(EB != ExpectedResults.end());
+    EB->CheckRun(run);
+    ++EB;
+  }
+  assert(EB == ExpectedResults.end());
+}
+
+// Test that ClearRegisteredBenchmarks() clears all previously registered
+// benchmarks.
+// Also test that new benchmarks can be registered and run afterwards.
+void RunTestTwo() {
+  assert(ExpectedResults.size() != 0 &&
+         "must have at least one registered benchmark");
+  ExpectedResults.clear();
+  benchmark::ClearRegisteredBenchmarks();
+
+  TestReporter test_reporter;
+  size_t num_ran = benchmark::RunSpecifiedBenchmarks(&test_reporter);
+  assert(num_ran == 0);
+  assert(test_reporter.all_runs_.begin() == test_reporter.all_runs_.end());
+
+  TestRegistrationAtRuntime();
+  num_ran = benchmark::RunSpecifiedBenchmarks(&test_reporter);
+  assert(num_ran == ExpectedResults.size());
+
+  typedef benchmark::BenchmarkReporter::Run Run;
+  auto EB = ExpectedResults.begin();
+
+  for (Run const& run : test_reporter.all_runs_) {
+    assert(EB != ExpectedResults.end());
+    EB->CheckRun(run);
+    ++EB;
+  }
+  assert(EB == ExpectedResults.end());
+}
+
+int main(int argc, char* argv[]) {
+  benchmark::Initialize(&argc, argv);
+
+  RunTestOne();
+  RunTestTwo();
+}
diff --git a/test/report_aggregates_only_test.cc b/test/report_aggregates_only_test.cc
new file mode 100644
index 0000000..9646b9b
--- /dev/null
+++ b/test/report_aggregates_only_test.cc
@@ -0,0 +1,39 @@
+
+#undef NDEBUG
+#include <cstdio>
+#include <string>
+
+#include "benchmark/benchmark.h"
+#include "output_test.h"
+
+// Ok this test is super ugly. We want to check what happens with the file
+// reporter in the presence of ReportAggregatesOnly().
+// We do not care about console output, the normal tests check that already.
+
+void BM_SummaryRepeat(benchmark::State& state) {
+  for (auto _ : state) {
+  }
+}
+BENCHMARK(BM_SummaryRepeat)->Repetitions(3)->ReportAggregatesOnly();
+
+int main(int argc, char* argv[]) {
+  const std::string output = GetFileReporterOutput(argc, argv);
+
+  if (SubstrCnt(output, "\"name\": \"BM_SummaryRepeat/repeats:3") != 3 ||
+      SubstrCnt(output, "\"name\": \"BM_SummaryRepeat/repeats:3_mean\"") != 1 ||
+      SubstrCnt(output, "\"name\": \"BM_SummaryRepeat/repeats:3_median\"") !=
+          1 ||
+      SubstrCnt(output, "\"name\": \"BM_SummaryRepeat/repeats:3_stddev\"") !=
+          1) {
+    std::cout << "Precondition mismatch. Expected to only find three "
+                 "occurrences of \"BM_SummaryRepeat/repeats:3\" substring:\n"
+                 "\"name\": \"BM_SummaryRepeat/repeats:3_mean\", "
+                 "\"name\": \"BM_SummaryRepeat/repeats:3_median\", "
+                 "\"name\": \"BM_SummaryRepeat/repeats:3_stddev\"\nThe entire "
+                 "output:\n";
+    std::cout << output;
+    return 1;
+  }
+
+  return 0;
+}
diff --git a/test/reporter_output_test.cc b/test/reporter_output_test.cc
index b3898ac..d24a57d 100644
--- a/test/reporter_output_test.cc
+++ b/test/reporter_output_test.cc
@@ -1,259 +1,834 @@
 
 #undef NDEBUG
-#include "benchmark/benchmark.h"
-#include "../src/check.h" // NOTE: check.h is for internal use only!
-#include "../src/re.h" // NOTE: re.h is for internal use only
-#include <cassert>
-#include <cstring>
-#include <iostream>
-#include <sstream>
-#include <vector>
 #include <utility>
 
-namespace {
+#include "benchmark/benchmark.h"
+#include "output_test.h"
 
 // ========================================================================= //
-// -------------------------- Testing Case --------------------------------- //
+// ---------------------- Testing Prologue Output -------------------------- //
 // ========================================================================= //
 
-enum MatchRules {
-  MR_Default, // Skip non-matching lines until a match is found.
-  MR_Next    // Match must occur on the next line.
-};
-
-struct TestCase {
-  std::string regex;
-  int match_rule;
-
-  TestCase(std::string re, int rule = MR_Default) : regex(re), match_rule(rule) {}
-
-  void Check(std::stringstream& remaining_output) const {
-    benchmark::Regex r;
-    std::string err_str;
-    r.Init(regex, &err_str);
-    CHECK(err_str.empty()) << "Could not construct regex \"" << regex << "\""
-                           << " got Error: " << err_str;
-
-    std::string line;
-    while (remaining_output.eof() == false) {
-        CHECK(remaining_output.good());
-        std::getline(remaining_output, line);
-        if (r.Match(line)) return;
-        CHECK(match_rule != MR_Next) << "Expected line \"" << line
-                                     << "\" to match regex \"" << regex << "\"";
-    }
-
-    CHECK(remaining_output.eof() == false)
-        << "End of output reached before match for regex \"" << regex
-        << "\" was found";
+ADD_CASES(TC_ConsoleOut, {{"^[-]+$", MR_Next},
+                          {"^Benchmark %s Time %s CPU %s Iterations$", MR_Next},
+                          {"^[-]+$", MR_Next}});
+static int AddContextCases() {
+  AddCases(TC_ConsoleErr,
+           {
+               {"^%int-%int-%intT%int:%int:%int[-+]%int:%int$", MR_Default},
+               {"Running .*/reporter_output_test(\\.exe)?$", MR_Next},
+               {"Run on \\(%int X %float MHz CPU s?\\)", MR_Next},
+           });
+  AddCases(TC_JSONOut,
+           {{"^\\{", MR_Default},
+            {"\"context\":", MR_Next},
+            {"\"date\": \"", MR_Next},
+            {"\"host_name\":", MR_Next},
+            {"\"executable\": \".*(/|\\\\)reporter_output_test(\\.exe)?\",",
+             MR_Next},
+            {"\"num_cpus\": %int,$", MR_Next},
+            {"\"mhz_per_cpu\": %float,$", MR_Next},
+            {"\"caches\": \\[$", MR_Default}});
+  auto const& Info = benchmark::CPUInfo::Get();
+  auto const& Caches = Info.caches;
+  if (!Caches.empty()) {
+    AddCases(TC_ConsoleErr, {{"CPU Caches:$", MR_Next}});
   }
-};
-
-std::vector<TestCase> ConsoleOutputTests;
-std::vector<TestCase> JSONOutputTests;
-std::vector<TestCase> CSVOutputTests;
-
-std::vector<TestCase> ConsoleErrorTests;
-std::vector<TestCase> JSONErrorTests;
-std::vector<TestCase> CSVErrorTests;
+  for (size_t I = 0; I < Caches.size(); ++I) {
+    std::string num_caches_str =
+        Caches[I].num_sharing != 0 ? " \\(x%int\\)$" : "$";
+    AddCases(TC_ConsoleErr,
+             {{"L%int (Data|Instruction|Unified) %int KiB" + num_caches_str,
+               MR_Next}});
+    AddCases(TC_JSONOut, {{"\\{$", MR_Next},
+                          {"\"type\": \"", MR_Next},
+                          {"\"level\": %int,$", MR_Next},
+                          {"\"size\": %int,$", MR_Next},
+                          {"\"num_sharing\": %int$", MR_Next},
+                          {"}[,]{0,1}$", MR_Next}});
+  }
+  AddCases(TC_JSONOut, {{"],$"}});
+  auto const& LoadAvg = Info.load_avg;
+  if (!LoadAvg.empty()) {
+    AddCases(TC_ConsoleErr,
+             {{"Load Average: (%float, ){0,2}%float$", MR_Next}});
+  }
+  AddCases(TC_JSONOut, {{"\"load_avg\": \\[(%float,?){0,3}],$", MR_Next}});
+  return 0;
+}
+int dummy_register = AddContextCases();
+ADD_CASES(TC_CSVOut, {{"%csv_header"}});
 
 // ========================================================================= //
-// -------------------------- Test Helpers --------------------------------- //
+// ------------------------ Testing Basic Output --------------------------- //
 // ========================================================================= //
 
-class TestReporter : public benchmark::BenchmarkReporter {
-public:
-  TestReporter(std::vector<benchmark::BenchmarkReporter*> reps)
-      : reporters_(reps)  {}
-
-  virtual bool ReportContext(const Context& context) {
-    bool last_ret = false;
-    bool first = true;
-    for (auto rep : reporters_) {
-      bool new_ret = rep->ReportContext(context);
-      CHECK(first || new_ret == last_ret)
-          << "Reports return different values for ReportContext";
-      first = false;
-      last_ret = new_ret;
-    }
-    return last_ret;
+void BM_basic(benchmark::State& state) {
+  for (auto _ : state) {
   }
+}
+BENCHMARK(BM_basic);
 
-  virtual void ReportRuns(const std::vector<Run>& report) {
-    for (auto rep : reporters_)
-      rep->ReportRuns(report);
-  }
+ADD_CASES(TC_ConsoleOut, {{"^BM_basic %console_report$"}});
+ADD_CASES(TC_JSONOut, {{"\"name\": \"BM_basic\",$"},
+                       {"\"run_name\": \"BM_basic\",$", MR_Next},
+                       {"\"run_type\": \"iteration\",$", MR_Next},
+                       {"\"repetitions\": 0,$", MR_Next},
+                       {"\"repetition_index\": 0,$", MR_Next},
+                       {"\"threads\": 1,$", MR_Next},
+                       {"\"iterations\": %int,$", MR_Next},
+                       {"\"real_time\": %float,$", MR_Next},
+                       {"\"cpu_time\": %float,$", MR_Next},
+                       {"\"time_unit\": \"ns\"$", MR_Next},
+                       {"}", MR_Next}});
+ADD_CASES(TC_CSVOut, {{"^\"BM_basic\",%csv_report$"}});
 
-  virtual void Finalize() {
-      for (auto rep : reporters_)
-        rep->Finalize();
-  }
+// ========================================================================= //
+// ------------------------ Testing Bytes per Second Output ---------------- //
+// ========================================================================= //
 
-private:
-  std::vector<benchmark::BenchmarkReporter*> reporters_;
-};
+void BM_bytes_per_second(benchmark::State& state) {
+  for (auto _ : state) {
+    // This test requires a non-zero CPU time to avoid divide-by-zero
+    benchmark::DoNotOptimize(state.iterations());
+  }
+  state.SetBytesProcessed(1);
+}
+BENCHMARK(BM_bytes_per_second);
+
+ADD_CASES(TC_ConsoleOut, {{"^BM_bytes_per_second %console_report "
+                           "bytes_per_second=%float[kM]{0,1}/s$"}});
+ADD_CASES(TC_JSONOut, {{"\"name\": \"BM_bytes_per_second\",$"},
+                       {"\"run_name\": \"BM_bytes_per_second\",$", MR_Next},
+                       {"\"run_type\": \"iteration\",$", MR_Next},
+                       {"\"repetitions\": 0,$", MR_Next},
+                       {"\"repetition_index\": 0,$", MR_Next},
+                       {"\"threads\": 1,$", MR_Next},
+                       {"\"iterations\": %int,$", MR_Next},
+                       {"\"real_time\": %float,$", MR_Next},
+                       {"\"cpu_time\": %float,$", MR_Next},
+                       {"\"time_unit\": \"ns\",$", MR_Next},
+                       {"\"bytes_per_second\": %float$", MR_Next},
+                       {"}", MR_Next}});
+ADD_CASES(TC_CSVOut, {{"^\"BM_bytes_per_second\",%csv_bytes_report$"}});
 
+// ========================================================================= //
+// ------------------------ Testing Items per Second Output ---------------- //
+// ========================================================================= //
 
-#define CONCAT2(x, y) x##y
-#define CONCAT(x, y) CONCAT2(x, y)
+void BM_items_per_second(benchmark::State& state) {
+  for (auto _ : state) {
+    // This test requires a non-zero CPU time to avoid divide-by-zero
+    benchmark::DoNotOptimize(state.iterations());
+  }
+  state.SetItemsProcessed(1);
+}
+BENCHMARK(BM_items_per_second);
+
+ADD_CASES(TC_ConsoleOut, {{"^BM_items_per_second %console_report "
+                           "items_per_second=%float[kM]{0,1}/s$"}});
+ADD_CASES(TC_JSONOut, {{"\"name\": \"BM_items_per_second\",$"},
+                       {"\"run_name\": \"BM_items_per_second\",$", MR_Next},
+                       {"\"run_type\": \"iteration\",$", MR_Next},
+                       {"\"repetitions\": 0,$", MR_Next},
+                       {"\"repetition_index\": 0,$", MR_Next},
+                       {"\"threads\": 1,$", MR_Next},
+                       {"\"iterations\": %int,$", MR_Next},
+                       {"\"real_time\": %float,$", MR_Next},
+                       {"\"cpu_time\": %float,$", MR_Next},
+                       {"\"time_unit\": \"ns\",$", MR_Next},
+                       {"\"items_per_second\": %float$", MR_Next},
+                       {"}", MR_Next}});
+ADD_CASES(TC_CSVOut, {{"^\"BM_items_per_second\",%csv_items_report$"}});
 
-#define ADD_CASES(...) \
-    int CONCAT(dummy, __LINE__) = AddCases(__VA_ARGS__)
+// ========================================================================= //
+// ------------------------ Testing Label Output --------------------------- //
+// ========================================================================= //
 
-int AddCases(std::vector<TestCase>* out, std::initializer_list<TestCase> const& v) {
-  for (auto const& TC : v)
-    out->push_back(TC);
-  return 0;
+void BM_label(benchmark::State& state) {
+  for (auto _ : state) {
+  }
+  state.SetLabel("some label");
 }
+BENCHMARK(BM_label);
+
+ADD_CASES(TC_ConsoleOut, {{"^BM_label %console_report some label$"}});
+ADD_CASES(TC_JSONOut, {{"\"name\": \"BM_label\",$"},
+                       {"\"run_name\": \"BM_label\",$", MR_Next},
+                       {"\"run_type\": \"iteration\",$", MR_Next},
+                       {"\"repetitions\": 0,$", MR_Next},
+                       {"\"repetition_index\": 0,$", MR_Next},
+                       {"\"threads\": 1,$", MR_Next},
+                       {"\"iterations\": %int,$", MR_Next},
+                       {"\"real_time\": %float,$", MR_Next},
+                       {"\"cpu_time\": %float,$", MR_Next},
+                       {"\"time_unit\": \"ns\",$", MR_Next},
+                       {"\"label\": \"some label\"$", MR_Next},
+                       {"}", MR_Next}});
+ADD_CASES(TC_CSVOut, {{"^\"BM_label\",%csv_label_report_begin\"some "
+                       "label\"%csv_label_report_end$"}});
 
-template <class First>
-std::string join(First f) { return f; }
+// ========================================================================= //
+// ------------------------ Testing Time Label Output ---------------------- //
+// ========================================================================= //
 
-template <class First, class ...Args>
-std::string join(First f, Args&&... args) {
-    return std::string(std::move(f)) + "[ ]+" + join(std::forward<Args>(args)...);
+void BM_time_label_nanosecond(benchmark::State& state) {
+  for (auto _ : state) {
+  }
 }
+BENCHMARK(BM_time_label_nanosecond)->Unit(benchmark::kNanosecond);
+
+ADD_CASES(TC_ConsoleOut, {{"^BM_time_label_nanosecond %console_report$"}});
+ADD_CASES(TC_JSONOut,
+          {{"\"name\": \"BM_time_label_nanosecond\",$"},
+           {"\"run_name\": \"BM_time_label_nanosecond\",$", MR_Next},
+           {"\"run_type\": \"iteration\",$", MR_Next},
+           {"\"repetitions\": 0,$", MR_Next},
+           {"\"repetition_index\": 0,$", MR_Next},
+           {"\"threads\": 1,$", MR_Next},
+           {"\"iterations\": %int,$", MR_Next},
+           {"\"real_time\": %float,$", MR_Next},
+           {"\"cpu_time\": %float,$", MR_Next},
+           {"\"time_unit\": \"ns\"$", MR_Next},
+           {"}", MR_Next}});
+ADD_CASES(TC_CSVOut, {{"^\"BM_time_label_nanosecond\",%csv_report$"}});
+
+void BM_time_label_microsecond(benchmark::State& state) {
+  for (auto _ : state) {
+  }
+}
+BENCHMARK(BM_time_label_microsecond)->Unit(benchmark::kMicrosecond);
+
+ADD_CASES(TC_ConsoleOut, {{"^BM_time_label_microsecond %console_us_report$"}});
+ADD_CASES(TC_JSONOut,
+          {{"\"name\": \"BM_time_label_microsecond\",$"},
+           {"\"run_name\": \"BM_time_label_microsecond\",$", MR_Next},
+           {"\"run_type\": \"iteration\",$", MR_Next},
+           {"\"repetitions\": 0,$", MR_Next},
+           {"\"repetition_index\": 0,$", MR_Next},
+           {"\"threads\": 1,$", MR_Next},
+           {"\"iterations\": %int,$", MR_Next},
+           {"\"real_time\": %float,$", MR_Next},
+           {"\"cpu_time\": %float,$", MR_Next},
+           {"\"time_unit\": \"us\"$", MR_Next},
+           {"}", MR_Next}});
+ADD_CASES(TC_CSVOut, {{"^\"BM_time_label_microsecond\",%csv_us_report$"}});
+
+void BM_time_label_millisecond(benchmark::State& state) {
+  for (auto _ : state) {
+  }
+}
+BENCHMARK(BM_time_label_millisecond)->Unit(benchmark::kMillisecond);
+
+ADD_CASES(TC_ConsoleOut, {{"^BM_time_label_millisecond %console_ms_report$"}});
+ADD_CASES(TC_JSONOut,
+          {{"\"name\": \"BM_time_label_millisecond\",$"},
+           {"\"run_name\": \"BM_time_label_millisecond\",$", MR_Next},
+           {"\"run_type\": \"iteration\",$", MR_Next},
+           {"\"repetitions\": 0,$", MR_Next},
+           {"\"repetition_index\": 0,$", MR_Next},
+           {"\"threads\": 1,$", MR_Next},
+           {"\"iterations\": %int,$", MR_Next},
+           {"\"real_time\": %float,$", MR_Next},
+           {"\"cpu_time\": %float,$", MR_Next},
+           {"\"time_unit\": \"ms\"$", MR_Next},
+           {"}", MR_Next}});
+ADD_CASES(TC_CSVOut, {{"^\"BM_time_label_millisecond\",%csv_ms_report$"}});
+
+void BM_time_label_second(benchmark::State& state) {
+  for (auto _ : state) {
+  }
+}
+BENCHMARK(BM_time_label_second)->Unit(benchmark::kSecond);
+
+ADD_CASES(TC_ConsoleOut, {{"^BM_time_label_second %console_s_report$"}});
+ADD_CASES(TC_JSONOut, {{"\"name\": \"BM_time_label_second\",$"},
+                       {"\"run_name\": \"BM_time_label_second\",$", MR_Next},
+                       {"\"run_type\": \"iteration\",$", MR_Next},
+                       {"\"repetitions\": 0,$", MR_Next},
+                       {"\"repetition_index\": 0,$", MR_Next},
+                       {"\"threads\": 1,$", MR_Next},
+                       {"\"iterations\": %int,$", MR_Next},
+                       {"\"real_time\": %float,$", MR_Next},
+                       {"\"cpu_time\": %float,$", MR_Next},
+                       {"\"time_unit\": \"s\"$", MR_Next},
+                       {"}", MR_Next}});
+ADD_CASES(TC_CSVOut, {{"^\"BM_time_label_second\",%csv_s_report$"}});
 
-std::string dec_re = "[0-9]+\\.[0-9]+";
+// ========================================================================= //
+// ------------------------ Testing Error Output --------------------------- //
+// ========================================================================= //
 
-}  // end namespace
+void BM_error(benchmark::State& state) {
+  state.SkipWithError("message");
+  for (auto _ : state) {
+  }
+}
+BENCHMARK(BM_error);
+ADD_CASES(TC_ConsoleOut, {{"^BM_error[ ]+ERROR OCCURRED: 'message'$"}});
+ADD_CASES(TC_JSONOut, {{"\"name\": \"BM_error\",$"},
+                       {"\"run_name\": \"BM_error\",$", MR_Next},
+                       {"\"run_type\": \"iteration\",$", MR_Next},
+                       {"\"repetitions\": 0,$", MR_Next},
+                       {"\"repetition_index\": 0,$", MR_Next},
+                       {"\"threads\": 1,$", MR_Next},
+                       {"\"error_occurred\": true,$", MR_Next},
+                       {"\"error_message\": \"message\",$", MR_Next}});
+
+ADD_CASES(TC_CSVOut, {{"^\"BM_error\",,,,,,,,true,\"message\"$"}});
 
 // ========================================================================= //
-// ---------------------- Testing Prologue Output -------------------------- //
+// ------------------------ Testing No Arg Name Output -----------------------
+// //
 // ========================================================================= //
 
-ADD_CASES(&ConsoleOutputTests, {
-    {join("^Benchmark", "Time", "CPU", "Iterations$"), MR_Next},
-    {"^[-]+$", MR_Next}
-});
-ADD_CASES(&CSVOutputTests, {
-  {"name,iterations,real_time,cpu_time,time_unit,bytes_per_second,items_per_second,"
-    "label,error_occurred,error_message"}
-});
+void BM_no_arg_name(benchmark::State& state) {
+  for (auto _ : state) {
+  }
+}
+BENCHMARK(BM_no_arg_name)->Arg(3);
+ADD_CASES(TC_ConsoleOut, {{"^BM_no_arg_name/3 %console_report$"}});
+ADD_CASES(TC_JSONOut, {{"\"name\": \"BM_no_arg_name/3\",$"},
+                       {"\"run_name\": \"BM_no_arg_name/3\",$", MR_Next},
+                       {"\"run_type\": \"iteration\",$", MR_Next},
+                       {"\"repetitions\": 0,$", MR_Next},
+                       {"\"repetition_index\": 0,$", MR_Next},
+                       {"\"threads\": 1,$", MR_Next}});
+ADD_CASES(TC_CSVOut, {{"^\"BM_no_arg_name/3\",%csv_report$"}});
 
 // ========================================================================= //
-// ------------------------ Testing Basic Output --------------------------- //
+// ------------------------ Testing Arg Name Output ----------------------- //
 // ========================================================================= //
 
-void BM_basic(benchmark::State& state) {
-  while (state.KeepRunning()) {}
+void BM_arg_name(benchmark::State& state) {
+  for (auto _ : state) {
+  }
 }
-BENCHMARK(BM_basic);
-
-ADD_CASES(&ConsoleOutputTests, {
-    {"^BM_basic[ ]+[0-9]{1,5} ns[ ]+[0-9]{1,5} ns[ ]+[0-9]+$"}
-});
-ADD_CASES(&JSONOutputTests, {
-    {"\"name\": \"BM_basic\",$"},
-    {"\"iterations\": [0-9]+,$", MR_Next},
-    {"\"real_time\": [0-9]{1,5},$", MR_Next},
-    {"\"cpu_time\": [0-9]{1,5},$", MR_Next},
-    {"\"time_unit\": \"ns\"$", MR_Next},
-    {"}", MR_Next}
-});
-ADD_CASES(&CSVOutputTests, {
-    {"^\"BM_basic\",[0-9]+," + dec_re + "," + dec_re + ",ns,,,,,$"}
-});
+BENCHMARK(BM_arg_name)->ArgName("first")->Arg(3);
+ADD_CASES(TC_ConsoleOut, {{"^BM_arg_name/first:3 %console_report$"}});
+ADD_CASES(TC_JSONOut, {{"\"name\": \"BM_arg_name/first:3\",$"},
+                       {"\"run_name\": \"BM_arg_name/first:3\",$", MR_Next},
+                       {"\"run_type\": \"iteration\",$", MR_Next},
+                       {"\"repetitions\": 0,$", MR_Next},
+                       {"\"repetition_index\": 0,$", MR_Next},
+                       {"\"threads\": 1,$", MR_Next}});
+ADD_CASES(TC_CSVOut, {{"^\"BM_arg_name/first:3\",%csv_report$"}});
 
 // ========================================================================= //
-// ------------------------ Testing Error Output --------------------------- //
+// ------------------------ Testing Arg Names Output ----------------------- //
 // ========================================================================= //
 
-void BM_error(benchmark::State& state) {
-    state.SkipWithError("message");
-    while(state.KeepRunning()) {}
+void BM_arg_names(benchmark::State& state) {
+  for (auto _ : state) {
+  }
 }
-BENCHMARK(BM_error);
-ADD_CASES(&ConsoleOutputTests, {
-    {"^BM_error[ ]+ERROR OCCURRED: 'message'$"}
-});
-ADD_CASES(&JSONOutputTests, {
-    {"\"name\": \"BM_error\",$"},
-    {"\"error_occurred\": true,$", MR_Next},
-    {"\"error_message\": \"message\",$", MR_Next}
-});
+BENCHMARK(BM_arg_names)->Args({2, 5, 4})->ArgNames({"first", "", "third"});
+ADD_CASES(TC_ConsoleOut,
+          {{"^BM_arg_names/first:2/5/third:4 %console_report$"}});
+ADD_CASES(TC_JSONOut,
+          {{"\"name\": \"BM_arg_names/first:2/5/third:4\",$"},
+           {"\"run_name\": \"BM_arg_names/first:2/5/third:4\",$", MR_Next},
+           {"\"run_type\": \"iteration\",$", MR_Next},
+           {"\"repetitions\": 0,$", MR_Next},
+           {"\"repetition_index\": 0,$", MR_Next},
+           {"\"threads\": 1,$", MR_Next}});
+ADD_CASES(TC_CSVOut, {{"^\"BM_arg_names/first:2/5/third:4\",%csv_report$"}});
 
-ADD_CASES(&CSVOutputTests, {
-    {"^\"BM_error\",,,,,,,,true,\"message\"$"}
-});
+// ========================================================================= //
+// ------------------------ Testing Big Args Output ------------------------ //
+// ========================================================================= //
 
+void BM_BigArgs(benchmark::State& state) {
+  for (auto _ : state) {
+  }
+}
+BENCHMARK(BM_BigArgs)->RangeMultiplier(2)->Range(1U << 30U, 1U << 31U);
+ADD_CASES(TC_ConsoleOut, {{"^BM_BigArgs/1073741824 %console_report$"},
+                          {"^BM_BigArgs/2147483648 %console_report$"}});
 
 // ========================================================================= //
 // ----------------------- Testing Complexity Output ----------------------- //
 // ========================================================================= //
 
 void BM_Complexity_O1(benchmark::State& state) {
-  while (state.KeepRunning()) {
+  for (auto _ : state) {
+    // This test requires a non-zero CPU time to avoid divide-by-zero
+    benchmark::DoNotOptimize(state.iterations());
   }
-  state.SetComplexityN(state.range_x());
+  state.SetComplexityN(state.range(0));
 }
-BENCHMARK(BM_Complexity_O1)->Range(1, 1<<18)->Complexity(benchmark::o1);
-
-std::string bigOStr = "[0-9]+\\.[0-9]+ \\([0-9]+\\)";
+BENCHMARK(BM_Complexity_O1)->Range(1, 1 << 18)->Complexity(benchmark::o1);
+SET_SUBSTITUTIONS({{"%bigOStr", "[ ]* %float \\([0-9]+\\)"},
+                   {"%RMS", "[ ]*[0-9]+ %"}});
+ADD_CASES(TC_ConsoleOut, {{"^BM_Complexity_O1_BigO %bigOStr %bigOStr[ ]*$"},
+                          {"^BM_Complexity_O1_RMS %RMS %RMS[ ]*$"}});
 
-ADD_CASES(&ConsoleOutputTests, {
-   {join("^BM_Complexity_O1_BigO", bigOStr, bigOStr) + "[ ]*$"},
-   {join("^BM_Complexity_O1_RMS", "[0-9]+ %", "[0-9]+ %") + "[ ]*$"}
-});
+// ========================================================================= //
+// ----------------------- Testing Aggregate Output ------------------------ //
+// ========================================================================= //
 
+// Test that non-aggregate data is printed by default
+void BM_Repeat(benchmark::State& state) {
+  for (auto _ : state) {
+  }
+}
+// need two repetitions min to be able to output any aggregate output
+BENCHMARK(BM_Repeat)->Repetitions(2);
+ADD_CASES(TC_ConsoleOut,
+          {{"^BM_Repeat/repeats:2 %console_report$"},
+           {"^BM_Repeat/repeats:2 %console_report$"},
+           {"^BM_Repeat/repeats:2_mean %console_time_only_report [ ]*2$"},
+           {"^BM_Repeat/repeats:2_median %console_time_only_report [ ]*2$"},
+           {"^BM_Repeat/repeats:2_stddev %console_time_only_report [ ]*2$"}});
+ADD_CASES(TC_JSONOut, {{"\"name\": \"BM_Repeat/repeats:2\",$"},
+                       {"\"run_name\": \"BM_Repeat/repeats:2\",$", MR_Next},
+                       {"\"run_type\": \"iteration\",$", MR_Next},
+                       {"\"repetitions\": 2,$", MR_Next},
+                       {"\"repetition_index\": 0,$", MR_Next},
+                       {"\"threads\": 1,$", MR_Next},
+                       {"\"name\": \"BM_Repeat/repeats:2\",$"},
+                       {"\"run_name\": \"BM_Repeat/repeats:2\",$", MR_Next},
+                       {"\"run_type\": \"iteration\",$", MR_Next},
+                       {"\"repetitions\": 2,$", MR_Next},
+                       {"\"repetition_index\": 1,$", MR_Next},
+                       {"\"threads\": 1,$", MR_Next},
+                       {"\"name\": \"BM_Repeat/repeats:2_mean\",$"},
+                       {"\"run_name\": \"BM_Repeat/repeats:2\",$", MR_Next},
+                       {"\"run_type\": \"aggregate\",$", MR_Next},
+                       {"\"repetitions\": 2,$", MR_Next},
+                       {"\"threads\": 1,$", MR_Next},
+                       {"\"aggregate_name\": \"mean\",$", MR_Next},
+                       {"\"iterations\": 2,$", MR_Next},
+                       {"\"name\": \"BM_Repeat/repeats:2_median\",$"},
+                       {"\"run_name\": \"BM_Repeat/repeats:2\",$", MR_Next},
+                       {"\"run_type\": \"aggregate\",$", MR_Next},
+                       {"\"repetitions\": 2,$", MR_Next},
+                       {"\"threads\": 1,$", MR_Next},
+                       {"\"aggregate_name\": \"median\",$", MR_Next},
+                       {"\"iterations\": 2,$", MR_Next},
+                       {"\"name\": \"BM_Repeat/repeats:2_stddev\",$"},
+                       {"\"run_name\": \"BM_Repeat/repeats:2\",$", MR_Next},
+                       {"\"run_type\": \"aggregate\",$", MR_Next},
+                       {"\"repetitions\": 2,$", MR_Next},
+                       {"\"threads\": 1,$", MR_Next},
+                       {"\"aggregate_name\": \"stddev\",$", MR_Next},
+                       {"\"iterations\": 2,$", MR_Next}});
+ADD_CASES(TC_CSVOut, {{"^\"BM_Repeat/repeats:2\",%csv_report$"},
+                      {"^\"BM_Repeat/repeats:2\",%csv_report$"},
+                      {"^\"BM_Repeat/repeats:2_mean\",%csv_report$"},
+                      {"^\"BM_Repeat/repeats:2_median\",%csv_report$"},
+                      {"^\"BM_Repeat/repeats:2_stddev\",%csv_report$"}});
+// but for two repetitions, mean and median are the same, so let's repeat..
+BENCHMARK(BM_Repeat)->Repetitions(3);
+ADD_CASES(TC_ConsoleOut,
+          {{"^BM_Repeat/repeats:3 %console_report$"},
+           {"^BM_Repeat/repeats:3 %console_report$"},
+           {"^BM_Repeat/repeats:3 %console_report$"},
+           {"^BM_Repeat/repeats:3_mean %console_time_only_report [ ]*3$"},
+           {"^BM_Repeat/repeats:3_median %console_time_only_report [ ]*3$"},
+           {"^BM_Repeat/repeats:3_stddev %console_time_only_report [ ]*3$"}});
+ADD_CASES(TC_JSONOut, {{"\"name\": \"BM_Repeat/repeats:3\",$"},
+                       {"\"run_name\": \"BM_Repeat/repeats:3\",$", MR_Next},
+                       {"\"run_type\": \"iteration\",$", MR_Next},
+                       {"\"repetitions\": 3,$", MR_Next},
+                       {"\"repetition_index\": 0,$", MR_Next},
+                       {"\"threads\": 1,$", MR_Next},
+                       {"\"name\": \"BM_Repeat/repeats:3\",$"},
+                       {"\"run_name\": \"BM_Repeat/repeats:3\",$", MR_Next},
+                       {"\"run_type\": \"iteration\",$", MR_Next},
+                       {"\"repetitions\": 3,$", MR_Next},
+                       {"\"repetition_index\": 1,$", MR_Next},
+                       {"\"threads\": 1,$", MR_Next},
+                       {"\"name\": \"BM_Repeat/repeats:3\",$"},
+                       {"\"run_name\": \"BM_Repeat/repeats:3\",$", MR_Next},
+                       {"\"run_type\": \"iteration\",$", MR_Next},
+                       {"\"repetitions\": 3,$", MR_Next},
+                       {"\"repetition_index\": 2,$", MR_Next},
+                       {"\"threads\": 1,$", MR_Next},
+                       {"\"name\": \"BM_Repeat/repeats:3_mean\",$"},
+                       {"\"run_name\": \"BM_Repeat/repeats:3\",$", MR_Next},
+                       {"\"run_type\": \"aggregate\",$", MR_Next},
+                       {"\"repetitions\": 3,$", MR_Next},
+                       {"\"threads\": 1,$", MR_Next},
+                       {"\"aggregate_name\": \"mean\",$", MR_Next},
+                       {"\"iterations\": 3,$", MR_Next},
+                       {"\"name\": \"BM_Repeat/repeats:3_median\",$"},
+                       {"\"run_name\": \"BM_Repeat/repeats:3\",$", MR_Next},
+                       {"\"run_type\": \"aggregate\",$", MR_Next},
+                       {"\"repetitions\": 3,$", MR_Next},
+                       {"\"threads\": 1,$", MR_Next},
+                       {"\"aggregate_name\": \"median\",$", MR_Next},
+                       {"\"iterations\": 3,$", MR_Next},
+                       {"\"name\": \"BM_Repeat/repeats:3_stddev\",$"},
+                       {"\"run_name\": \"BM_Repeat/repeats:3\",$", MR_Next},
+                       {"\"run_type\": \"aggregate\",$", MR_Next},
+                       {"\"repetitions\": 3,$", MR_Next},
+                       {"\"threads\": 1,$", MR_Next},
+                       {"\"aggregate_name\": \"stddev\",$", MR_Next},
+                       {"\"iterations\": 3,$", MR_Next}});
+ADD_CASES(TC_CSVOut, {{"^\"BM_Repeat/repeats:3\",%csv_report$"},
+                      {"^\"BM_Repeat/repeats:3\",%csv_report$"},
+                      {"^\"BM_Repeat/repeats:3\",%csv_report$"},
+                      {"^\"BM_Repeat/repeats:3_mean\",%csv_report$"},
+                      {"^\"BM_Repeat/repeats:3_median\",%csv_report$"},
+                      {"^\"BM_Repeat/repeats:3_stddev\",%csv_report$"}});
+// median differs between even/odd number of repetitions, so just to be sure
+BENCHMARK(BM_Repeat)->Repetitions(4);
+ADD_CASES(TC_ConsoleOut,
+          {{"^BM_Repeat/repeats:4 %console_report$"},
+           {"^BM_Repeat/repeats:4 %console_report$"},
+           {"^BM_Repeat/repeats:4 %console_report$"},
+           {"^BM_Repeat/repeats:4 %console_report$"},
+           {"^BM_Repeat/repeats:4_mean %console_time_only_report [ ]*4$"},
+           {"^BM_Repeat/repeats:4_median %console_time_only_report [ ]*4$"},
+           {"^BM_Repeat/repeats:4_stddev %console_time_only_report [ ]*4$"}});
+ADD_CASES(TC_JSONOut, {{"\"name\": \"BM_Repeat/repeats:4\",$"},
+                       {"\"run_name\": \"BM_Repeat/repeats:4\",$", MR_Next},
+                       {"\"run_type\": \"iteration\",$", MR_Next},
+                       {"\"repetitions\": 4,$", MR_Next},
+                       {"\"repetition_index\": 0,$", MR_Next},
+                       {"\"threads\": 1,$", MR_Next},
+                       {"\"name\": \"BM_Repeat/repeats:4\",$"},
+                       {"\"run_name\": \"BM_Repeat/repeats:4\",$", MR_Next},
+                       {"\"run_type\": \"iteration\",$", MR_Next},
+                       {"\"repetitions\": 4,$", MR_Next},
+                       {"\"repetition_index\": 1,$", MR_Next},
+                       {"\"threads\": 1,$", MR_Next},
+                       {"\"name\": \"BM_Repeat/repeats:4\",$"},
+                       {"\"run_name\": \"BM_Repeat/repeats:4\",$", MR_Next},
+                       {"\"run_type\": \"iteration\",$", MR_Next},
+                       {"\"repetitions\": 4,$", MR_Next},
+                       {"\"repetition_index\": 2,$", MR_Next},
+                       {"\"threads\": 1,$", MR_Next},
+                       {"\"name\": \"BM_Repeat/repeats:4\",$"},
+                       {"\"run_name\": \"BM_Repeat/repeats:4\",$", MR_Next},
+                       {"\"run_type\": \"iteration\",$", MR_Next},
+                       {"\"repetitions\": 4,$", MR_Next},
+                       {"\"repetition_index\": 3,$", MR_Next},
+                       {"\"threads\": 1,$", MR_Next},
+                       {"\"name\": \"BM_Repeat/repeats:4_mean\",$"},
+                       {"\"run_name\": \"BM_Repeat/repeats:4\",$", MR_Next},
+                       {"\"run_type\": \"aggregate\",$", MR_Next},
+                       {"\"repetitions\": 4,$", MR_Next},
+                       {"\"threads\": 1,$", MR_Next},
+                       {"\"aggregate_name\": \"mean\",$", MR_Next},
+                       {"\"iterations\": 4,$", MR_Next},
+                       {"\"name\": \"BM_Repeat/repeats:4_median\",$"},
+                       {"\"run_name\": \"BM_Repeat/repeats:4\",$", MR_Next},
+                       {"\"run_type\": \"aggregate\",$", MR_Next},
+                       {"\"repetitions\": 4,$", MR_Next},
+                       {"\"threads\": 1,$", MR_Next},
+                       {"\"aggregate_name\": \"median\",$", MR_Next},
+                       {"\"iterations\": 4,$", MR_Next},
+                       {"\"name\": \"BM_Repeat/repeats:4_stddev\",$"},
+                       {"\"run_name\": \"BM_Repeat/repeats:4\",$", MR_Next},
+                       {"\"run_type\": \"aggregate\",$", MR_Next},
+                       {"\"repetitions\": 4,$", MR_Next},
+                       {"\"threads\": 1,$", MR_Next},
+                       {"\"aggregate_name\": \"stddev\",$", MR_Next},
+                       {"\"iterations\": 4,$", MR_Next}});
+ADD_CASES(TC_CSVOut, {{"^\"BM_Repeat/repeats:4\",%csv_report$"},
+                      {"^\"BM_Repeat/repeats:4\",%csv_report$"},
+                      {"^\"BM_Repeat/repeats:4\",%csv_report$"},
+                      {"^\"BM_Repeat/repeats:4\",%csv_report$"},
+                      {"^\"BM_Repeat/repeats:4_mean\",%csv_report$"},
+                      {"^\"BM_Repeat/repeats:4_median\",%csv_report$"},
+                      {"^\"BM_Repeat/repeats:4_stddev\",%csv_report$"}});
+
+// Test that a non-repeated test still prints non-aggregate results even when
+// only-aggregate reports have been requested
+void BM_RepeatOnce(benchmark::State& state) {
+  for (auto _ : state) {
+  }
+}
+BENCHMARK(BM_RepeatOnce)->Repetitions(1)->ReportAggregatesOnly();
+ADD_CASES(TC_ConsoleOut, {{"^BM_RepeatOnce/repeats:1 %console_report$"}});
+ADD_CASES(TC_JSONOut, {{"\"name\": \"BM_RepeatOnce/repeats:1\",$"},
+                       {"\"run_name\": \"BM_RepeatOnce/repeats:1\",$", MR_Next},
+                       {"\"run_type\": \"iteration\",$", MR_Next},
+                       {"\"repetitions\": 1,$", MR_Next},
+                       {"\"repetition_index\": 0,$", MR_Next},
+                       {"\"threads\": 1,$", MR_Next}});
+ADD_CASES(TC_CSVOut, {{"^\"BM_RepeatOnce/repeats:1\",%csv_report$"}});
+
+// Test that non-aggregate data is not reported
+void BM_SummaryRepeat(benchmark::State& state) {
+  for (auto _ : state) {
+  }
+}
+BENCHMARK(BM_SummaryRepeat)->Repetitions(3)->ReportAggregatesOnly();
+ADD_CASES(
+    TC_ConsoleOut,
+    {{".*BM_SummaryRepeat/repeats:3 ", MR_Not},
+     {"^BM_SummaryRepeat/repeats:3_mean %console_time_only_report [ ]*3$"},
+     {"^BM_SummaryRepeat/repeats:3_median %console_time_only_report [ ]*3$"},
+     {"^BM_SummaryRepeat/repeats:3_stddev %console_time_only_report [ ]*3$"}});
+ADD_CASES(TC_JSONOut,
+          {{".*BM_SummaryRepeat/repeats:3 ", MR_Not},
+           {"\"name\": \"BM_SummaryRepeat/repeats:3_mean\",$"},
+           {"\"run_name\": \"BM_SummaryRepeat/repeats:3\",$", MR_Next},
+           {"\"run_type\": \"aggregate\",$", MR_Next},
+           {"\"repetitions\": 3,$", MR_Next},
+           {"\"threads\": 1,$", MR_Next},
+           {"\"aggregate_name\": \"mean\",$", MR_Next},
+           {"\"iterations\": 3,$", MR_Next},
+           {"\"name\": \"BM_SummaryRepeat/repeats:3_median\",$"},
+           {"\"run_name\": \"BM_SummaryRepeat/repeats:3\",$", MR_Next},
+           {"\"run_type\": \"aggregate\",$", MR_Next},
+           {"\"repetitions\": 3,$", MR_Next},
+           {"\"threads\": 1,$", MR_Next},
+           {"\"aggregate_name\": \"median\",$", MR_Next},
+           {"\"iterations\": 3,$", MR_Next},
+           {"\"name\": \"BM_SummaryRepeat/repeats:3_stddev\",$"},
+           {"\"run_name\": \"BM_SummaryRepeat/repeats:3\",$", MR_Next},
+           {"\"run_type\": \"aggregate\",$", MR_Next},
+           {"\"repetitions\": 3,$", MR_Next},
+           {"\"threads\": 1,$", MR_Next},
+           {"\"aggregate_name\": \"stddev\",$", MR_Next},
+           {"\"iterations\": 3,$", MR_Next}});
+ADD_CASES(TC_CSVOut, {{".*BM_SummaryRepeat/repeats:3 ", MR_Not},
+                      {"^\"BM_SummaryRepeat/repeats:3_mean\",%csv_report$"},
+                      {"^\"BM_SummaryRepeat/repeats:3_median\",%csv_report$"},
+                      {"^\"BM_SummaryRepeat/repeats:3_stddev\",%csv_report$"}});
+
+// Test that non-aggregate data is not displayed.
+// NOTE: this test is incomplete: it only checks the display output and
+//       does not verify that the file output still contains everything.
+void BM_SummaryDisplay(benchmark::State& state) {
+  for (auto _ : state) {
+  }
+}
+BENCHMARK(BM_SummaryDisplay)->Repetitions(2)->DisplayAggregatesOnly();
+ADD_CASES(
+    TC_ConsoleOut,
+    {{".*BM_SummaryDisplay/repeats:2 ", MR_Not},
+     {"^BM_SummaryDisplay/repeats:2_mean %console_time_only_report [ ]*2$"},
+     {"^BM_SummaryDisplay/repeats:2_median %console_time_only_report [ ]*2$"},
+     {"^BM_SummaryDisplay/repeats:2_stddev %console_time_only_report [ ]*2$"}});
+ADD_CASES(TC_JSONOut,
+          {{".*BM_SummaryDisplay/repeats:2 ", MR_Not},
+           {"\"name\": \"BM_SummaryDisplay/repeats:2_mean\",$"},
+           {"\"run_name\": \"BM_SummaryDisplay/repeats:2\",$", MR_Next},
+           {"\"run_type\": \"aggregate\",$", MR_Next},
+           {"\"repetitions\": 2,$", MR_Next},
+           {"\"threads\": 1,$", MR_Next},
+           {"\"aggregate_name\": \"mean\",$", MR_Next},
+           {"\"iterations\": 2,$", MR_Next},
+           {"\"name\": \"BM_SummaryDisplay/repeats:2_median\",$"},
+           {"\"run_name\": \"BM_SummaryDisplay/repeats:2\",$", MR_Next},
+           {"\"run_type\": \"aggregate\",$", MR_Next},
+           {"\"repetitions\": 2,$", MR_Next},
+           {"\"threads\": 1,$", MR_Next},
+           {"\"aggregate_name\": \"median\",$", MR_Next},
+           {"\"iterations\": 2,$", MR_Next},
+           {"\"name\": \"BM_SummaryDisplay/repeats:2_stddev\",$"},
+           {"\"run_name\": \"BM_SummaryDisplay/repeats:2\",$", MR_Next},
+           {"\"run_type\": \"aggregate\",$", MR_Next},
+           {"\"repetitions\": 2,$", MR_Next},
+           {"\"threads\": 1,$", MR_Next},
+           {"\"aggregate_name\": \"stddev\",$", MR_Next},
+           {"\"iterations\": 2,$", MR_Next}});
+ADD_CASES(TC_CSVOut,
+          {{".*BM_SummaryDisplay/repeats:2 ", MR_Not},
+           {"^\"BM_SummaryDisplay/repeats:2_mean\",%csv_report$"},
+           {"^\"BM_SummaryDisplay/repeats:2_median\",%csv_report$"},
+           {"^\"BM_SummaryDisplay/repeats:2_stddev\",%csv_report$"}});
+
+// Test repeats with custom time unit.
+void BM_RepeatTimeUnit(benchmark::State& state) {
+  for (auto _ : state) {
+  }
+}
+BENCHMARK(BM_RepeatTimeUnit)
+    ->Repetitions(3)
+    ->ReportAggregatesOnly()
+    ->Unit(benchmark::kMicrosecond);
+ADD_CASES(
+    TC_ConsoleOut,
+    {{".*BM_RepeatTimeUnit/repeats:3 ", MR_Not},
+     {"^BM_RepeatTimeUnit/repeats:3_mean %console_us_time_only_report [ ]*3$"},
+     {"^BM_RepeatTimeUnit/repeats:3_median %console_us_time_only_report [ "
+      "]*3$"},
+     {"^BM_RepeatTimeUnit/repeats:3_stddev %console_us_time_only_report [ "
+      "]*3$"}});
+ADD_CASES(TC_JSONOut,
+          {{".*BM_RepeatTimeUnit/repeats:3 ", MR_Not},
+           {"\"name\": \"BM_RepeatTimeUnit/repeats:3_mean\",$"},
+           {"\"run_name\": \"BM_RepeatTimeUnit/repeats:3\",$", MR_Next},
+           {"\"run_type\": \"aggregate\",$", MR_Next},
+           {"\"repetitions\": 3,$", MR_Next},
+           {"\"threads\": 1,$", MR_Next},
+           {"\"aggregate_name\": \"mean\",$", MR_Next},
+           {"\"iterations\": 3,$", MR_Next},
+           {"\"time_unit\": \"us\",?$"},
+           {"\"name\": \"BM_RepeatTimeUnit/repeats:3_median\",$"},
+           {"\"run_name\": \"BM_RepeatTimeUnit/repeats:3\",$", MR_Next},
+           {"\"run_type\": \"aggregate\",$", MR_Next},
+           {"\"repetitions\": 3,$", MR_Next},
+           {"\"threads\": 1,$", MR_Next},
+           {"\"aggregate_name\": \"median\",$", MR_Next},
+           {"\"iterations\": 3,$", MR_Next},
+           {"\"time_unit\": \"us\",?$"},
+           {"\"name\": \"BM_RepeatTimeUnit/repeats:3_stddev\",$"},
+           {"\"run_name\": \"BM_RepeatTimeUnit/repeats:3\",$", MR_Next},
+           {"\"run_type\": \"aggregate\",$", MR_Next},
+           {"\"repetitions\": 3,$", MR_Next},
+           {"\"threads\": 1,$", MR_Next},
+           {"\"aggregate_name\": \"stddev\",$", MR_Next},
+           {"\"iterations\": 3,$", MR_Next},
+           {"\"time_unit\": \"us\",?$"}});
+ADD_CASES(TC_CSVOut,
+          {{".*BM_RepeatTimeUnit/repeats:3 ", MR_Not},
+           {"^\"BM_RepeatTimeUnit/repeats:3_mean\",%csv_us_report$"},
+           {"^\"BM_RepeatTimeUnit/repeats:3_median\",%csv_us_report$"},
+           {"^\"BM_RepeatTimeUnit/repeats:3_stddev\",%csv_us_report$"}});
 
 // ========================================================================= //
-// --------------------------- TEST CASES END ------------------------------ //
+// -------------------- Testing user-provided statistics ------------------- //
 // ========================================================================= //
 
+const auto UserStatistics = [](const std::vector<double>& v) {
+  return v.back();
+};
+void BM_UserStats(benchmark::State& state) {
+  for (auto _ : state) {
+    state.SetIterationTime(150 / 10e8);
+  }
+}
+// clang-format off
+BENCHMARK(BM_UserStats)
+  ->Repetitions(3)
+  ->Iterations(5)
+  ->UseManualTime()
+  ->ComputeStatistics("", UserStatistics);
+// clang-format on
+
+// Check that the user-provided statistic is computed and reported after the
+// default ones. The empty name is intentional: it would sort before all else.
+ADD_CASES(TC_ConsoleOut, {{"^BM_UserStats/iterations:5/repeats:3/manual_time [ "
+                           "]* 150 ns %time [ ]*5$"},
+                          {"^BM_UserStats/iterations:5/repeats:3/manual_time [ "
+                           "]* 150 ns %time [ ]*5$"},
+                          {"^BM_UserStats/iterations:5/repeats:3/manual_time [ "
+                           "]* 150 ns %time [ ]*5$"},
+                          {"^BM_UserStats/iterations:5/repeats:3/"
+                           "manual_time_mean [ ]* 150 ns %time [ ]*3$"},
+                          {"^BM_UserStats/iterations:5/repeats:3/"
+                           "manual_time_median [ ]* 150 ns %time [ ]*3$"},
+                          {"^BM_UserStats/iterations:5/repeats:3/"
+                           "manual_time_stddev [ ]* 0.000 ns %time [ ]*3$"},
+                          {"^BM_UserStats/iterations:5/repeats:3/manual_time_ "
+                           "[ ]* 150 ns %time [ ]*3$"}});
+ADD_CASES(
+    TC_JSONOut,
+    {{"\"name\": \"BM_UserStats/iterations:5/repeats:3/manual_time\",$"},
+     {"\"run_name\": \"BM_UserStats/iterations:5/repeats:3/manual_time\",$",
+      MR_Next},
+     {"\"run_type\": \"iteration\",$", MR_Next},
+     {"\"repetitions\": 3,$", MR_Next},
+     {"\"repetition_index\": 0,$", MR_Next},
+     {"\"threads\": 1,$", MR_Next},
+     {"\"iterations\": 5,$", MR_Next},
+     {"\"real_time\": 1\\.5(0)*e\\+(0)*2,$", MR_Next},
+     {"\"name\": \"BM_UserStats/iterations:5/repeats:3/manual_time\",$"},
+     {"\"run_name\": \"BM_UserStats/iterations:5/repeats:3/manual_time\",$",
+      MR_Next},
+     {"\"run_type\": \"iteration\",$", MR_Next},
+     {"\"repetitions\": 3,$", MR_Next},
+     {"\"repetition_index\": 1,$", MR_Next},
+     {"\"threads\": 1,$", MR_Next},
+     {"\"iterations\": 5,$", MR_Next},
+     {"\"real_time\": 1\\.5(0)*e\\+(0)*2,$", MR_Next},
+     {"\"name\": \"BM_UserStats/iterations:5/repeats:3/manual_time\",$"},
+     {"\"run_name\": \"BM_UserStats/iterations:5/repeats:3/manual_time\",$",
+      MR_Next},
+     {"\"run_type\": \"iteration\",$", MR_Next},
+     {"\"repetitions\": 3,$", MR_Next},
+     {"\"repetition_index\": 2,$", MR_Next},
+     {"\"threads\": 1,$", MR_Next},
+     {"\"iterations\": 5,$", MR_Next},
+     {"\"real_time\": 1\\.5(0)*e\\+(0)*2,$", MR_Next},
+     {"\"name\": \"BM_UserStats/iterations:5/repeats:3/manual_time_mean\",$"},
+     {"\"run_name\": \"BM_UserStats/iterations:5/repeats:3/manual_time\",$",
+      MR_Next},
+     {"\"run_type\": \"aggregate\",$", MR_Next},
+     {"\"repetitions\": 3,$", MR_Next},
+     {"\"threads\": 1,$", MR_Next},
+     {"\"aggregate_name\": \"mean\",$", MR_Next},
+     {"\"iterations\": 3,$", MR_Next},
+     {"\"real_time\": 1\\.5(0)*e\\+(0)*2,$", MR_Next},
+     {"\"name\": \"BM_UserStats/iterations:5/repeats:3/manual_time_median\",$"},
+     {"\"run_name\": \"BM_UserStats/iterations:5/repeats:3/manual_time\",$",
+      MR_Next},
+     {"\"run_type\": \"aggregate\",$", MR_Next},
+     {"\"repetitions\": 3,$", MR_Next},
+     {"\"threads\": 1,$", MR_Next},
+     {"\"aggregate_name\": \"median\",$", MR_Next},
+     {"\"iterations\": 3,$", MR_Next},
+     {"\"real_time\": 1\\.5(0)*e\\+(0)*2,$", MR_Next},
+     {"\"name\": \"BM_UserStats/iterations:5/repeats:3/manual_time_stddev\",$"},
+     {"\"run_name\": \"BM_UserStats/iterations:5/repeats:3/manual_time\",$",
+      MR_Next},
+     {"\"run_type\": \"aggregate\",$", MR_Next},
+     {"\"repetitions\": 3,$", MR_Next},
+     {"\"threads\": 1,$", MR_Next},
+     {"\"aggregate_name\": \"stddev\",$", MR_Next},
+     {"\"iterations\": 3,$", MR_Next},
+     {"\"real_time\": %float,$", MR_Next},
+     {"\"name\": \"BM_UserStats/iterations:5/repeats:3/manual_time_\",$"},
+     {"\"run_name\": \"BM_UserStats/iterations:5/repeats:3/manual_time\",$",
+      MR_Next},
+     {"\"run_type\": \"aggregate\",$", MR_Next},
+     {"\"repetitions\": 3,$", MR_Next},
+     {"\"threads\": 1,$", MR_Next},
+     {"\"aggregate_name\": \"\",$", MR_Next},
+     {"\"iterations\": 3,$", MR_Next},
+     {"\"real_time\": 1\\.5(0)*e\\+(0)*2,$", MR_Next}});
+ADD_CASES(
+    TC_CSVOut,
+    {{"^\"BM_UserStats/iterations:5/repeats:3/manual_time\",%csv_report$"},
+     {"^\"BM_UserStats/iterations:5/repeats:3/manual_time\",%csv_report$"},
+     {"^\"BM_UserStats/iterations:5/repeats:3/manual_time\",%csv_report$"},
+     {"^\"BM_UserStats/iterations:5/repeats:3/manual_time_mean\",%csv_report$"},
+     {"^\"BM_UserStats/iterations:5/repeats:3/"
+      "manual_time_median\",%csv_report$"},
+     {"^\"BM_UserStats/iterations:5/repeats:3/"
+      "manual_time_stddev\",%csv_report$"},
+     {"^\"BM_UserStats/iterations:5/repeats:3/manual_time_\",%csv_report$"}});
 
-int main(int argc, char* argv[]) {
-  // Add --color_print=false to argv since we don't want to match color codes.
-  char new_arg[64];
-  char* new_argv[64];
-  std::copy(argv, argv + argc, new_argv);
-  new_argv[argc++] = std::strcpy(new_arg, "--color_print=false");
-  benchmark::Initialize(&argc, new_argv);
-
-  benchmark::ConsoleReporter CR;
-  benchmark::JSONReporter JR;
-  benchmark::CSVReporter CSVR;
-  struct ReporterTest {
-    const char* name;
-    std::vector<TestCase>& output_cases;
-    std::vector<TestCase>& error_cases;
-    benchmark::BenchmarkReporter& reporter;
-    std::stringstream out_stream;
-    std::stringstream err_stream;
-
-    ReporterTest(const char* n,
-                 std::vector<TestCase>& out_tc,
-                 std::vector<TestCase>& err_tc,
-                 benchmark::BenchmarkReporter& br)
-        : name(n), output_cases(out_tc), error_cases(err_tc), reporter(br) {
-        reporter.SetOutputStream(&out_stream);
-        reporter.SetErrorStream(&err_stream);
-    }
-  } TestCases[] = {
-      {"ConsoleReporter", ConsoleOutputTests, ConsoleErrorTests, CR},
-      {"JSONReporter", JSONOutputTests, JSONErrorTests, JR},
-      {"CSVReporter", CSVOutputTests, CSVErrorTests, CSVR}
-  };
-
-  // Create the test reporter and run the benchmarks.
-  std::cout << "Running benchmarks...\n";
-  TestReporter test_rep({&CR, &JR, &CSVR});
-  benchmark::RunSpecifiedBenchmarks(&test_rep);
-
-  for (auto& rep_test : TestCases) {
-      std::string msg = std::string("\nTesting ") + rep_test.name + " Output\n";
-      std::string banner(msg.size() - 1, '-');
-      std::cout << banner << msg << banner << "\n";
-
-      std::cerr << rep_test.err_stream.str();
-      std::cout << rep_test.out_stream.str();
-
-      for (const auto& TC : rep_test.error_cases)
-        TC.Check(rep_test.err_stream);
-      for (const auto& TC : rep_test.output_cases)
-        TC.Check(rep_test.out_stream);
-
-      std::cout << "\n";
+// ========================================================================= //
+// ------------------------- Testing StrEscape JSON ------------------------ //
+// ========================================================================= //
+#if 0  // enable when csv testing code correctly handles multi-line fields
+void BM_JSON_Format(benchmark::State& state) {
+  state.SkipWithError("val\b\f\n\r\t\\\"with\"es,capes");
+  for (auto _ : state) {
+  }
+}
+BENCHMARK(BM_JSON_Format);
+ADD_CASES(TC_JSONOut, {{"\"name\": \"BM_JSON_Format\",$"},
+                       {"\"run_name\": \"BM_JSON_Format\",$", MR_Next},
+                       {"\"run_type\": \"iteration\",$", MR_Next},
+                       {"\"repetitions\": 0,$", MR_Next},
+                       {"\"repetition_index\": 0,$", MR_Next},
+                       {"\"threads\": 1,$", MR_Next},
+                       {"\"error_occurred\": true,$", MR_Next},
+                       {R"("error_message": "val\\b\\f\\n\\r\\t\\\\\\"with\\"es,capes",$)", MR_Next}});
+#endif
+// ========================================================================= //
+// -------------------------- Testing CsvEscape ---------------------------- //
+// ========================================================================= //
+
+void BM_CSV_Format(benchmark::State& state) {
+  state.SkipWithError("\"freedom\"");
+  for (auto _ : state) {
   }
-  return 0;
 }
+BENCHMARK(BM_CSV_Format);
+ADD_CASES(TC_CSVOut, {{"^\"BM_CSV_Format\",,,,,,,,true,\"\"\"freedom\"\"\"$"}});
+
+// ========================================================================= //
+// --------------------------- TEST CASES END ------------------------------ //
+// ========================================================================= //
+
+int main(int argc, char* argv[]) { RunOutputTests(argc, argv); }
diff --git a/test/skip_with_error_test.cc b/test/skip_with_error_test.cc
index dafbd64..97a2e3c 100644
--- a/test/skip_with_error_test.cc
+++ b/test/skip_with_error_test.cc
@@ -1,10 +1,11 @@
 
 #undef NDEBUG
-#include "benchmark/benchmark.h"
-#include "../src/check.h" // NOTE: check.h is for internal use only!
 #include <cassert>
 #include <vector>
 
+#include "../src/check.h"  // NOTE: check.h is for internal use only!
+#include "benchmark/benchmark.h"
+
 namespace {
 
 class TestReporter : public benchmark::ConsoleReporter {
@@ -18,7 +19,7 @@ class TestReporter : public benchmark::ConsoleReporter {
     ConsoleReporter::ReportRuns(report);
   }
 
-  TestReporter()  {}
+  TestReporter() {}
   virtual ~TestReporter() {}
 
   mutable std::vector<Run> all_runs_;
@@ -32,11 +33,12 @@ struct TestCase {
   typedef benchmark::BenchmarkReporter::Run Run;
 
   void CheckRun(Run const& run) const {
-    CHECK(name == run.benchmark_name) << "expected " << name << " got " << run.benchmark_name;
+    CHECK(name == run.benchmark_name())
+        << "expected " << name << " got " << run.benchmark_name();
     CHECK(error_occurred == run.error_occurred);
     CHECK(error_message == run.error_message);
     if (error_occurred) {
-      //CHECK(run.iterations == 0);
+      // CHECK(run.iterations == 0);
     } else {
       CHECK(run.iterations != 0);
     }
@@ -55,11 +57,15 @@ int AddCases(const char* base_name, std::initializer_list<TestCase> const& v) {
 
 #define CONCAT(x, y) CONCAT2(x, y)
 #define CONCAT2(x, y) x##y
-#define ADD_CASES(...) \
-int CONCAT(dummy, __LINE__) = AddCases(__VA_ARGS__)
+#define ADD_CASES(...) int CONCAT(dummy, __LINE__) = AddCases(__VA_ARGS__)
 
 }  // end namespace
 
+void BM_error_no_running(benchmark::State& state) {
+  state.SkipWithError("error message");
+}
+BENCHMARK(BM_error_no_running);
+ADD_CASES("BM_error_no_running", {{"", true, "error message"}});
 
 void BM_error_before_running(benchmark::State& state) {
   state.SkipWithError("error message");
@@ -68,13 +74,30 @@ void BM_error_before_running(benchmark::State& state) {
   }
 }
 BENCHMARK(BM_error_before_running);
-ADD_CASES("BM_error_before_running",
-          {{"", true, "error message"}});
+ADD_CASES("BM_error_before_running", {{"", true, "error message"}});
+
+void BM_error_before_running_batch(benchmark::State& state) {
+  state.SkipWithError("error message");
+  while (state.KeepRunningBatch(17)) {
+    assert(false);
+  }
+}
+BENCHMARK(BM_error_before_running_batch);
+ADD_CASES("BM_error_before_running_batch", {{"", true, "error message"}});
+
+void BM_error_before_running_range_for(benchmark::State& state) {
+  state.SkipWithError("error message");
+  for (auto _ : state) {
+    assert(false);
+  }
+}
+BENCHMARK(BM_error_before_running_range_for);
+ADD_CASES("BM_error_before_running_range_for", {{"", true, "error message"}});
 
 void BM_error_during_running(benchmark::State& state) {
   int first_iter = true;
   while (state.KeepRunning()) {
-    if (state.range_x() == 1 && state.thread_index <= (state.threads / 2)) {
+    if (state.range(0) == 1 && state.thread_index <= (state.threads / 2)) {
       assert(first_iter);
       first_iter = false;
       state.SkipWithError("error message");
@@ -85,38 +108,53 @@ void BM_error_during_running(benchmark::State& state) {
   }
 }
 BENCHMARK(BM_error_during_running)->Arg(1)->Arg(2)->ThreadRange(1, 8);
-ADD_CASES(
-    "BM_error_during_running",
-    {{"/1/threads:1", true, "error message"},
-    {"/1/threads:2", true, "error message"},
-    {"/1/threads:4", true, "error message"},
-    {"/1/threads:8", true, "error message"},
-    {"/2/threads:1", false, ""},
-    {"/2/threads:2", false, ""},
-    {"/2/threads:4", false, ""},
-    {"/2/threads:8", false, ""}}
-);
+ADD_CASES("BM_error_during_running", {{"/1/threads:1", true, "error message"},
+                                      {"/1/threads:2", true, "error message"},
+                                      {"/1/threads:4", true, "error message"},
+                                      {"/1/threads:8", true, "error message"},
+                                      {"/2/threads:1", false, ""},
+                                      {"/2/threads:2", false, ""},
+                                      {"/2/threads:4", false, ""},
+                                      {"/2/threads:8", false, ""}});
+
+void BM_error_during_running_ranged_for(benchmark::State& state) {
+  // SkipWithError() in an explicit iterator loop; users should not write one.
+  assert(state.max_iterations > 3 && "test requires at least a few iterations");
+  bool first_iter = true;  // the error branch below must be entered only once
+  for (auto It = state.begin(), End = state.end(); It != End; ++It) {
+    if (state.range(0) == 1) {
+      assert(first_iter);
+      first_iter = false;
+      state.SkipWithError("error message");
+      // Test the unfortunate but documented behavior that the ranged-for loop
+      // doesn't automatically terminate when SkipWithError is set.
+      assert(++It != End);
+      break;  // Required behavior
+    }
+  }
+}
+BENCHMARK(BM_error_during_running_ranged_for)->Arg(1)->Arg(2)->Iterations(5);
+ADD_CASES("BM_error_during_running_ranged_for",
+          {{"/1/iterations:5", true, "error message"},
+           {"/2/iterations:5", false, ""}});
 
 void BM_error_after_running(benchmark::State& state) {
-  while (state.KeepRunning()) {
+  for (auto _ : state) {
     benchmark::DoNotOptimize(state.iterations());
   }
   if (state.thread_index <= (state.threads / 2))
     state.SkipWithError("error message");
 }
 BENCHMARK(BM_error_after_running)->ThreadRange(1, 8);
-ADD_CASES(
-    "BM_error_after_running",
-    {{"/threads:1", true, "error message"},
-    {"/threads:2", true, "error message"},
-    {"/threads:4", true, "error message"},
-    {"/threads:8", true, "error message"}}
-);
+ADD_CASES("BM_error_after_running", {{"/threads:1", true, "error message"},
+                                     {"/threads:2", true, "error message"},
+                                     {"/threads:4", true, "error message"},
+                                     {"/threads:8", true, "error message"}});
 
 void BM_error_while_paused(benchmark::State& state) {
   bool first_iter = true;
   while (state.KeepRunning()) {
-    if (state.range_x() == 1 && state.thread_index <= (state.threads / 2)) {
+    if (state.range(0) == 1 && state.thread_index <= (state.threads / 2)) {
       assert(first_iter);
       first_iter = false;
       state.PauseTiming();
@@ -128,18 +166,14 @@ void BM_error_while_paused(benchmark::State& state) {
   }
 }
 BENCHMARK(BM_error_while_paused)->Arg(1)->Arg(2)->ThreadRange(1, 8);
-ADD_CASES(
-    "BM_error_while_paused",
-    {{"/1/threads:1", true, "error message"},
-    {"/1/threads:2", true, "error message"},
-    {"/1/threads:4", true, "error message"},
-    {"/1/threads:8", true, "error message"},
-    {"/2/threads:1", false, ""},
-    {"/2/threads:2", false, ""},
-    {"/2/threads:4", false, ""},
-    {"/2/threads:8", false, ""}}
-);
-
+ADD_CASES("BM_error_while_paused", {{"/1/threads:1", true, "error message"},
+                                    {"/1/threads:2", true, "error message"},
+                                    {"/1/threads:4", true, "error message"},
+                                    {"/1/threads:8", true, "error message"},
+                                    {"/2/threads:1", false, ""},
+                                    {"/2/threads:2", false, ""},
+                                    {"/2/threads:4", false, ""},
+                                    {"/2/threads:8", false, ""}});
 
 int main(int argc, char* argv[]) {
   benchmark::Initialize(&argc, argv);
diff --git a/test/state_assembly_test.cc b/test/state_assembly_test.cc
new file mode 100644
index 0000000..7ddbb3b
--- /dev/null
+++ b/test/state_assembly_test.cc
@@ -0,0 +1,68 @@
+#include <benchmark/benchmark.h>
+
+#ifdef __clang__
+#pragma clang diagnostic ignored "-Wreturn-type"
+#endif
+
+// clang-format off
+extern "C" {
+  extern int ExternInt;
+  benchmark::State& GetState();
+  void Fn();
+}
+// clang-format on
+
+using benchmark::State;
+
+// CHECK-LABEL: test_for_auto_loop:
+extern "C" int test_for_auto_loop() {
+  State& S = GetState();
+  int x = 42;
+  // CHECK: 	[[CALL:call(q)*]]	_ZN9benchmark5State16StartKeepRunningEv
+  // CHECK-NEXT: testq %rbx, %rbx
+  // CHECK-NEXT: je [[LOOP_END:.*]]
+
+  for (auto _ : S) {
+    // CHECK: .L[[LOOP_HEAD:[a-zA-Z0-9_]+]]:
+    // CHECK-GNU-NEXT: subq $1, %rbx
+    // CHECK-CLANG-NEXT: {{(addq \$1, %rax|incq %rax|addq \$-1, %rbx)}}
+    // CHECK-NEXT: jne .L[[LOOP_HEAD]]
+    benchmark::DoNotOptimize(x);
+  }
+  // CHECK: [[LOOP_END]]:
+  // CHECK: [[CALL]]	_ZN9benchmark5State17FinishKeepRunningEv
+
+  // CHECK: movl $101, %eax
+  // CHECK: ret
+  return 101;
+}
+
+// CHECK-LABEL: test_while_loop:
+extern "C" int test_while_loop() {
+  State& S = GetState();
+  int x = 42;
+
+  // CHECK: j{{(e|mp)}} .L[[LOOP_HEADER:[a-zA-Z0-9_]+]]
+  // CHECK-NEXT: .L[[LOOP_BODY:[a-zA-Z0-9_]+]]:
+  while (S.KeepRunning()) {
+    // CHECK-GNU-NEXT: subq $1, %[[IREG:[a-z]+]]
+    // CHECK-CLANG-NEXT: {{(addq \$-1,|decq)}} %[[IREG:[a-z]+]]
+    // CHECK: movq %[[IREG]], [[DEST:.*]]
+    benchmark::DoNotOptimize(x);
+  }
+  // CHECK-DAG: movq [[DEST]], %[[IREG]]
+  // CHECK-DAG: testq %[[IREG]], %[[IREG]]
+  // CHECK-DAG: jne .L[[LOOP_BODY]]
+  // CHECK-DAG: .L[[LOOP_HEADER]]:
+
+  // CHECK: cmpb $0
+  // CHECK-NEXT: jne .L[[LOOP_END:[a-zA-Z0-9_]+]]
+  // CHECK: [[CALL:call(q)*]] _ZN9benchmark5State16StartKeepRunningEv
+
+  // CHECK: .L[[LOOP_END]]:
+  // CHECK: [[CALL]] _ZN9benchmark5State17FinishKeepRunningEv
+
+  // CHECK: movl $101, %eax
+  // CHECK: ret
+  return 101;
+}
diff --git a/test/statistics_gtest.cc b/test/statistics_gtest.cc
new file mode 100644
index 0000000..3ddc72d
--- /dev/null
+++ b/test/statistics_gtest.cc
@@ -0,0 +1,28 @@
+//===---------------------------------------------------------------------===//
+// statistics_test - Unit tests for src/statistics.cc
+//===---------------------------------------------------------------------===//
+
+#include "../src/statistics.h"
+#include "gtest/gtest.h"
+
+namespace {
+TEST(StatisticsTest, Mean) {
+  EXPECT_DOUBLE_EQ(benchmark::StatisticsMean({42, 42, 42, 42}), 42.0);
+  EXPECT_DOUBLE_EQ(benchmark::StatisticsMean({1, 2, 3, 4}), 2.5);
+  EXPECT_DOUBLE_EQ(benchmark::StatisticsMean({1, 2, 5, 10, 10, 14}), 7.0);
+}
+
+TEST(StatisticsTest, Median) {
+  EXPECT_DOUBLE_EQ(benchmark::StatisticsMedian({42, 42, 42, 42}), 42.0);
+  EXPECT_DOUBLE_EQ(benchmark::StatisticsMedian({1, 2, 3, 4}), 2.5);
+  EXPECT_DOUBLE_EQ(benchmark::StatisticsMedian({1, 2, 5, 10, 10}), 5.0);
+}
+
+TEST(StatisticsTest, StdDev) {
+  EXPECT_DOUBLE_EQ(benchmark::StatisticsStdDev({101, 101, 101, 101}), 0.0);
+  EXPECT_DOUBLE_EQ(benchmark::StatisticsStdDev({1, 2, 3}), 1.0);
+  EXPECT_DOUBLE_EQ(benchmark::StatisticsStdDev({2.5, 2.4, 3.3, 4.2, 5.1}),
+                   1.151086443322134);
+}
+
+}  // end namespace
diff --git a/test/string_util_gtest.cc b/test/string_util_gtest.cc
new file mode 100644
index 0000000..01bf155
--- /dev/null
+++ b/test/string_util_gtest.cc
@@ -0,0 +1,153 @@
+//===---------------------------------------------------------------------===//
+// string_util_test - Unit tests for src/string_util.h
+//===---------------------------------------------------------------------===//
+
+#include "../src/string_util.h"
+#include "../src/internal_macros.h"
+#include "gtest/gtest.h"
+
+namespace {
+TEST(StringUtilTest, stoul) {
+  {
+    size_t pos = 0;
+    EXPECT_EQ(0ul, benchmark::stoul("0", &pos));
+    EXPECT_EQ(1ul, pos);
+  }
+  {
+    size_t pos = 0;
+    EXPECT_EQ(7ul, benchmark::stoul("7", &pos));
+    EXPECT_EQ(1ul, pos);
+  }
+  {
+    size_t pos = 0;
+    EXPECT_EQ(135ul, benchmark::stoul("135", &pos));
+    EXPECT_EQ(3ul, pos);
+  }
+#if ULONG_MAX == 0xFFFFFFFFul
+  {
+    size_t pos = 0;
+    EXPECT_EQ(0xFFFFFFFFul, benchmark::stoul("4294967295", &pos));
+    EXPECT_EQ(10ul, pos);
+  }
+#elif ULONG_MAX == 0xFFFFFFFFFFFFFFFFul
+  {
+    size_t pos = 0;
+    EXPECT_EQ(0xFFFFFFFFFFFFFFFFul, benchmark::stoul("18446744073709551615", &pos));
+    EXPECT_EQ(20ul, pos);
+  }
+#endif
+  {
+    size_t pos = 0;
+    EXPECT_EQ(10ul, benchmark::stoul("1010", &pos, 2));
+    EXPECT_EQ(4ul, pos);
+  }
+  {
+    size_t pos = 0;
+    EXPECT_EQ(520ul, benchmark::stoul("1010", &pos, 8));
+    EXPECT_EQ(4ul, pos);
+  }
+  {
+    size_t pos = 0;
+    EXPECT_EQ(1010ul, benchmark::stoul("1010", &pos, 10));
+    EXPECT_EQ(4ul, pos);
+  }
+  {
+    size_t pos = 0;
+    EXPECT_EQ(4112ul, benchmark::stoul("1010", &pos, 16));
+    EXPECT_EQ(4ul, pos);
+  }
+  {
+    size_t pos = 0;
+    EXPECT_EQ(0xBEEFul, benchmark::stoul("BEEF", &pos, 16));
+    EXPECT_EQ(4ul, pos);
+  }
+#ifndef BENCHMARK_HAS_NO_EXCEPTIONS
+  {
+    ASSERT_THROW(benchmark::stoul("this is a test"), std::invalid_argument);
+  }
+#endif
+}
+
+TEST(StringUtilTest, stoi) {
+  {
+    size_t pos = 0;
+    EXPECT_EQ(0, benchmark::stoi("0", &pos));
+    EXPECT_EQ(1ul, pos);
+  }
+  {
+    size_t pos = 0;
+    EXPECT_EQ(-17, benchmark::stoi("-17", &pos));
+    EXPECT_EQ(3ul, pos);
+  }
+  {
+    size_t pos = 0;
+    EXPECT_EQ(1357, benchmark::stoi("1357", &pos));
+    EXPECT_EQ(4ul, pos);
+  }
+  {
+    size_t pos = 0;
+    EXPECT_EQ(10, benchmark::stoi("1010", &pos, 2));
+    EXPECT_EQ(4ul, pos);
+  }
+  {
+    size_t pos = 0;
+    EXPECT_EQ(520, benchmark::stoi("1010", &pos, 8));
+    EXPECT_EQ(4ul, pos);
+  }
+  {
+    size_t pos = 0;
+    EXPECT_EQ(1010, benchmark::stoi("1010", &pos, 10));
+    EXPECT_EQ(4ul, pos);
+  }
+  {
+    size_t pos = 0;
+    EXPECT_EQ(4112, benchmark::stoi("1010", &pos, 16));
+    EXPECT_EQ(4ul, pos);
+  }
+  {
+    size_t pos = 0;
+    EXPECT_EQ(0xBEEF, benchmark::stoi("BEEF", &pos, 16));
+    EXPECT_EQ(4ul, pos);
+  }
+#ifndef BENCHMARK_HAS_NO_EXCEPTIONS
+  {
+    ASSERT_THROW(benchmark::stoi("this is a test"), std::invalid_argument);
+  }
+#endif
+}
+
+TEST(StringUtilTest, stod) {
+  {
+    size_t pos = 0;
+    EXPECT_EQ(0.0, benchmark::stod("0", &pos));
+    EXPECT_EQ(1ul, pos);
+  }
+  {
+    size_t pos = 0;
+    EXPECT_EQ(-84.0, benchmark::stod("-84", &pos));
+    EXPECT_EQ(3ul, pos);
+  }
+  {
+    size_t pos = 0;
+    EXPECT_EQ(1234.0, benchmark::stod("1234", &pos));
+    EXPECT_EQ(4ul, pos);
+  }
+  {
+    size_t pos = 0;
+    EXPECT_EQ(1.5, benchmark::stod("1.5", &pos));
+    EXPECT_EQ(3ul, pos);
+  }
+  {
+    size_t pos = 0;
+    /* Note: exactly representable as double */
+    EXPECT_EQ(-1.25e+9, benchmark::stod("-1.25e+9", &pos));
+    EXPECT_EQ(8ul, pos);
+  }
+#ifndef BENCHMARK_HAS_NO_EXCEPTIONS
+  {
+    ASSERT_THROW(benchmark::stod("this is a test"), std::invalid_argument);
+  }
+#endif
+}
+
+}  // end namespace
diff --git a/test/templated_fixture_test.cc b/test/templated_fixture_test.cc
new file mode 100644
index 0000000..fe9865c
--- /dev/null
+++ b/test/templated_fixture_test.cc
@@ -0,0 +1,28 @@
+
+#include "benchmark/benchmark.h"
+
+#include <cassert>
+#include <memory>
+
+template <typename T>
+class MyFixture : public ::benchmark::Fixture {
+ public:
+  MyFixture() : data(0) {}
+
+  T data;
+};
+
+BENCHMARK_TEMPLATE_F(MyFixture, Foo, int)(benchmark::State& st) {
+  for (auto _ : st) {
+    data += 1;
+  }
+}
+
+BENCHMARK_TEMPLATE_DEFINE_F(MyFixture, Bar, double)(benchmark::State& st) {
+  for (auto _ : st) {
+    data += 1.0;
+  }
+}
+BENCHMARK_REGISTER_F(MyFixture, Bar);
+
+BENCHMARK_MAIN();
diff --git a/test/user_counters_tabular_test.cc b/test/user_counters_tabular_test.cc
new file mode 100644
index 0000000..18373c0
--- /dev/null
+++ b/test/user_counters_tabular_test.cc
@@ -0,0 +1,285 @@
+
+#undef NDEBUG
+
+#include "benchmark/benchmark.h"
+#include "output_test.h"
+
+// @todo: <jpmag> this checks the full output at once; the rule for
+// CounterSet1 was failing because it was not matching "^[-]+$".
+// @todo: <jpmag> check that the counters are vertically aligned.
+ADD_CASES(
+    TC_ConsoleOut,
+    {
+        // keeping these lines long improves readability, so:
+        // clang-format off
+    {"^[-]+$", MR_Next},
+    {"^Benchmark %s Time %s CPU %s Iterations %s Bar %s Bat %s Baz %s Foo %s Frob %s Lob$", MR_Next},
+    {"^[-]+$", MR_Next},
+    {"^BM_Counters_Tabular/threads:%int %console_report [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat$", MR_Next},
+    {"^BM_Counters_Tabular/threads:%int %console_report [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat$", MR_Next},
+    {"^BM_Counters_Tabular/threads:%int %console_report [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat$", MR_Next},
+    {"^BM_Counters_Tabular/threads:%int %console_report [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat$", MR_Next},
+    {"^BM_Counters_Tabular/threads:%int %console_report [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat$", MR_Next},
+    {"^BM_CounterRates_Tabular/threads:%int %console_report [ ]*%hrfloat/s [ ]*%hrfloat/s [ ]*%hrfloat/s [ ]*%hrfloat/s [ ]*%hrfloat/s [ ]*%hrfloat/s$", MR_Next},
+    {"^BM_CounterRates_Tabular/threads:%int %console_report [ ]*%hrfloat/s [ ]*%hrfloat/s [ ]*%hrfloat/s [ ]*%hrfloat/s [ ]*%hrfloat/s [ ]*%hrfloat/s$", MR_Next},
+    {"^BM_CounterRates_Tabular/threads:%int %console_report [ ]*%hrfloat/s [ ]*%hrfloat/s [ ]*%hrfloat/s [ ]*%hrfloat/s [ ]*%hrfloat/s [ ]*%hrfloat/s$", MR_Next},
+    {"^BM_CounterRates_Tabular/threads:%int %console_report [ ]*%hrfloat/s [ ]*%hrfloat/s [ ]*%hrfloat/s [ ]*%hrfloat/s [ ]*%hrfloat/s [ ]*%hrfloat/s$", MR_Next},
+    {"^BM_CounterRates_Tabular/threads:%int %console_report [ ]*%hrfloat/s [ ]*%hrfloat/s [ ]*%hrfloat/s [ ]*%hrfloat/s [ ]*%hrfloat/s [ ]*%hrfloat/s$", MR_Next},
+    {"^[-]+$", MR_Next},
+    {"^Benchmark %s Time %s CPU %s Iterations %s Bar %s Baz %s Foo$", MR_Next},
+    {"^[-]+$", MR_Next},
+    {"^BM_CounterSet0_Tabular/threads:%int %console_report [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat$", MR_Next},
+    {"^BM_CounterSet0_Tabular/threads:%int %console_report [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat$", MR_Next},
+    {"^BM_CounterSet0_Tabular/threads:%int %console_report [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat$", MR_Next},
+    {"^BM_CounterSet0_Tabular/threads:%int %console_report [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat$", MR_Next},
+    {"^BM_CounterSet0_Tabular/threads:%int %console_report [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat$", MR_Next},
+    {"^BM_CounterSet1_Tabular/threads:%int %console_report [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat$", MR_Next},
+    {"^BM_CounterSet1_Tabular/threads:%int %console_report [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat$", MR_Next},
+    {"^BM_CounterSet1_Tabular/threads:%int %console_report [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat$", MR_Next},
+    {"^BM_CounterSet1_Tabular/threads:%int %console_report [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat$", MR_Next},
+    {"^BM_CounterSet1_Tabular/threads:%int %console_report [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat$", MR_Next},
+    {"^[-]+$", MR_Next},
+    {"^Benchmark %s Time %s CPU %s Iterations %s Bat %s Baz %s Foo$", MR_Next},
+    {"^[-]+$", MR_Next},
+    {"^BM_CounterSet2_Tabular/threads:%int %console_report [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat$", MR_Next},
+    {"^BM_CounterSet2_Tabular/threads:%int %console_report [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat$", MR_Next},
+    {"^BM_CounterSet2_Tabular/threads:%int %console_report [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat$", MR_Next},
+    {"^BM_CounterSet2_Tabular/threads:%int %console_report [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat$", MR_Next},
+    {"^BM_CounterSet2_Tabular/threads:%int %console_report [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat$"},
+        // clang-format on
+    });
+ADD_CASES(TC_CSVOut, {{"%csv_header,"
+                       "\"Bar\",\"Bat\",\"Baz\",\"Foo\",\"Frob\",\"Lob\""}});
+
+// ========================================================================= //
+// ------------------------- Tabular Counters Output ----------------------- //
+// ========================================================================= //
+
+void BM_Counters_Tabular(benchmark::State& state) {
+  for (auto _ : state) {
+  }
+  namespace bm = benchmark;
+  state.counters.insert({
+      {"Foo", {1, bm::Counter::kAvgThreads}},
+      {"Bar", {2, bm::Counter::kAvgThreads}},
+      {"Baz", {4, bm::Counter::kAvgThreads}},
+      {"Bat", {8, bm::Counter::kAvgThreads}},
+      {"Frob", {16, bm::Counter::kAvgThreads}},
+      {"Lob", {32, bm::Counter::kAvgThreads}},
+  });
+}
+BENCHMARK(BM_Counters_Tabular)->ThreadRange(1, 16);
+ADD_CASES(TC_JSONOut,
+          {{"\"name\": \"BM_Counters_Tabular/threads:%int\",$"},
+           {"\"run_name\": \"BM_Counters_Tabular/threads:%int\",$", MR_Next},
+           {"\"run_type\": \"iteration\",$", MR_Next},
+           {"\"repetitions\": 0,$", MR_Next},
+           {"\"repetition_index\": 0,$", MR_Next},
+           {"\"threads\": 1,$", MR_Next},
+           {"\"iterations\": %int,$", MR_Next},
+           {"\"real_time\": %float,$", MR_Next},
+           {"\"cpu_time\": %float,$", MR_Next},
+           {"\"time_unit\": \"ns\",$", MR_Next},
+           {"\"Bar\": %float,$", MR_Next},
+           {"\"Bat\": %float,$", MR_Next},
+           {"\"Baz\": %float,$", MR_Next},
+           {"\"Foo\": %float,$", MR_Next},
+           {"\"Frob\": %float,$", MR_Next},
+           {"\"Lob\": %float$", MR_Next},
+           {"}", MR_Next}});
+ADD_CASES(TC_CSVOut, {{"^\"BM_Counters_Tabular/threads:%int\",%csv_report,"
+                       "%float,%float,%float,%float,%float,%float$"}});
+// VS2013 does not allow this function to be passed as a lambda argument
+// to CHECK_BENCHMARK_RESULTS()
+void CheckTabular(Results const& e) {
+  CHECK_COUNTER_VALUE(e, int, "Foo", EQ, 1);
+  CHECK_COUNTER_VALUE(e, int, "Bar", EQ, 2);
+  CHECK_COUNTER_VALUE(e, int, "Baz", EQ, 4);
+  CHECK_COUNTER_VALUE(e, int, "Bat", EQ, 8);
+  CHECK_COUNTER_VALUE(e, int, "Frob", EQ, 16);
+  CHECK_COUNTER_VALUE(e, int, "Lob", EQ, 32);
+}
+CHECK_BENCHMARK_RESULTS("BM_Counters_Tabular/threads:%int", &CheckTabular);
+
+// ========================================================================= //
+// -------------------- Tabular+Rate Counters Output ----------------------- //
+// ========================================================================= //
+
+void BM_CounterRates_Tabular(benchmark::State& state) {
+  for (auto _ : state) {
+    // This test requires a non-zero CPU time to avoid divide-by-zero
+    benchmark::DoNotOptimize(state.iterations());
+  }
+  namespace bm = benchmark;
+  state.counters.insert({
+      {"Foo", {1, bm::Counter::kAvgThreadsRate}},
+      {"Bar", {2, bm::Counter::kAvgThreadsRate}},
+      {"Baz", {4, bm::Counter::kAvgThreadsRate}},
+      {"Bat", {8, bm::Counter::kAvgThreadsRate}},
+      {"Frob", {16, bm::Counter::kAvgThreadsRate}},
+      {"Lob", {32, bm::Counter::kAvgThreadsRate}},
+  });
+}
+BENCHMARK(BM_CounterRates_Tabular)->ThreadRange(1, 16);
+ADD_CASES(TC_JSONOut,
+          {{"\"name\": \"BM_CounterRates_Tabular/threads:%int\",$"},
+           {"\"run_name\": \"BM_CounterRates_Tabular/threads:%int\",$",
+            MR_Next},
+           {"\"run_type\": \"iteration\",$", MR_Next},
+           {"\"repetitions\": 0,$", MR_Next},
+           {"\"repetition_index\": 0,$", MR_Next},
+           {"\"threads\": 1,$", MR_Next},
+           {"\"iterations\": %int,$", MR_Next},
+           {"\"real_time\": %float,$", MR_Next},
+           {"\"cpu_time\": %float,$", MR_Next},
+           {"\"time_unit\": \"ns\",$", MR_Next},
+           {"\"Bar\": %float,$", MR_Next},
+           {"\"Bat\": %float,$", MR_Next},
+           {"\"Baz\": %float,$", MR_Next},
+           {"\"Foo\": %float,$", MR_Next},
+           {"\"Frob\": %float,$", MR_Next},
+           {"\"Lob\": %float$", MR_Next},
+           {"}", MR_Next}});
+ADD_CASES(TC_CSVOut, {{"^\"BM_CounterRates_Tabular/threads:%int\",%csv_report,"
+                       "%float,%float,%float,%float,%float,%float$"}});
+// VS2013 does not allow this function to be passed as a lambda argument
+// to CHECK_BENCHMARK_RESULTS()
+void CheckTabularRate(Results const& e) {
+  double t = e.DurationCPUTime();
+  CHECK_FLOAT_COUNTER_VALUE(e, "Foo", EQ, 1. / t, 0.001);
+  CHECK_FLOAT_COUNTER_VALUE(e, "Bar", EQ, 2. / t, 0.001);
+  CHECK_FLOAT_COUNTER_VALUE(e, "Baz", EQ, 4. / t, 0.001);
+  CHECK_FLOAT_COUNTER_VALUE(e, "Bat", EQ, 8. / t, 0.001);
+  CHECK_FLOAT_COUNTER_VALUE(e, "Frob", EQ, 16. / t, 0.001);
+  CHECK_FLOAT_COUNTER_VALUE(e, "Lob", EQ, 32. / t, 0.001);
+}
+CHECK_BENCHMARK_RESULTS("BM_CounterRates_Tabular/threads:%int",
+                        &CheckTabularRate);
+
+// ========================================================================= //
+// ------------------------- Tabular Counters Output ----------------------- //
+// ========================================================================= //
+
+// set only some of the counters
+void BM_CounterSet0_Tabular(benchmark::State& state) {
+  for (auto _ : state) {
+  }
+  namespace bm = benchmark;
+  state.counters.insert({
+      {"Foo", {10, bm::Counter::kAvgThreads}},
+      {"Bar", {20, bm::Counter::kAvgThreads}},
+      {"Baz", {40, bm::Counter::kAvgThreads}},
+  });
+}
+BENCHMARK(BM_CounterSet0_Tabular)->ThreadRange(1, 16);
+ADD_CASES(TC_JSONOut,
+          {{"\"name\": \"BM_CounterSet0_Tabular/threads:%int\",$"},
+           {"\"run_name\": \"BM_CounterSet0_Tabular/threads:%int\",$", MR_Next},
+           {"\"run_type\": \"iteration\",$", MR_Next},
+           {"\"repetitions\": 0,$", MR_Next},
+           {"\"repetition_index\": 0,$", MR_Next},
+           {"\"threads\": 1,$", MR_Next},
+           {"\"iterations\": %int,$", MR_Next},
+           {"\"real_time\": %float,$", MR_Next},
+           {"\"cpu_time\": %float,$", MR_Next},
+           {"\"time_unit\": \"ns\",$", MR_Next},
+           {"\"Bar\": %float,$", MR_Next},
+           {"\"Baz\": %float,$", MR_Next},
+           {"\"Foo\": %float$", MR_Next},
+           {"}", MR_Next}});
+ADD_CASES(TC_CSVOut, {{"^\"BM_CounterSet0_Tabular/threads:%int\",%csv_report,"
+                       "%float,,%float,%float,,"}});
+// VS2013 does not allow this function to be passed as a lambda argument
+// to CHECK_BENCHMARK_RESULTS()
+void CheckSet0(Results const& e) {
+  CHECK_COUNTER_VALUE(e, int, "Foo", EQ, 10);
+  CHECK_COUNTER_VALUE(e, int, "Bar", EQ, 20);
+  CHECK_COUNTER_VALUE(e, int, "Baz", EQ, 40);
+}
+CHECK_BENCHMARK_RESULTS("BM_CounterSet0_Tabular", &CheckSet0);
+
+// same counter names as CounterSet0 (Foo, Bar, Baz), with different values.
+void BM_CounterSet1_Tabular(benchmark::State& state) {
+  for (auto _ : state) {
+  }
+  namespace bm = benchmark;
+  state.counters.insert({
+      {"Foo", {15, bm::Counter::kAvgThreads}},
+      {"Bar", {25, bm::Counter::kAvgThreads}},
+      {"Baz", {45, bm::Counter::kAvgThreads}},
+  });
+}
+BENCHMARK(BM_CounterSet1_Tabular)->ThreadRange(1, 16);
+ADD_CASES(TC_JSONOut,
+          {{"\"name\": \"BM_CounterSet1_Tabular/threads:%int\",$"},
+           {"\"run_name\": \"BM_CounterSet1_Tabular/threads:%int\",$", MR_Next},
+           {"\"run_type\": \"iteration\",$", MR_Next},
+           {"\"repetitions\": 0,$", MR_Next},
+           {"\"repetition_index\": 0,$", MR_Next},
+           {"\"threads\": 1,$", MR_Next},
+           {"\"iterations\": %int,$", MR_Next},
+           {"\"real_time\": %float,$", MR_Next},
+           {"\"cpu_time\": %float,$", MR_Next},
+           {"\"time_unit\": \"ns\",$", MR_Next},
+           {"\"Bar\": %float,$", MR_Next},
+           {"\"Baz\": %float,$", MR_Next},
+           {"\"Foo\": %float$", MR_Next},
+           {"}", MR_Next}});
+ADD_CASES(TC_CSVOut, {{"^\"BM_CounterSet1_Tabular/threads:%int\",%csv_report,"
+                       "%float,,%float,%float,,"}});
+// VS2013 does not allow this function to be passed as a lambda argument
+// to CHECK_BENCHMARK_RESULTS()
+void CheckSet1(Results const& e) {
+  CHECK_COUNTER_VALUE(e, int, "Foo", EQ, 15);
+  CHECK_COUNTER_VALUE(e, int, "Bar", EQ, 25);
+  CHECK_COUNTER_VALUE(e, int, "Baz", EQ, 45);
+}
+CHECK_BENCHMARK_RESULTS("BM_CounterSet1_Tabular/threads:%int", &CheckSet1);
+
+// ========================================================================= //
+// ------------------------- Tabular Counters Output ----------------------- //
+// ========================================================================= //
+
+// set only some of the counters, different set now.
+void BM_CounterSet2_Tabular(benchmark::State& state) {
+  for (auto _ : state) {
+  }
+  namespace bm = benchmark;
+  state.counters.insert({
+      {"Foo", {10, bm::Counter::kAvgThreads}},
+      {"Bat", {30, bm::Counter::kAvgThreads}},
+      {"Baz", {40, bm::Counter::kAvgThreads}},
+  });
+}
+BENCHMARK(BM_CounterSet2_Tabular)->ThreadRange(1, 16);
+ADD_CASES(TC_JSONOut,
+          {{"\"name\": \"BM_CounterSet2_Tabular/threads:%int\",$"},
+           {"\"run_name\": \"BM_CounterSet2_Tabular/threads:%int\",$", MR_Next},
+           {"\"run_type\": \"iteration\",$", MR_Next},
+           {"\"repetitions\": 0,$", MR_Next},
+           {"\"repetition_index\": 0,$", MR_Next},
+           {"\"threads\": 1,$", MR_Next},
+           {"\"iterations\": %int,$", MR_Next},
+           {"\"real_time\": %float,$", MR_Next},
+           {"\"cpu_time\": %float,$", MR_Next},
+           {"\"time_unit\": \"ns\",$", MR_Next},
+           {"\"Bat\": %float,$", MR_Next},
+           {"\"Baz\": %float,$", MR_Next},
+           {"\"Foo\": %float$", MR_Next},
+           {"}", MR_Next}});
+ADD_CASES(TC_CSVOut, {{"^\"BM_CounterSet2_Tabular/threads:%int\",%csv_report,"
+                       ",%float,%float,%float,,"}});
+// VS2013 does not allow this function to be passed as a lambda argument
+// to CHECK_BENCHMARK_RESULTS()
+void CheckSet2(Results const& e) {
+  CHECK_COUNTER_VALUE(e, int, "Foo", EQ, 10);
+  CHECK_COUNTER_VALUE(e, int, "Bat", EQ, 30);
+  CHECK_COUNTER_VALUE(e, int, "Baz", EQ, 40);
+}
+CHECK_BENCHMARK_RESULTS("BM_CounterSet2_Tabular", &CheckSet2);
+
+// ========================================================================= //
+// --------------------------- TEST CASES END ------------------------------ //
+// ========================================================================= //
+
+int main(int argc, char* argv[]) { RunOutputTests(argc, argv); }
diff --git a/test/user_counters_test.cc b/test/user_counters_test.cc
new file mode 100644
index 0000000..5699f4f
--- /dev/null
+++ b/test/user_counters_test.cc
@@ -0,0 +1,531 @@
+
+#undef NDEBUG
+
+#include "benchmark/benchmark.h"
+#include "output_test.h"
+
+// ========================================================================= //
+// ---------------------- Testing Prologue Output -------------------------- //
+// ========================================================================= //
+
+// clang-format off
+
+ADD_CASES(TC_ConsoleOut,
+          {{"^[-]+$", MR_Next},
+           {"^Benchmark %s Time %s CPU %s Iterations UserCounters...$", MR_Next},
+           {"^[-]+$", MR_Next}});
+ADD_CASES(TC_CSVOut, {{"%csv_header,\"bar\",\"foo\""}});
+
+// clang-format on
+
+// ========================================================================= //
+// ------------------------- Simple Counters Output ------------------------ //
+// ========================================================================= //
+
+void BM_Counters_Simple(benchmark::State& state) {
+  for (auto _ : state) {
+  }
+  state.counters["foo"] = 1;
+  state.counters["bar"] = 2 * (double)state.iterations();
+}
+BENCHMARK(BM_Counters_Simple);
+ADD_CASES(TC_ConsoleOut,
+          {{"^BM_Counters_Simple %console_report bar=%hrfloat foo=%hrfloat$"}});
+ADD_CASES(TC_JSONOut, {{"\"name\": \"BM_Counters_Simple\",$"},
+                       {"\"run_name\": \"BM_Counters_Simple\",$", MR_Next},
+                       {"\"run_type\": \"iteration\",$", MR_Next},
+                       {"\"repetitions\": 0,$", MR_Next},
+                       {"\"repetition_index\": 0,$", MR_Next},
+                       {"\"threads\": 1,$", MR_Next},
+                       {"\"iterations\": %int,$", MR_Next},
+                       {"\"real_time\": %float,$", MR_Next},
+                       {"\"cpu_time\": %float,$", MR_Next},
+                       {"\"time_unit\": \"ns\",$", MR_Next},
+                       {"\"bar\": %float,$", MR_Next},
+                       {"\"foo\": %float$", MR_Next},
+                       {"}", MR_Next}});
+ADD_CASES(TC_CSVOut, {{"^\"BM_Counters_Simple\",%csv_report,%float,%float$"}});
+// VS2013 does not allow this function to be passed as a lambda argument
+// to CHECK_BENCHMARK_RESULTS()
+void CheckSimple(Results const& e) {
+  double its = e.NumIterations();
+  CHECK_COUNTER_VALUE(e, int, "foo", EQ, 1);
+  // check that the value of bar is within 0.1% of the expected value
+  CHECK_FLOAT_COUNTER_VALUE(e, "bar", EQ, 2. * its, 0.001);
+}
+CHECK_BENCHMARK_RESULTS("BM_Counters_Simple", &CheckSimple);
+
+// ========================================================================= //
+// --------------------- Counters+Items+Bytes/s Output --------------------- //
+// ========================================================================= //
+
+namespace {
+int num_calls1 = 0;
+}
+void BM_Counters_WithBytesAndItemsPSec(benchmark::State& state) {
+  for (auto _ : state) {
+    // This test requires a non-zero CPU time to avoid divide-by-zero
+    benchmark::DoNotOptimize(state.iterations());
+  }
+  state.counters["foo"] = 1;
+  state.counters["bar"] = ++num_calls1;
+  state.SetBytesProcessed(364);
+  state.SetItemsProcessed(150);
+}
+BENCHMARK(BM_Counters_WithBytesAndItemsPSec);
+ADD_CASES(TC_ConsoleOut, {{"^BM_Counters_WithBytesAndItemsPSec %console_report "
+                           "bar=%hrfloat bytes_per_second=%hrfloat/s "
+                           "foo=%hrfloat items_per_second=%hrfloat/s$"}});
+ADD_CASES(TC_JSONOut,
+          {{"\"name\": \"BM_Counters_WithBytesAndItemsPSec\",$"},
+           {"\"run_name\": \"BM_Counters_WithBytesAndItemsPSec\",$", MR_Next},
+           {"\"run_type\": \"iteration\",$", MR_Next},
+           {"\"repetitions\": 0,$", MR_Next},
+           {"\"repetition_index\": 0,$", MR_Next},
+           {"\"threads\": 1,$", MR_Next},
+           {"\"iterations\": %int,$", MR_Next},
+           {"\"real_time\": %float,$", MR_Next},
+           {"\"cpu_time\": %float,$", MR_Next},
+           {"\"time_unit\": \"ns\",$", MR_Next},
+           {"\"bar\": %float,$", MR_Next},
+           {"\"bytes_per_second\": %float,$", MR_Next},
+           {"\"foo\": %float,$", MR_Next},
+           {"\"items_per_second\": %float$", MR_Next},
+           {"}", MR_Next}});
+ADD_CASES(TC_CSVOut, {{"^\"BM_Counters_WithBytesAndItemsPSec\","
+                       "%csv_bytes_items_report,%float,%float$"}});
+// VS2013 does not allow this function to be passed as a lambda argument
+// to CHECK_BENCHMARK_RESULTS()
+void CheckBytesAndItemsPSec(Results const& e) {
+  double t = e.DurationCPUTime();  // this (and not real time) is the time used
+  CHECK_COUNTER_VALUE(e, int, "foo", EQ, 1);
+  CHECK_COUNTER_VALUE(e, int, "bar", EQ, num_calls1);
+  // check that the values are within 0.1% of the expected values
+  CHECK_FLOAT_RESULT_VALUE(e, "bytes_per_second", EQ, 364. / t, 0.001);
+  CHECK_FLOAT_RESULT_VALUE(e, "items_per_second", EQ, 150. / t, 0.001);
+}
+CHECK_BENCHMARK_RESULTS("BM_Counters_WithBytesAndItemsPSec",
+                        &CheckBytesAndItemsPSec);
+
+// ========================================================================= //
+// ------------------------- Rate Counters Output -------------------------- //
+// ========================================================================= //
+
+void BM_Counters_Rate(benchmark::State& state) {
+  for (auto _ : state) {
+    // This test requires a non-zero CPU time to avoid divide-by-zero
+    benchmark::DoNotOptimize(state.iterations());
+  }
+  namespace bm = benchmark;
+  state.counters["foo"] = bm::Counter{1, bm::Counter::kIsRate};
+  state.counters["bar"] = bm::Counter{2, bm::Counter::kIsRate};
+}
+BENCHMARK(BM_Counters_Rate);
+ADD_CASES(
+    TC_ConsoleOut,
+    {{"^BM_Counters_Rate %console_report bar=%hrfloat/s foo=%hrfloat/s$"}});
+ADD_CASES(TC_JSONOut, {{"\"name\": \"BM_Counters_Rate\",$"},
+                       {"\"run_name\": \"BM_Counters_Rate\",$", MR_Next},
+                       {"\"run_type\": \"iteration\",$", MR_Next},
+                       {"\"repetitions\": 0,$", MR_Next},
+                       {"\"repetition_index\": 0,$", MR_Next},
+                       {"\"threads\": 1,$", MR_Next},
+                       {"\"iterations\": %int,$", MR_Next},
+                       {"\"real_time\": %float,$", MR_Next},
+                       {"\"cpu_time\": %float,$", MR_Next},
+                       {"\"time_unit\": \"ns\",$", MR_Next},
+                       {"\"bar\": %float,$", MR_Next},
+                       {"\"foo\": %float$", MR_Next},
+                       {"}", MR_Next}});
+ADD_CASES(TC_CSVOut, {{"^\"BM_Counters_Rate\",%csv_report,%float,%float$"}});
+// VS2013 does not allow this function to be passed as a lambda argument
+// to CHECK_BENCHMARK_RESULTS()
+void CheckRate(Results const& e) {
+  double t = e.DurationCPUTime();  // this (and not real time) is the time used
+  // check that the values are within 0.1% of the expected values
+  CHECK_FLOAT_COUNTER_VALUE(e, "foo", EQ, 1. / t, 0.001);
+  CHECK_FLOAT_COUNTER_VALUE(e, "bar", EQ, 2. / t, 0.001);
+}
+CHECK_BENCHMARK_RESULTS("BM_Counters_Rate", &CheckRate);
+
+// ========================================================================= //
+// ----------------------- Inverted Counters Output ------------------------ //
+// ========================================================================= //
+
+void BM_Invert(benchmark::State& state) {
+  for (auto _ : state) {
+    // This test requires a non-zero CPU time to avoid divide-by-zero
+    benchmark::DoNotOptimize(state.iterations());
+  }
+  namespace bm = benchmark;
+  state.counters["foo"] = bm::Counter{0.0001, bm::Counter::kInvert};
+  state.counters["bar"] = bm::Counter{10000, bm::Counter::kInvert};
+}
+BENCHMARK(BM_Invert);
+ADD_CASES(TC_ConsoleOut,
+          {{"^BM_Invert %console_report bar=%hrfloatu foo=%hrfloatk$"}});
+ADD_CASES(TC_JSONOut, {{"\"name\": \"BM_Invert\",$"},
+                       {"\"run_name\": \"BM_Invert\",$", MR_Next},
+                       {"\"run_type\": \"iteration\",$", MR_Next},
+                       {"\"repetitions\": 0,$", MR_Next},
+                       {"\"repetition_index\": 0,$", MR_Next},
+                       {"\"threads\": 1,$", MR_Next},
+                       {"\"iterations\": %int,$", MR_Next},
+                       {"\"real_time\": %float,$", MR_Next},
+                       {"\"cpu_time\": %float,$", MR_Next},
+                       {"\"time_unit\": \"ns\",$", MR_Next},
+                       {"\"bar\": %float,$", MR_Next},
+                       {"\"foo\": %float$", MR_Next},
+                       {"}", MR_Next}});
+ADD_CASES(TC_CSVOut, {{"^\"BM_Invert\",%csv_report,%float,%float$"}});
+// VS2013 does not allow this function to be passed as a lambda argument
+// to CHECK_BENCHMARK_RESULTS()
+void CheckInvert(Results const& e) {
+  CHECK_FLOAT_COUNTER_VALUE(e, "foo", EQ, 10000, 0.0001);
+  CHECK_FLOAT_COUNTER_VALUE(e, "bar", EQ, 0.0001, 0.0001);
+}
+CHECK_BENCHMARK_RESULTS("BM_Invert", &CheckInvert);
+
+// ========================================================================= //
+// ------------------------- InvertedRate Counters Output
+// -------------------------- //
+// ========================================================================= //
+
+void BM_Counters_InvertedRate(benchmark::State& state) {
+  for (auto _ : state) {
+    // This test requires a non-zero CPU time to avoid divide-by-zero
+    benchmark::DoNotOptimize(state.iterations());
+  }
+  namespace bm = benchmark;
+  state.counters["foo"] =
+      bm::Counter{1, bm::Counter::kIsRate | bm::Counter::kInvert};
+  state.counters["bar"] =
+      bm::Counter{8192, bm::Counter::kIsRate | bm::Counter::kInvert};
+}
+BENCHMARK(BM_Counters_InvertedRate);
+ADD_CASES(TC_ConsoleOut, {{"^BM_Counters_InvertedRate %console_report "
+                           "bar=%hrfloats foo=%hrfloats$"}});
+ADD_CASES(TC_JSONOut,
+          {{"\"name\": \"BM_Counters_InvertedRate\",$"},
+           {"\"run_name\": \"BM_Counters_InvertedRate\",$", MR_Next},
+           {"\"run_type\": \"iteration\",$", MR_Next},
+           {"\"repetitions\": 0,$", MR_Next},
+           {"\"repetition_index\": 0,$", MR_Next},
+           {"\"threads\": 1,$", MR_Next},
+           {"\"iterations\": %int,$", MR_Next},
+           {"\"real_time\": %float,$", MR_Next},
+           {"\"cpu_time\": %float,$", MR_Next},
+           {"\"time_unit\": \"ns\",$", MR_Next},
+           {"\"bar\": %float,$", MR_Next},
+           {"\"foo\": %float$", MR_Next},
+           {"}", MR_Next}});
+ADD_CASES(TC_CSVOut,
+          {{"^\"BM_Counters_InvertedRate\",%csv_report,%float,%float$"}});
+// VS2013 does not allow this function to be passed as a lambda argument
+// to CHECK_BENCHMARK_RESULTS()
+void CheckInvertedRate(Results const& e) {
+  double t = e.DurationCPUTime();  // this (and not real time) is the time used
+  // check that the values are within 0.1% of the expected values
+  CHECK_FLOAT_COUNTER_VALUE(e, "foo", EQ, t, 0.001);
+  CHECK_FLOAT_COUNTER_VALUE(e, "bar", EQ, t / 8192.0, 0.001);
+}
+CHECK_BENCHMARK_RESULTS("BM_Counters_InvertedRate", &CheckInvertedRate);
+
+// ========================================================================= //
+// ------------------------- Thread Counters Output ------------------------ //
+// ========================================================================= //
+
+void BM_Counters_Threads(benchmark::State& state) {
+  for (auto _ : state) {
+  }
+  state.counters["foo"] = 1;
+  state.counters["bar"] = 2;
+}
+BENCHMARK(BM_Counters_Threads)->ThreadRange(1, 8);
+ADD_CASES(TC_ConsoleOut, {{"^BM_Counters_Threads/threads:%int %console_report "
+                           "bar=%hrfloat foo=%hrfloat$"}});
+ADD_CASES(TC_JSONOut,
+          {{"\"name\": \"BM_Counters_Threads/threads:%int\",$"},
+           {"\"run_name\": \"BM_Counters_Threads/threads:%int\",$", MR_Next},
+           {"\"run_type\": \"iteration\",$", MR_Next},
+           {"\"repetitions\": 0,$", MR_Next},
+           {"\"repetition_index\": 0,$", MR_Next},
+           {"\"threads\": 1,$", MR_Next},
+           {"\"iterations\": %int,$", MR_Next},
+           {"\"real_time\": %float,$", MR_Next},
+           {"\"cpu_time\": %float,$", MR_Next},
+           {"\"time_unit\": \"ns\",$", MR_Next},
+           {"\"bar\": %float,$", MR_Next},
+           {"\"foo\": %float$", MR_Next},
+           {"}", MR_Next}});
+ADD_CASES(
+    TC_CSVOut,
+    {{"^\"BM_Counters_Threads/threads:%int\",%csv_report,%float,%float$"}});
+// VS2013 does not allow this function to be passed as a lambda argument
+// to CHECK_BENCHMARK_RESULTS()
+// Expects "foo" to equal the number of threads of the run and "bar" to equal
+// twice that number.
+void CheckThreads(Results const& e) {
+  const auto threads = e.NumThreads();
+  CHECK_COUNTER_VALUE(e, int, "foo", EQ, threads);
+  CHECK_COUNTER_VALUE(e, int, "bar", EQ, 2 * threads);
+}
+CHECK_BENCHMARK_RESULTS("BM_Counters_Threads/threads:%int", &CheckThreads);
+
+// ========================================================================= //
+// ---------------------- ThreadAvg Counters Output ------------------------ //
+// ========================================================================= //
+
+// Benchmark with an empty timed loop that publishes "foo" = 1 and "bar" = 2,
+// both flagged with Counter::kAvgThreads.
+void BM_Counters_AvgThreads(benchmark::State& state) {
+  for (auto _ : state) {
+    // Nothing to measure; only the counters matter for this test.
+  }
+  using benchmark::Counter;
+  state.counters["foo"] = Counter{1, Counter::kAvgThreads};
+  state.counters["bar"] = Counter{2, Counter::kAvgThreads};
+}
+BENCHMARK(BM_Counters_AvgThreads)->ThreadRange(1, 8);
+ADD_CASES(TC_ConsoleOut, {{"^BM_Counters_AvgThreads/threads:%int "
+                           "%console_report bar=%hrfloat foo=%hrfloat$"}});
+ADD_CASES(TC_JSONOut,
+          {{"\"name\": \"BM_Counters_AvgThreads/threads:%int\",$"},
+           {"\"run_name\": \"BM_Counters_AvgThreads/threads:%int\",$", MR_Next},
+           {"\"run_type\": \"iteration\",$", MR_Next},
+           {"\"repetitions\": 0,$", MR_Next},
+           {"\"repetition_index\": 0,$", MR_Next},
+           {"\"threads\": 1,$", MR_Next},
+           {"\"iterations\": %int,$", MR_Next},
+           {"\"real_time\": %float,$", MR_Next},
+           {"\"cpu_time\": %float,$", MR_Next},
+           {"\"time_unit\": \"ns\",$", MR_Next},
+           {"\"bar\": %float,$", MR_Next},
+           {"\"foo\": %float$", MR_Next},
+           {"}", MR_Next}});
+ADD_CASES(
+    TC_CSVOut,
+    {{"^\"BM_Counters_AvgThreads/threads:%int\",%csv_report,%float,%float$"}});
+// VS2013 does not allow this function to be passed as a lambda argument
+// to CHECK_BENCHMARK_RESULTS()
+// Expects the reported "foo" and "bar" counters to be exactly 1 and 2 for
+// every matched BM_Counters_AvgThreads run, whatever its thread count.
+void CheckAvgThreads(Results const& e) {
+  CHECK_COUNTER_VALUE(e, int, "foo", EQ, 1);
+  CHECK_COUNTER_VALUE(e, int, "bar", EQ, 2);
+}
+CHECK_BENCHMARK_RESULTS("BM_Counters_AvgThreads/threads:%int",
+                        &CheckAvgThreads);
+
+// ========================================================================= //
+// -------------------- ThreadAvgRate Counters Output ---------------------- //
+// ========================================================================= //
+
+// Benchmark that publishes "foo" = 1 and "bar" = 2, both flagged with
+// Counter::kAvgThreadsRate.
+void BM_Counters_AvgThreadsRate(benchmark::State& state) {
+  for (auto _ : state) {
+    // Keep some work in the loop: the test needs a non-zero CPU time to
+    // avoid a divide-by-zero.
+    benchmark::DoNotOptimize(state.iterations());
+  }
+  using benchmark::Counter;
+  state.counters["foo"] = Counter{1, Counter::kAvgThreadsRate};
+  state.counters["bar"] = Counter{2, Counter::kAvgThreadsRate};
+}
+BENCHMARK(BM_Counters_AvgThreadsRate)->ThreadRange(1, 8);
+ADD_CASES(TC_ConsoleOut, {{"^BM_Counters_AvgThreadsRate/threads:%int "
+                           "%console_report bar=%hrfloat/s foo=%hrfloat/s$"}});
+ADD_CASES(TC_JSONOut,
+          {{"\"name\": \"BM_Counters_AvgThreadsRate/threads:%int\",$"},
+           {"\"run_name\": \"BM_Counters_AvgThreadsRate/threads:%int\",$",
+            MR_Next},
+           {"\"run_type\": \"iteration\",$", MR_Next},
+           {"\"repetitions\": 0,$", MR_Next},
+           {"\"repetition_index\": 0,$", MR_Next},
+           {"\"threads\": 1,$", MR_Next},
+           {"\"iterations\": %int,$", MR_Next},
+           {"\"real_time\": %float,$", MR_Next},
+           {"\"cpu_time\": %float,$", MR_Next},
+           {"\"time_unit\": \"ns\",$", MR_Next},
+           {"\"bar\": %float,$", MR_Next},
+           {"\"foo\": %float$", MR_Next},
+           {"}", MR_Next}});
+ADD_CASES(TC_CSVOut, {{"^\"BM_Counters_AvgThreadsRate/"
+                       "threads:%int\",%csv_report,%float,%float$"}});
+// VS2013 does not allow this function to be passed as a lambda argument
+// to CHECK_BENCHMARK_RESULTS()
+// Expects "foo" and "bar" to be 1 and 2 divided by the measured CPU time,
+// each within 0.1%.
+void CheckAvgThreadsRate(Results const& e) {
+  const double cpu_time = e.DurationCPUTime();
+  CHECK_FLOAT_COUNTER_VALUE(e, "foo", EQ, 1. / cpu_time, 0.001);
+  CHECK_FLOAT_COUNTER_VALUE(e, "bar", EQ, 2. / cpu_time, 0.001);
+}
+CHECK_BENCHMARK_RESULTS("BM_Counters_AvgThreadsRate/threads:%int",
+                        &CheckAvgThreadsRate);
+
+// ========================================================================= //
+// ------------------- IterationInvariant Counters Output ------------------ //
+// ========================================================================= //
+
+// Benchmark with an empty timed loop that publishes "foo" = 1 and "bar" = 2,
+// both flagged with Counter::kIsIterationInvariant.
+void BM_Counters_IterationInvariant(benchmark::State& state) {
+  for (auto _ : state) {
+    // Nothing to measure; only the counters matter for this test.
+  }
+  using benchmark::Counter;
+  state.counters["foo"] = Counter{1, Counter::kIsIterationInvariant};
+  state.counters["bar"] = Counter{2, Counter::kIsIterationInvariant};
+}
+BENCHMARK(BM_Counters_IterationInvariant);
+ADD_CASES(TC_ConsoleOut, {{"^BM_Counters_IterationInvariant %console_report "
+                           "bar=%hrfloat foo=%hrfloat$"}});
+ADD_CASES(TC_JSONOut,
+          {{"\"name\": \"BM_Counters_IterationInvariant\",$"},
+           {"\"run_name\": \"BM_Counters_IterationInvariant\",$", MR_Next},
+           {"\"run_type\": \"iteration\",$", MR_Next},
+           {"\"repetitions\": 0,$", MR_Next},
+           {"\"repetition_index\": 0,$", MR_Next},
+           {"\"threads\": 1,$", MR_Next},
+           {"\"iterations\": %int,$", MR_Next},
+           {"\"real_time\": %float,$", MR_Next},
+           {"\"cpu_time\": %float,$", MR_Next},
+           {"\"time_unit\": \"ns\",$", MR_Next},
+           {"\"bar\": %float,$", MR_Next},
+           {"\"foo\": %float$", MR_Next},
+           {"}", MR_Next}});
+ADD_CASES(TC_CSVOut,
+          {{"^\"BM_Counters_IterationInvariant\",%csv_report,%float,%float$"}});
+// VS2013 does not allow this function to be passed as a lambda argument
+// to CHECK_BENCHMARK_RESULTS()
+// Expects "foo" to equal the iteration count and "bar" twice the iteration
+// count.
+void CheckIterationInvariant(Results const& e) {
+  const double iterations = e.NumIterations();
+  // Each counter has to be within 0.1% of its expected value.
+  CHECK_FLOAT_COUNTER_VALUE(e, "foo", EQ, iterations, 0.001);
+  CHECK_FLOAT_COUNTER_VALUE(e, "bar", EQ, 2. * iterations, 0.001);
+}
+CHECK_BENCHMARK_RESULTS("BM_Counters_IterationInvariant",
+                        &CheckIterationInvariant);
+
+// ========================================================================= //
+// ----------------- IterationInvariantRate Counters Output ---------------- //
+// ========================================================================= //
+
+// Benchmark that publishes two counters with the same combined behaviour
+// spelled two ways: "foo" uses the kIsIterationInvariantRate flag, "bar" ORs
+// kIsRate with kIsIterationInvariant.
+void BM_Counters_kIsIterationInvariantRate(benchmark::State& state) {
+  for (auto _ : state) {
+    // Keep some work in the loop: the test needs a non-zero CPU time to
+    // avoid a divide-by-zero.
+    benchmark::DoNotOptimize(state.iterations());
+  }
+  using benchmark::Counter;
+  state.counters["foo"] = Counter{1, Counter::kIsIterationInvariantRate};
+  state.counters["bar"] =
+      Counter{2, Counter::kIsRate | Counter::kIsIterationInvariant};
+}
+BENCHMARK(BM_Counters_kIsIterationInvariantRate);
+ADD_CASES(TC_ConsoleOut, {{"^BM_Counters_kIsIterationInvariantRate "
+                           "%console_report bar=%hrfloat/s foo=%hrfloat/s$"}});
+ADD_CASES(TC_JSONOut,
+          {{"\"name\": \"BM_Counters_kIsIterationInvariantRate\",$"},
+           {"\"run_name\": \"BM_Counters_kIsIterationInvariantRate\",$",
+            MR_Next},
+           {"\"run_type\": \"iteration\",$", MR_Next},
+           {"\"repetitions\": 0,$", MR_Next},
+           {"\"repetition_index\": 0,$", MR_Next},
+           {"\"threads\": 1,$", MR_Next},
+           {"\"iterations\": %int,$", MR_Next},
+           {"\"real_time\": %float,$", MR_Next},
+           {"\"cpu_time\": %float,$", MR_Next},
+           {"\"time_unit\": \"ns\",$", MR_Next},
+           {"\"bar\": %float,$", MR_Next},
+           {"\"foo\": %float$", MR_Next},
+           {"}", MR_Next}});
+ADD_CASES(TC_CSVOut, {{"^\"BM_Counters_kIsIterationInvariantRate\",%csv_report,"
+                       "%float,%float$"}});
+// VS2013 does not allow this function to be passed as a lambda argument
+// to CHECK_BENCHMARK_RESULTS()
+// Expects "foo" and "bar" to be 1 and 2 multiplied by the iteration count and
+// divided by the measured CPU time.
+void CheckIsIterationInvariantRate(Results const& e) {
+  const double iterations = e.NumIterations();
+  // CPU time, not real time, is the reference for these counters.
+  const double cpu_time = e.DurationCPUTime();
+  // Each counter has to be within 0.1% of its expected value.
+  CHECK_FLOAT_COUNTER_VALUE(e, "foo", EQ, iterations * 1. / cpu_time, 0.001);
+  CHECK_FLOAT_COUNTER_VALUE(e, "bar", EQ, iterations * 2. / cpu_time, 0.001);
+}
+CHECK_BENCHMARK_RESULTS("BM_Counters_kIsIterationInvariantRate",
+                        &CheckIsIterationInvariantRate);
+
+// ========================================================================= //
+// ------------------- AvgIterations Counters Output ------------------ //
+// ========================================================================= //
+
+// Benchmark with an empty timed loop that publishes "foo" = 1 and "bar" = 2,
+// both flagged with Counter::kAvgIterations.
+void BM_Counters_AvgIterations(benchmark::State& state) {
+  for (auto _ : state) {
+    // Nothing to measure; only the counters matter for this test.
+  }
+  using benchmark::Counter;
+  state.counters["foo"] = Counter{1, Counter::kAvgIterations};
+  state.counters["bar"] = Counter{2, Counter::kAvgIterations};
+}
+BENCHMARK(BM_Counters_AvgIterations);
+ADD_CASES(TC_ConsoleOut, {{"^BM_Counters_AvgIterations %console_report "
+                           "bar=%hrfloat foo=%hrfloat$"}});
+ADD_CASES(TC_JSONOut,
+          {{"\"name\": \"BM_Counters_AvgIterations\",$"},
+           {"\"run_name\": \"BM_Counters_AvgIterations\",$", MR_Next},
+           {"\"run_type\": \"iteration\",$", MR_Next},
+           {"\"repetitions\": 0,$", MR_Next},
+           {"\"repetition_index\": 0,$", MR_Next},
+           {"\"threads\": 1,$", MR_Next},
+           {"\"iterations\": %int,$", MR_Next},
+           {"\"real_time\": %float,$", MR_Next},
+           {"\"cpu_time\": %float,$", MR_Next},
+           {"\"time_unit\": \"ns\",$", MR_Next},
+           {"\"bar\": %float,$", MR_Next},
+           {"\"foo\": %float$", MR_Next},
+           {"}", MR_Next}});
+ADD_CASES(TC_CSVOut,
+          {{"^\"BM_Counters_AvgIterations\",%csv_report,%float,%float$"}});
+// VS2013 does not allow this function to be passed as a lambda argument
+// to CHECK_BENCHMARK_RESULTS()
+// Expects "foo" and "bar" to be 1 and 2 divided by the iteration count.
+void CheckAvgIterations(Results const& e) {
+  const double iterations = e.NumIterations();
+  // Each counter has to be within 0.1% of its expected value.
+  CHECK_FLOAT_COUNTER_VALUE(e, "foo", EQ, 1. / iterations, 0.001);
+  CHECK_FLOAT_COUNTER_VALUE(e, "bar", EQ, 2. / iterations, 0.001);
+}
+CHECK_BENCHMARK_RESULTS("BM_Counters_AvgIterations", &CheckAvgIterations);
+
+// ========================================================================= //
+// ----------------- AvgIterationsRate Counters Output ---------------- //
+// ========================================================================= //
+
+// Benchmark that publishes two counters with the same combined behaviour
+// spelled two ways: "foo" uses the kAvgIterationsRate flag, "bar" ORs kIsRate
+// with kAvgIterations.
+void BM_Counters_kAvgIterationsRate(benchmark::State& state) {
+  for (auto _ : state) {
+    // Keep some work in the loop: the test needs a non-zero CPU time to
+    // avoid a divide-by-zero.
+    benchmark::DoNotOptimize(state.iterations());
+  }
+  using benchmark::Counter;
+  state.counters["foo"] = Counter{1, Counter::kAvgIterationsRate};
+  state.counters["bar"] = Counter{2, Counter::kIsRate | Counter::kAvgIterations};
+}
+BENCHMARK(BM_Counters_kAvgIterationsRate);
+ADD_CASES(TC_ConsoleOut, {{"^BM_Counters_kAvgIterationsRate "
+                           "%console_report bar=%hrfloat/s foo=%hrfloat/s$"}});
+ADD_CASES(TC_JSONOut,
+          {{"\"name\": \"BM_Counters_kAvgIterationsRate\",$"},
+           {"\"run_name\": \"BM_Counters_kAvgIterationsRate\",$", MR_Next},
+           {"\"run_type\": \"iteration\",$", MR_Next},
+           {"\"repetitions\": 0,$", MR_Next},
+           {"\"repetition_index\": 0,$", MR_Next},
+           {"\"threads\": 1,$", MR_Next},
+           {"\"iterations\": %int,$", MR_Next},
+           {"\"real_time\": %float,$", MR_Next},
+           {"\"cpu_time\": %float,$", MR_Next},
+           {"\"time_unit\": \"ns\",$", MR_Next},
+           {"\"bar\": %float,$", MR_Next},
+           {"\"foo\": %float$", MR_Next},
+           {"}", MR_Next}});
+ADD_CASES(TC_CSVOut, {{"^\"BM_Counters_kAvgIterationsRate\",%csv_report,"
+                       "%float,%float$"}});
+// VS2013 does not allow this function to be passed as a lambda argument
+// to CHECK_BENCHMARK_RESULTS()
+// Expects "foo" and "bar" to be 1 and 2 divided first by the iteration count
+// and then by the measured CPU time.
+void CheckAvgIterationsRate(Results const& e) {
+  const double iterations = e.NumIterations();
+  // CPU time, not real time, is the reference for these counters.
+  const double cpu_time = e.DurationCPUTime();
+  // Each counter has to be within 0.1% of its expected value.
+  CHECK_FLOAT_COUNTER_VALUE(e, "foo", EQ, 1. / iterations / cpu_time, 0.001);
+  CHECK_FLOAT_COUNTER_VALUE(e, "bar", EQ, 2. / iterations / cpu_time, 0.001);
+}
+CHECK_BENCHMARK_RESULTS("BM_Counters_kAvgIterationsRate",
+                        &CheckAvgIterationsRate);
+
+// ========================================================================= //
+// --------------------------- TEST CASES END ------------------------------ //
+// ========================================================================= //
+
+// Test entry point: forwards the command line to RunOutputTests().
+int main(int argc, char* argv[]) {
+  RunOutputTests(argc, argv);
+  return 0;
+}
diff --git a/test/user_counters_thousands_test.cc b/test/user_counters_thousands_test.cc
new file mode 100644
index 0000000..21d8285
--- /dev/null
+++ b/test/user_counters_thousands_test.cc
@@ -0,0 +1,173 @@
+
+#undef NDEBUG
+
+#include "benchmark/benchmark.h"
+#include "output_test.h"
+
+// ========================================================================= //
+// ------------------------ Thousands Customisation ------------------------ //
+// ========================================================================= //
+
+// Benchmark with an empty timed loop that publishes five counters holding
+// either 1000*1000 or 1024*1024, with the default, the kIs1000 or the kIs1024
+// OneK setting, as encoded in each counter's name.
+void BM_Counters_Thousands(benchmark::State& state) {
+  for (auto _ : state) {
+    // Nothing to measure; only the counters matter for this test.
+  }
+  using benchmark::Counter;
+  state.counters.insert({
+      {"t0_1000000DefaultBase", Counter(1000 * 1000, Counter::kDefaults)},
+      {"t1_1000000Base1000",
+       Counter(1000 * 1000, Counter::kDefaults, Counter::OneK::kIs1000)},
+      {"t2_1000000Base1024",
+       Counter(1000 * 1000, Counter::kDefaults, Counter::OneK::kIs1024)},
+      {"t3_1048576Base1000",
+       Counter(1024 * 1024, Counter::kDefaults, Counter::OneK::kIs1000)},
+      {"t4_1048576Base1024",
+       Counter(1024 * 1024, Counter::kDefaults, Counter::OneK::kIs1024)},
+  });
+}
+BENCHMARK(BM_Counters_Thousands)->Repetitions(2);
+// Console expectations for BM_Counters_Thousands: two patterns for the plain
+// repetition rows, then one each for the _mean, _median and _stddev rows.
+// "976.56[23]k" accepts a final digit of 2 or 3 -- presumably because
+// 1000000 / 1024 = 976.5625 may be rounded either way when printed.
+ADD_CASES(
+    TC_ConsoleOut,
+    {
+        {"^BM_Counters_Thousands/repeats:2 %console_report "
+         "t0_1000000DefaultBase=1000k "
+         "t1_1000000Base1000=1000k t2_1000000Base1024=976.56[23]k "
+         "t3_1048576Base1000=1048.58k t4_1048576Base1024=1024k$"},
+        {"^BM_Counters_Thousands/repeats:2 %console_report "
+         "t0_1000000DefaultBase=1000k "
+         "t1_1000000Base1000=1000k t2_1000000Base1024=976.56[23]k "
+         "t3_1048576Base1000=1048.58k t4_1048576Base1024=1024k$"},
+        {"^BM_Counters_Thousands/repeats:2_mean %console_report "
+         "t0_1000000DefaultBase=1000k t1_1000000Base1000=1000k "
+         "t2_1000000Base1024=976.56[23]k t3_1048576Base1000=1048.58k "
+         "t4_1048576Base1024=1024k$"},
+        {"^BM_Counters_Thousands/repeats:2_median %console_report "
+         "t0_1000000DefaultBase=1000k t1_1000000Base1000=1000k "
+         "t2_1000000Base1024=976.56[23]k t3_1048576Base1000=1048.58k "
+         "t4_1048576Base1024=1024k$"},
+        {"^BM_Counters_Thousands/repeats:2_stddev %console_time_only_report [ "
+         "]*2 t0_1000000DefaultBase=0 t1_1000000Base1000=0 "
+         "t2_1000000Base1024=0 t3_1048576Base1000=0 t4_1048576Base1024=0$"},
+    });
+ADD_CASES(TC_JSONOut,
+          {{"\"name\": \"BM_Counters_Thousands/repeats:2\",$"},
+           {"\"run_name\": \"BM_Counters_Thousands/repeats:2\",$", MR_Next},
+           {"\"run_type\": \"iteration\",$", MR_Next},
+           {"\"repetitions\": 2,$", MR_Next},
+           {"\"repetition_index\": 0,$", MR_Next},
+           {"\"threads\": 1,$", MR_Next},
+           {"\"iterations\": %int,$", MR_Next},
+           {"\"real_time\": %float,$", MR_Next},
+           {"\"cpu_time\": %float,$", MR_Next},
+           {"\"time_unit\": \"ns\",$", MR_Next},
+           {"\"t0_1000000DefaultBase\": 1\\.(0)*e\\+(0)*6,$", MR_Next},
+           {"\"t1_1000000Base1000\": 1\\.(0)*e\\+(0)*6,$", MR_Next},
+           {"\"t2_1000000Base1024\": 1\\.(0)*e\\+(0)*6,$", MR_Next},
+           {"\"t3_1048576Base1000\": 1\\.048576(0)*e\\+(0)*6,$", MR_Next},
+           {"\"t4_1048576Base1024\": 1\\.048576(0)*e\\+(0)*6$", MR_Next},
+           {"}", MR_Next}});
+ADD_CASES(TC_JSONOut,
+          {{"\"name\": \"BM_Counters_Thousands/repeats:2\",$"},
+           {"\"run_name\": \"BM_Counters_Thousands/repeats:2\",$", MR_Next},
+           {"\"run_type\": \"iteration\",$", MR_Next},
+           {"\"repetitions\": 2,$", MR_Next},
+           {"\"repetition_index\": 1,$", MR_Next},
+           {"\"threads\": 1,$", MR_Next},
+           {"\"iterations\": %int,$", MR_Next},
+           {"\"real_time\": %float,$", MR_Next},
+           {"\"cpu_time\": %float,$", MR_Next},
+           {"\"time_unit\": \"ns\",$", MR_Next},
+           {"\"t0_1000000DefaultBase\": 1\\.(0)*e\\+(0)*6,$", MR_Next},
+           {"\"t1_1000000Base1000\": 1\\.(0)*e\\+(0)*6,$", MR_Next},
+           {"\"t2_1000000Base1024\": 1\\.(0)*e\\+(0)*6,$", MR_Next},
+           {"\"t3_1048576Base1000\": 1\\.048576(0)*e\\+(0)*6,$", MR_Next},
+           {"\"t4_1048576Base1024\": 1\\.048576(0)*e\\+(0)*6$", MR_Next},
+           {"}", MR_Next}});
+ADD_CASES(TC_JSONOut,
+          {{"\"name\": \"BM_Counters_Thousands/repeats:2_mean\",$"},
+           {"\"run_name\": \"BM_Counters_Thousands/repeats:2\",$", MR_Next},
+           {"\"run_type\": \"aggregate\",$", MR_Next},
+           {"\"repetitions\": 2,$", MR_Next},
+           {"\"threads\": 1,$", MR_Next},
+           {"\"aggregate_name\": \"mean\",$", MR_Next},
+           {"\"iterations\": 2,$", MR_Next},
+           {"\"real_time\": %float,$", MR_Next},
+           {"\"cpu_time\": %float,$", MR_Next},
+           {"\"time_unit\": \"ns\",$", MR_Next},
+           {"\"t0_1000000DefaultBase\": 1\\.(0)*e\\+(0)*6,$", MR_Next},
+           {"\"t1_1000000Base1000\": 1\\.(0)*e\\+(0)*6,$", MR_Next},
+           {"\"t2_1000000Base1024\": 1\\.(0)*e\\+(0)*6,$", MR_Next},
+           {"\"t3_1048576Base1000\": 1\\.048576(0)*e\\+(0)*6,$", MR_Next},
+           {"\"t4_1048576Base1024\": 1\\.048576(0)*e\\+(0)*6$", MR_Next},
+           {"}", MR_Next}});
+ADD_CASES(TC_JSONOut,
+          {{"\"name\": \"BM_Counters_Thousands/repeats:2_median\",$"},
+           {"\"run_name\": \"BM_Counters_Thousands/repeats:2\",$", MR_Next},
+           {"\"run_type\": \"aggregate\",$", MR_Next},
+           {"\"repetitions\": 2,$", MR_Next},
+           {"\"threads\": 1,$", MR_Next},
+           {"\"aggregate_name\": \"median\",$", MR_Next},
+           {"\"iterations\": 2,$", MR_Next},
+           {"\"real_time\": %float,$", MR_Next},
+           {"\"cpu_time\": %float,$", MR_Next},
+           {"\"time_unit\": \"ns\",$", MR_Next},
+           {"\"t0_1000000DefaultBase\": 1\\.(0)*e\\+(0)*6,$", MR_Next},
+           {"\"t1_1000000Base1000\": 1\\.(0)*e\\+(0)*6,$", MR_Next},
+           {"\"t2_1000000Base1024\": 1\\.(0)*e\\+(0)*6,$", MR_Next},
+           {"\"t3_1048576Base1000\": 1\\.048576(0)*e\\+(0)*6,$", MR_Next},
+           {"\"t4_1048576Base1024\": 1\\.048576(0)*e\\+(0)*6$", MR_Next},
+           {"}", MR_Next}});
+ADD_CASES(TC_JSONOut,
+          {{"\"name\": \"BM_Counters_Thousands/repeats:2_stddev\",$"},
+           {"\"run_name\": \"BM_Counters_Thousands/repeats:2\",$", MR_Next},
+           {"\"run_type\": \"aggregate\",$", MR_Next},
+           {"\"repetitions\": 2,$", MR_Next},
+           {"\"threads\": 1,$", MR_Next},
+           {"\"aggregate_name\": \"stddev\",$", MR_Next},
+           {"\"iterations\": 2,$", MR_Next},
+           {"\"real_time\": %float,$", MR_Next},
+           {"\"cpu_time\": %float,$", MR_Next},
+           {"\"time_unit\": \"ns\",$", MR_Next},
+           {"\"t0_1000000DefaultBase\": 0\\.(0)*e\\+(0)*,$", MR_Next},
+           {"\"t1_1000000Base1000\": 0\\.(0)*e\\+(0)*,$", MR_Next},
+           {"\"t2_1000000Base1024\": 0\\.(0)*e\\+(0)*,$", MR_Next},
+           {"\"t3_1048576Base1000\": 0\\.(0)*e\\+(0)*,$", MR_Next},
+           {"\"t4_1048576Base1024\": 0\\.(0)*e\\+(0)*$", MR_Next},
+           {"}", MR_Next}});
+
+ADD_CASES(
+    TC_CSVOut,
+    {{"^\"BM_Counters_Thousands/"
+      "repeats:2\",%csv_report,1e\\+(0)*6,1e\\+(0)*6,1e\\+(0)*6,1\\.04858e\\+("
+      "0)*6,1\\.04858e\\+(0)*6$"},
+     {"^\"BM_Counters_Thousands/"
+      "repeats:2\",%csv_report,1e\\+(0)*6,1e\\+(0)*6,1e\\+(0)*6,1\\.04858e\\+("
+      "0)*6,1\\.04858e\\+(0)*6$"},
+     {"^\"BM_Counters_Thousands/"
+      "repeats:2_mean\",%csv_report,1e\\+(0)*6,1e\\+(0)*6,1e\\+(0)*6,1\\."
+      "04858e\\+(0)*6,1\\.04858e\\+(0)*6$"},
+     {"^\"BM_Counters_Thousands/"
+      "repeats:2_median\",%csv_report,1e\\+(0)*6,1e\\+(0)*6,1e\\+(0)*6,1\\."
+      "04858e\\+(0)*6,1\\.04858e\\+(0)*6$"},
+     {"^\"BM_Counters_Thousands/repeats:2_stddev\",%csv_report,0,0,0,0,0$"}});
+// VS2013 does not allow this function to be passed as a lambda argument
+// to CHECK_BENCHMARK_RESULTS()
+// Verifies the five counters of the plain BM_Counters_Thousands repetition
+// results; entries with any other name (the aggregates) are not checked.
+void CheckThousands(Results const& e) {
+  const bool is_repetition_run = (e.name == "BM_Counters_Thousands/repeats:2");
+  if (!is_repetition_run) return;  // Do not check the aggregates!
+
+  const int kDecimalMillion = 1000 * 1000;
+  const int kBinaryMillion = 1024 * 1024;
+  // Each counter has to be within 0.01% of the value it was created with.
+  CHECK_FLOAT_COUNTER_VALUE(e, "t0_1000000DefaultBase", EQ, kDecimalMillion,
+                            0.0001);
+  CHECK_FLOAT_COUNTER_VALUE(e, "t1_1000000Base1000", EQ, kDecimalMillion,
+                            0.0001);
+  CHECK_FLOAT_COUNTER_VALUE(e, "t2_1000000Base1024", EQ, kDecimalMillion,
+                            0.0001);
+  CHECK_FLOAT_COUNTER_VALUE(e, "t3_1048576Base1000", EQ, kBinaryMillion,
+                            0.0001);
+  CHECK_FLOAT_COUNTER_VALUE(e, "t4_1048576Base1024", EQ, kBinaryMillion,
+                            0.0001);
+}
+CHECK_BENCHMARK_RESULTS("BM_Counters_Thousands", &CheckThousands);
+
+// ========================================================================= //
+// --------------------------- TEST CASES END ------------------------------ //
+// ========================================================================= //
+
+// Test entry point: forwards the command line to RunOutputTests().
+int main(int argc, char* argv[]) {
+  RunOutputTests(argc, argv);
+  return 0;
+}
diff --git a/tools/BUILD.bazel b/tools/BUILD.bazel
new file mode 100644
index 0000000..5895883
--- /dev/null
+++ b/tools/BUILD.bazel
@@ -0,0 +1,19 @@
+load("@py_deps//:requirements.bzl", "requirement")
+
+# Python sources under gbench/, packaged as a library; the "compare" binary
+# in this file depends on it. numpy and scipy are resolved through the
+# requirement() helper loaded from @py_deps.
+py_library(
+    name = "gbench",
+    srcs = glob(["gbench/*.py"]),
+    deps = [
+        requirement("numpy"),
+        requirement("scipy"),
+    ],
+)
+
+# Runnable target for compare.py, built on top of the :gbench library.
+# NOTE(review): python_version pins this target to Python 2 -- confirm that
+# is still intended.
+py_binary(
+    name = "compare",
+    srcs = ["compare.py"],
+    python_version = "PY2",
+    deps = [
+        ":gbench",
+    ],
+)
diff --git a/tools/compare.py b/tools/compare.py
new file mode 100755
index 0000000..66eed93
--- /dev/null
+++ b/tools/compare.py
@@ -0,0 +1,429 @@
+#!/usr/bin/env python
+
+import unittest
+"""
+compare.py - versatile benchmark output compare tool
+"""
+
+import argparse
+from argparse import ArgumentParser
+import json
+import sys
+import gbench
+from gbench import util, report
+from gbench.util import *
+
+
+def check_inputs(in1, in2, flags):
+    """
+    Perform checking on the user provided inputs and diagnose any abnormalities
+    """
+    in1_kind, in1_err = classify_input_file(in1)
+    in2_kind, in2_err = classify_input_file(in2)
+    output_file = find_benchmark_flag('--benchmark_out=', flags)
+    output_type = find_benchmark_flag('--benchmark_out_format=', flags)
+    if in1_kind == IT_Executable and in2_kind == IT_Executable and output_file:
+        print(("WARNING: '--benchmark_out=%s' will be passed to both "
+               "benchmarks causing it to be overwritten") % output_file)
+    if in1_kind == IT_JSON and in2_kind == IT_JSON and len(flags) > 0:
+        print("WARNING: passing optional flags has no effect since both "
+              "inputs are JSON")
+    if output_type is not None and output_type != 'json':
+        print(("ERROR: passing '--benchmark_out_format=%s' to 'compare.py`"
+               " is not supported.") % output_type)
+        sys.exit(1)
+
+
+def create_parser():
+    """
+    Build and return the ArgumentParser of the tool.
+
+    Global options: -a/--display_aggregates_only, --no-color,
+    -d/--dump_to_json, --no-utest and --alpha.
+
+    Sub-commands (stored in 'mode'): 'benchmarks', 'filters' and
+    'benchmarksfiltered'. Each of them ends with 'benchmark_options', which
+    collects all the remaining arguments.
+    """
+    parser = ArgumentParser(
+        description='versatile benchmark output compare tool')
+
+    parser.add_argument(
+        '-a',
+        '--display_aggregates_only',
+        dest='display_aggregates_only',
+        action="store_true",
+        help="If there are repetitions, by default, we display everything - the"
+             " actual runs, and the aggregates computed. Sometimes, it is "
+             "desirable to only view the aggregates. E.g. when there are a lot "
+             "of repetitions. Do note that only the display is affected. "
+             "Internally, all the actual runs are still used, e.g. for U test.")
+
+    parser.add_argument(
+        '--no-color',
+        dest='color',
+        default=True,
+        action="store_false",
+        help="Do not use colors in the terminal output"
+    )
+
+    parser.add_argument(
+        '-d',
+        '--dump_to_json',
+        dest='dump_to_json',
+        help="Additionally, dump benchmark comparison output to this file in JSON format.")
+
+    # Options controlling the Mann-Whitney U test.
+    utest = parser.add_argument_group()
+    utest.add_argument(
+        '--no-utest',
+        dest='utest',
+        default=True,
+        action="store_false",
+        help="The tool can do a two-tailed Mann-Whitney U test with the null hypothesis that it is equally likely that a randomly selected value from one sample will be less than or greater than a randomly selected value from a second sample.\nWARNING: requires **LARGE** (no less than {}) number of repetitions to be meaningful!\nThe test is being done by default, if at least {} repetitions were done.\nThis option can disable the U Test.".format(report.UTEST_OPTIMAL_REPETITIONS, report.UTEST_MIN_REPETITIONS))
+    alpha_default = 0.05
+    utest.add_argument(
+        "--alpha",
+        dest='utest_alpha',
+        default=alpha_default,
+        type=float,
+        help=("significance level alpha. if the calculated p-value is below this value, then the result is said to be statistically significant and the null hypothesis is rejected.\n(default: %0.4f)") %
+        alpha_default)
+
+    # The chosen sub-command name ends up in args.mode (None when omitted).
+    subparsers = parser.add_subparsers(
+        help='This tool has multiple modes of operation:',
+        dest='mode')
+
+    # Mode 'benchmarks': <test_baseline> <test_contender> [options...]
+    parser_a = subparsers.add_parser(
+        'benchmarks',
+        help='The most simple use-case, compare all the output of these two benchmarks')
+    baseline = parser_a.add_argument_group(
+        'baseline', 'The benchmark baseline')
+    baseline.add_argument(
+        'test_baseline',
+        metavar='test_baseline',
+        type=argparse.FileType('r'),
+        nargs=1,
+        help='A benchmark executable or JSON output file')
+    contender = parser_a.add_argument_group(
+        'contender', 'The benchmark that will be compared against the baseline')
+    contender.add_argument(
+        'test_contender',
+        metavar='test_contender',
+        type=argparse.FileType('r'),
+        nargs=1,
+        help='A benchmark executable or JSON output file')
+    parser_a.add_argument(
+        'benchmark_options',
+        metavar='benchmark_options',
+        nargs=argparse.REMAINDER,
+        help='Arguments to pass when running benchmark executables')
+
+    # Mode 'filters': <test> <filter_baseline> <filter_contender> [options...]
+    parser_b = subparsers.add_parser(
+        'filters', help='Compare filter one with the filter two of benchmark')
+    baseline = parser_b.add_argument_group(
+        'baseline', 'The benchmark baseline')
+    baseline.add_argument(
+        'test',
+        metavar='test',
+        type=argparse.FileType('r'),
+        nargs=1,
+        help='A benchmark executable or JSON output file')
+    baseline.add_argument(
+        'filter_baseline',
+        metavar='filter_baseline',
+        type=str,
+        nargs=1,
+        help='The first filter, that will be used as baseline')
+    contender = parser_b.add_argument_group(
+        'contender', 'The benchmark that will be compared against the baseline')
+    contender.add_argument(
+        'filter_contender',
+        metavar='filter_contender',
+        type=str,
+        nargs=1,
+        help='The second filter, that will be compared against the baseline')
+    parser_b.add_argument(
+        'benchmark_options',
+        metavar='benchmark_options',
+        nargs=argparse.REMAINDER,
+        help='Arguments to pass when running benchmark executables')
+
+    # Mode 'benchmarksfiltered': <test_baseline> <filter_baseline>
+    # <test_contender> <filter_contender> [options...]
+    parser_c = subparsers.add_parser(
+        'benchmarksfiltered',
+        help='Compare filter one of first benchmark with filter two of the second benchmark')
+    baseline = parser_c.add_argument_group(
+        'baseline', 'The benchmark baseline')
+    baseline.add_argument(
+        'test_baseline',
+        metavar='test_baseline',
+        type=argparse.FileType('r'),
+        nargs=1,
+        help='A benchmark executable or JSON output file')
+    baseline.add_argument(
+        'filter_baseline',
+        metavar='filter_baseline',
+        type=str,
+        nargs=1,
+        help='The first filter, that will be used as baseline')
+    contender = parser_c.add_argument_group(
+        'contender', 'The benchmark that will be compared against the baseline')
+    contender.add_argument(
+        'test_contender',
+        metavar='test_contender',
+        type=argparse.FileType('r'),
+        nargs=1,
+        help='The second benchmark executable or JSON output file, that will be compared against the baseline')
+    contender.add_argument(
+        'filter_contender',
+        metavar='filter_contender',
+        type=str,
+        nargs=1,
+        help='The second filter, that will be compared against the baseline')
+    parser_c.add_argument(
+        'benchmark_options',
+        metavar='benchmark_options',
+        nargs=argparse.REMAINDER,
+        help='Arguments to pass when running benchmark executables')
+
+    return parser
+
+
+def main():
+    # Parse the command line flags
+    parser = create_parser()
+    args, unknown_args = parser.parse_known_args()
+    if args.mode is None:
+        parser.print_help()
+        exit(1)
+    assert not unknown_args
+    benchmark_options = args.benchmark_options
+
+    if args.mode == 'benchmarks':
+        test_baseline = args.test_baseline[0].name
+        test_contender = args.test_contender[0].name
+        filter_baseline = ''
+        filter_contender = ''
+
+        # NOTE: if test_baseline == test_contender, you are analyzing the stdev
+
+        description = 'Comparing %s to %s' % (test_baseline, test_contender)
+    elif args.mode == 'filters':
+        test_baseline = args.test[0].name
+        test_contender = args.test[0].name
+        filter_baseline = args.filter_baseline[0]
+        filter_contender = args.filter_contender[0]
+
+        # NOTE: if filter_baseline == filter_contender, you are analyzing the
+        # stdev
+
+        description = 'Comparing %s to %s (from %s)' % (
+            filter_baseline, filter_contender, args.test[0].name)
+    elif args.mode == 'benchmarksfiltered':
+        test_baseline = args.test_baseline[0].name
+        test_contender = args.test_contender[0].name
+        filter_baseline = args.filter_baseline[0]
+        filter_contender = args.filter_contender[0]
+
+        # NOTE: if test_baseline == test_contender and
+        # filter_baseline == filter_contender, you are analyzing the stdev
+
+        description = 'Comparing %s (from %s) to %s (from %s)' % (
+            filter_baseline, test_baseline, filter_contender, test_contender)
+    else:
+        # should never happen
+        print("Unrecognized mode of operation: '%s'" % args.mode)
+        parser.print_help()
+        exit(1)
+
+    check_inputs(test_baseline, test_contender, benchmark_options)
+
+    if args.display_aggregates_only:
+        benchmark_options += ['--benchmark_display_aggregates_only=true']
+
+    options_baseline = []
+    options_contender = []
+
+    if filter_baseline and filter_contender:
+        options_baseline = ['--benchmark_filter=%s' % filter_baseline]
+        options_contender = ['--benchmark_filter=%s' % filter_contender]
+
+    # Run the benchmarks and report the results
+    json1 = json1_orig = gbench.util.run_or_load_benchmark(
+        test_baseline, benchmark_options + options_baseline)
+    json2 = json2_orig = gbench.util.run_or_load_benchmark(
+        test_contender, benchmark_options + options_contender)
+
+    # Now, filter the benchmarks so that the difference report can work
+    if filter_baseline and filter_contender:
+        replacement = '[%s vs. %s]' % (filter_baseline, filter_contender)
+        json1 = gbench.report.filter_benchmark(
+            json1_orig, filter_baseline, replacement)
+        json2 = gbench.report.filter_benchmark(
+            json2_orig, filter_contender, replacement)
+
+    diff_report = gbench.report.get_difference_report(
+        json1, json2, args.utest)
+    output_lines = gbench.report.print_difference_report(
+        diff_report,
+        args.display_aggregates_only,
+        args.utest, args.utest_alpha, args.color)
+    print(description)
+    for ln in output_lines:
+        print(ln)
+
+    # Optionally, diff and output to JSON
+    if args.dump_to_json is not None:
+        with open(args.dump_to_json, 'w') as f_json:
+            json.dump(diff_report, f_json)
+
+class TestParser(unittest.TestCase):
+    """Exercise create_parser() for each sub-command and flag combination."""
+
+    def setUp(self):
+        # The JSON fixtures sit next to this script, under gbench/Inputs.
+        here = os.path.dirname(os.path.realpath(__file__))
+        inputs_dir = os.path.join(here, 'gbench', 'Inputs')
+        self.parser = create_parser()
+        self.testInput0 = os.path.join(inputs_dir, 'test1_run1.json')
+        self.testInput1 = os.path.join(inputs_dir, 'test1_run2.json')
+
+    def test_benchmarks_basic(self):
+        argv = ['benchmarks', self.testInput0, self.testInput1]
+        ns = self.parser.parse_args(argv)
+        self.assertFalse(ns.display_aggregates_only)
+        self.assertTrue(ns.utest)
+        self.assertEqual(ns.mode, 'benchmarks')
+        self.assertEqual(ns.test_baseline[0].name, self.testInput0)
+        self.assertEqual(ns.test_contender[0].name, self.testInput1)
+        self.assertFalse(ns.benchmark_options)
+
+    def test_benchmarks_basic_without_utest(self):
+        argv = ['--no-utest', 'benchmarks', self.testInput0, self.testInput1]
+        ns = self.parser.parse_args(argv)
+        self.assertFalse(ns.display_aggregates_only)
+        self.assertFalse(ns.utest)
+        self.assertEqual(ns.utest_alpha, 0.05)
+        self.assertEqual(ns.mode, 'benchmarks')
+        self.assertEqual(ns.test_baseline[0].name, self.testInput0)
+        self.assertEqual(ns.test_contender[0].name, self.testInput1)
+        self.assertFalse(ns.benchmark_options)
+
+    def test_benchmarks_basic_display_aggregates_only(self):
+        argv = ['-a', 'benchmarks', self.testInput0, self.testInput1]
+        ns = self.parser.parse_args(argv)
+        self.assertTrue(ns.display_aggregates_only)
+        self.assertTrue(ns.utest)
+        self.assertEqual(ns.mode, 'benchmarks')
+        self.assertEqual(ns.test_baseline[0].name, self.testInput0)
+        self.assertEqual(ns.test_contender[0].name, self.testInput1)
+        self.assertFalse(ns.benchmark_options)
+
+    def test_benchmarks_basic_with_utest_alpha(self):
+        argv = ['--alpha=0.314', 'benchmarks', self.testInput0, self.testInput1]
+        ns = self.parser.parse_args(argv)
+        self.assertFalse(ns.display_aggregates_only)
+        self.assertTrue(ns.utest)
+        self.assertEqual(ns.utest_alpha, 0.314)
+        self.assertEqual(ns.mode, 'benchmarks')
+        self.assertEqual(ns.test_baseline[0].name, self.testInput0)
+        self.assertEqual(ns.test_contender[0].name, self.testInput1)
+        self.assertFalse(ns.benchmark_options)
+
+    def test_benchmarks_basic_without_utest_with_utest_alpha(self):
+        argv = ['--no-utest', '--alpha=0.314', 'benchmarks', self.testInput0, self.testInput1]
+        ns = self.parser.parse_args(argv)
+        self.assertFalse(ns.display_aggregates_only)
+        self.assertFalse(ns.utest)
+        self.assertEqual(ns.utest_alpha, 0.314)
+        self.assertEqual(ns.mode, 'benchmarks')
+        self.assertEqual(ns.test_baseline[0].name, self.testInput0)
+        self.assertEqual(ns.test_contender[0].name, self.testInput1)
+        self.assertFalse(ns.benchmark_options)
+
+    def test_benchmarks_with_remainder(self):
+        argv = ['benchmarks', self.testInput0, self.testInput1, 'd']
+        ns = self.parser.parse_args(argv)
+        self.assertFalse(ns.display_aggregates_only)
+        self.assertTrue(ns.utest)
+        self.assertEqual(ns.mode, 'benchmarks')
+        self.assertEqual(ns.test_baseline[0].name, self.testInput0)
+        self.assertEqual(ns.test_contender[0].name, self.testInput1)
+        self.assertEqual(ns.benchmark_options, ['d'])
+
+    def test_benchmarks_with_remainder_after_doubleminus(self):
+        argv = ['benchmarks', self.testInput0, self.testInput1, '--', 'e']
+        ns = self.parser.parse_args(argv)
+        self.assertFalse(ns.display_aggregates_only)
+        self.assertTrue(ns.utest)
+        self.assertEqual(ns.mode, 'benchmarks')
+        self.assertEqual(ns.test_baseline[0].name, self.testInput0)
+        self.assertEqual(ns.test_contender[0].name, self.testInput1)
+        self.assertEqual(ns.benchmark_options, ['e'])
+
+    def test_filters_basic(self):
+        argv = ['filters', self.testInput0, 'c', 'd']
+        ns = self.parser.parse_args(argv)
+        self.assertFalse(ns.display_aggregates_only)
+        self.assertTrue(ns.utest)
+        self.assertEqual(ns.mode, 'filters')
+        self.assertEqual(ns.test[0].name, self.testInput0)
+        self.assertEqual(ns.filter_baseline[0], 'c')
+        self.assertEqual(ns.filter_contender[0], 'd')
+        self.assertFalse(ns.benchmark_options)
+
+    def test_filters_with_remainder(self):
+        argv = ['filters', self.testInput0, 'c', 'd', 'e']
+        ns = self.parser.parse_args(argv)
+        self.assertFalse(ns.display_aggregates_only)
+        self.assertTrue(ns.utest)
+        self.assertEqual(ns.mode, 'filters')
+        self.assertEqual(ns.test[0].name, self.testInput0)
+        self.assertEqual(ns.filter_baseline[0], 'c')
+        self.assertEqual(ns.filter_contender[0], 'd')
+        self.assertEqual(ns.benchmark_options, ['e'])
+
+    def test_filters_with_remainder_after_doubleminus(self):
+        argv = ['filters', self.testInput0, 'c', 'd', '--', 'f']
+        ns = self.parser.parse_args(argv)
+        self.assertFalse(ns.display_aggregates_only)
+        self.assertTrue(ns.utest)
+        self.assertEqual(ns.mode, 'filters')
+        self.assertEqual(ns.test[0].name, self.testInput0)
+        self.assertEqual(ns.filter_baseline[0], 'c')
+        self.assertEqual(ns.filter_contender[0], 'd')
+        self.assertEqual(ns.benchmark_options, ['f'])
+
+    def test_benchmarksfiltered_basic(self):
+        argv = ['benchmarksfiltered', self.testInput0, 'c', self.testInput1, 'e']
+        ns = self.parser.parse_args(argv)
+        self.assertFalse(ns.display_aggregates_only)
+        self.assertTrue(ns.utest)
+        self.assertEqual(ns.mode, 'benchmarksfiltered')
+        self.assertEqual(ns.test_baseline[0].name, self.testInput0)
+        self.assertEqual(ns.filter_baseline[0], 'c')
+        self.assertEqual(ns.test_contender[0].name, self.testInput1)
+        self.assertEqual(ns.filter_contender[0], 'e')
+        self.assertFalse(ns.benchmark_options)
+
+    def test_benchmarksfiltered_with_remainder(self):
+        argv = ['benchmarksfiltered', self.testInput0, 'c', self.testInput1, 'e', 'f']
+        ns = self.parser.parse_args(argv)
+        self.assertFalse(ns.display_aggregates_only)
+        self.assertTrue(ns.utest)
+        self.assertEqual(ns.mode, 'benchmarksfiltered')
+        self.assertEqual(ns.test_baseline[0].name, self.testInput0)
+        self.assertEqual(ns.filter_baseline[0], 'c')
+        self.assertEqual(ns.test_contender[0].name, self.testInput1)
+        self.assertEqual(ns.filter_contender[0], 'e')
+        self.assertEqual(ns.benchmark_options[0], 'f')
+
+    def test_benchmarksfiltered_with_remainder_after_doubleminus(self):
+        argv = ['benchmarksfiltered', self.testInput0, 'c', self.testInput1, 'e', '--', 'g']
+        ns = self.parser.parse_args(argv)
+        self.assertFalse(ns.display_aggregates_only)
+        self.assertTrue(ns.utest)
+        self.assertEqual(ns.mode, 'benchmarksfiltered')
+        self.assertEqual(ns.test_baseline[0].name, self.testInput0)
+        self.assertEqual(ns.filter_baseline[0], 'c')
+        self.assertEqual(ns.test_contender[0].name, self.testInput1)
+        self.assertEqual(ns.filter_contender[0], 'e')
+        self.assertEqual(ns.benchmark_options[0], 'g')
+
+
+if __name__ == '__main__':
+    # unittest.main()  # use instead of main() to run the TestParser cases
+    main()
+
+# vim: tabstop=4 expandtab shiftwidth=4 softtabstop=4
+# kate: tab-width: 4; replace-tabs on; indent-width 4; tab-indents: off;
+# kate: indent-mode python; remove-trailing-spaces modified;
diff --git a/tools/gbench/Inputs/test1_run1.json b/tools/gbench/Inputs/test1_run1.json
new file mode 100644
index 0000000..601e327
--- /dev/null
+++ b/tools/gbench/Inputs/test1_run1.json
@@ -0,0 +1,119 @@
+{
+  "context": {
+    "date": "2016-08-02 17:44:46",
+    "num_cpus": 4,
+    "mhz_per_cpu": 4228,
+    "cpu_scaling_enabled": false,
+    "library_build_type": "release"
+  },
+  "benchmarks": [
+    {
+      "name": "BM_SameTimes",
+      "iterations": 1000,
+      "real_time": 10,
+      "cpu_time": 10,
+      "time_unit": "ns"
+    },
+    {
+      "name": "BM_2xFaster",
+      "iterations": 1000,
+      "real_time": 50,
+      "cpu_time": 50,
+      "time_unit": "ns"
+    },
+    {
+      "name": "BM_2xSlower",
+      "iterations": 1000,
+      "real_time": 50,
+      "cpu_time": 50,
+      "time_unit": "ns"
+    },
+    {
+      "name": "BM_1PercentFaster",
+      "iterations": 1000,
+      "real_time": 100,
+      "cpu_time": 100,
+      "time_unit": "ns"
+    },
+    {
+      "name": "BM_1PercentSlower",
+      "iterations": 1000,
+      "real_time": 100,
+      "cpu_time": 100,
+      "time_unit": "ns"
+    },
+    {
+      "name": "BM_10PercentFaster",
+      "iterations": 1000,
+      "real_time": 100,
+      "cpu_time": 100,
+      "time_unit": "ns"
+    },
+    {
+      "name": "BM_10PercentSlower",
+      "iterations": 1000,
+      "real_time": 100,
+      "cpu_time": 100,
+      "time_unit": "ns"
+    },
+    {
+      "name": "BM_100xSlower",
+      "iterations": 1000,
+      "real_time": 100,
+      "cpu_time": 100,
+      "time_unit": "ns"
+    },
+    {
+      "name": "BM_100xFaster",
+      "iterations": 1000,
+      "real_time": 10000,
+      "cpu_time": 10000,
+      "time_unit": "ns"
+    },
+    {
+      "name": "BM_10PercentCPUToTime",
+      "iterations": 1000,
+      "real_time": 100,
+      "cpu_time": 100,
+      "time_unit": "ns"
+    },
+    {
+      "name": "BM_ThirdFaster",
+      "iterations": 1000,
+      "real_time": 100,
+      "cpu_time": 100,
+      "time_unit": "ns"
+    },
+    {
+      "name": "MyComplexityTest_BigO",
+      "run_name": "MyComplexityTest",
+      "run_type": "aggregate",
+      "aggregate_name": "BigO",
+      "cpu_coefficient": 4.2749856294592886e+00,
+      "real_coefficient": 6.4789275289789780e+00,
+      "big_o": "N",
+      "time_unit": "ns"
+    },
+    {
+      "name": "MyComplexityTest_RMS",
+      "run_name": "MyComplexityTest",
+      "run_type": "aggregate",
+      "aggregate_name": "RMS",
+      "rms": 4.5097802512472874e-03
+    },
+    {
+      "name": "BM_NotBadTimeUnit",
+      "iterations": 1000,
+      "real_time": 0.4,
+      "cpu_time": 0.5,
+      "time_unit": "s"
+    },
+    {
+      "name": "BM_DifferentTimeUnit",
+      "iterations": 1,
+      "real_time": 1,
+      "cpu_time": 1,
+      "time_unit": "s"
+    }
+  ]
+}
diff --git a/tools/gbench/Inputs/test1_run2.json b/tools/gbench/Inputs/test1_run2.json
new file mode 100644
index 0000000..3cbcf39
--- /dev/null
+++ b/tools/gbench/Inputs/test1_run2.json
@@ -0,0 +1,119 @@
+{
+  "context": {
+    "date": "2016-08-02 17:44:46",
+    "num_cpus": 4,
+    "mhz_per_cpu": 4228,
+    "cpu_scaling_enabled": false,
+    "library_build_type": "release"
+  },
+  "benchmarks": [
+    {
+      "name": "BM_SameTimes",
+      "iterations": 1000,
+      "real_time": 10,
+      "cpu_time": 10,
+      "time_unit": "ns"
+    },
+    {
+      "name": "BM_2xFaster",
+      "iterations": 1000,
+      "real_time": 25,
+      "cpu_time": 25,
+      "time_unit": "ns"
+    },
+    {
+      "name": "BM_2xSlower",
+      "iterations": 20833333,
+      "real_time": 100,
+      "cpu_time": 100,
+      "time_unit": "ns"
+    },
+    {
+      "name": "BM_1PercentFaster",
+      "iterations": 1000,
+      "real_time": 98.9999999,
+      "cpu_time": 98.9999999,
+      "time_unit": "ns"
+    },
+    {
+      "name": "BM_1PercentSlower",
+      "iterations": 1000,
+      "real_time": 100.9999999,
+      "cpu_time": 100.9999999,
+      "time_unit": "ns"
+    },
+    {
+      "name": "BM_10PercentFaster",
+      "iterations": 1000,
+      "real_time": 90,
+      "cpu_time": 90,
+      "time_unit": "ns"
+    },
+    {
+      "name": "BM_10PercentSlower",
+      "iterations": 1000,
+      "real_time": 110,
+      "cpu_time": 110,
+      "time_unit": "ns"
+    },
+    {
+      "name": "BM_100xSlower",
+      "iterations": 1000,
+      "real_time": 1.0000e+04,
+      "cpu_time": 1.0000e+04,
+      "time_unit": "ns"
+    },
+    {
+      "name": "BM_100xFaster",
+      "iterations": 1000,
+      "real_time": 100,
+      "cpu_time": 100,
+      "time_unit": "ns"
+    },
+    {
+      "name": "BM_10PercentCPUToTime",
+      "iterations": 1000,
+      "real_time": 110,
+      "cpu_time": 90,
+      "time_unit": "ns"
+    },
+    {
+      "name": "BM_ThirdFaster",
+      "iterations": 1000,
+      "real_time": 66.665,
+      "cpu_time": 66.664,
+      "time_unit": "ns"
+    },
+    {
+      "name": "MyComplexityTest_BigO",
+      "run_name": "MyComplexityTest",
+      "run_type": "aggregate",
+      "aggregate_name": "BigO",
+      "cpu_coefficient": 5.6215779594361486e+00,
+      "real_coefficient": 5.6288314793554610e+00,
+      "big_o": "N",
+      "time_unit": "ns"
+    },
+    {
+      "name": "MyComplexityTest_RMS",
+      "run_name": "MyComplexityTest",
+      "run_type": "aggregate",
+      "aggregate_name": "RMS",
+      "rms": 3.3128901852342174e-03
+    },
+    {
+      "name": "BM_NotBadTimeUnit",
+      "iterations": 1000,
+      "real_time": 0.04,
+      "cpu_time": 0.6,
+      "time_unit": "s"
+    },
+    {
+      "name": "BM_DifferentTimeUnit",
+      "iterations": 1,
+      "real_time": 1,
+      "cpu_time": 1,
+      "time_unit": "ns"
+    }
+  ]
+}
diff --git a/tools/gbench/Inputs/test2_run.json b/tools/gbench/Inputs/test2_run.json
new file mode 100644
index 0000000..15bc698
--- /dev/null
+++ b/tools/gbench/Inputs/test2_run.json
@@ -0,0 +1,81 @@
+{
+  "context": {
+    "date": "2016-08-02 17:44:46",
+    "num_cpus": 4,
+    "mhz_per_cpu": 4228,
+    "cpu_scaling_enabled": false,
+    "library_build_type": "release"
+  },
+  "benchmarks": [
+    {
+      "name": "BM_Hi",
+      "iterations": 1234,
+      "real_time": 42,
+      "cpu_time": 24,
+      "time_unit": "ms"
+    },
+    {
+      "name": "BM_Zero",
+      "iterations": 1000,
+      "real_time": 10,
+      "cpu_time": 10,
+      "time_unit": "ns"
+    },
+    {
+      "name": "BM_Zero/4",
+      "iterations": 4000,
+      "real_time": 40,
+      "cpu_time": 40,
+      "time_unit": "ns"
+    },
+    {
+      "name": "Prefix/BM_Zero",
+      "iterations": 2000,
+      "real_time": 20,
+      "cpu_time": 20,
+      "time_unit": "ns"
+    },
+    {
+      "name": "Prefix/BM_Zero/3",
+      "iterations": 3000,
+      "real_time": 30,
+      "cpu_time": 30,
+      "time_unit": "ns"
+    },
+    {
+      "name": "BM_One",
+      "iterations": 5000,
+      "real_time": 5,
+      "cpu_time": 5,
+      "time_unit": "ns"
+    },
+    {
+      "name": "BM_One/4",
+      "iterations": 2000,
+      "real_time": 20,
+      "cpu_time": 20,
+      "time_unit": "ns"
+    },
+    {
+      "name": "Prefix/BM_One",
+      "iterations": 1000,
+      "real_time": 10,
+      "cpu_time": 10,
+      "time_unit": "ns"
+    },
+    {
+      "name": "Prefix/BM_One/3",
+      "iterations": 1500,
+      "real_time": 15,
+      "cpu_time": 15,
+      "time_unit": "ns"
+    },
+    {
+      "name": "BM_Bye",
+      "iterations": 5321,
+      "real_time": 11,
+      "cpu_time": 63,
+      "time_unit": "ns"
+    }
+  ]
+}
diff --git a/tools/gbench/Inputs/test3_run0.json b/tools/gbench/Inputs/test3_run0.json
new file mode 100644
index 0000000..49f8b06
--- /dev/null
+++ b/tools/gbench/Inputs/test3_run0.json
@@ -0,0 +1,65 @@
+{
+  "context": {
+    "date": "2016-08-02 17:44:46",
+    "num_cpus": 4,
+    "mhz_per_cpu": 4228,
+    "cpu_scaling_enabled": false,
+    "library_build_type": "release"
+  },
+  "benchmarks": [
+    {
+      "name": "BM_One",
+      "run_type": "aggregate",
+      "iterations": 1000,
+      "real_time": 10,
+      "cpu_time": 100,
+      "time_unit": "ns"
+    },
+    {
+      "name": "BM_Two",
+      "iterations": 1000,
+      "real_time": 9,
+      "cpu_time": 90,
+      "time_unit": "ns"
+    },
+    {
+      "name": "BM_Two",
+      "iterations": 1000,
+      "real_time": 8,
+      "cpu_time": 86,
+      "time_unit": "ns"
+    },
+    {
+      "name": "short",
+      "run_type": "aggregate",
+      "iterations": 1000,
+      "real_time": 8,
+      "cpu_time": 80,
+      "time_unit": "ns"
+    },
+    {
+      "name": "short",
+      "run_type": "aggregate",
+      "iterations": 1000,
+      "real_time": 8,
+      "cpu_time": 77,
+      "time_unit": "ns"
+    },
+    {
+      "name": "medium",
+      "run_type": "iteration",
+      "iterations": 1000,
+      "real_time": 8,
+      "cpu_time": 80,
+      "time_unit": "ns"
+    },
+    {
+      "name": "medium",
+      "run_type": "iteration",
+      "iterations": 1000,
+      "real_time": 9,
+      "cpu_time": 82,
+      "time_unit": "ns"
+    }
+  ]
+}
diff --git a/tools/gbench/Inputs/test3_run1.json b/tools/gbench/Inputs/test3_run1.json
new file mode 100644
index 0000000..acc5ba1
--- /dev/null
+++ b/tools/gbench/Inputs/test3_run1.json
@@ -0,0 +1,65 @@
+{
+  "context": {
+    "date": "2016-08-02 17:44:46",
+    "num_cpus": 4,
+    "mhz_per_cpu": 4228,
+    "cpu_scaling_enabled": false,
+    "library_build_type": "release"
+  },
+  "benchmarks": [
+    {
+      "name": "BM_One",
+      "iterations": 1000,
+      "real_time": 9,
+      "cpu_time": 110,
+      "time_unit": "ns"
+    },
+    {
+      "name": "BM_Two",
+      "run_type": "aggregate",
+      "iterations": 1000,
+      "real_time": 10,
+      "cpu_time": 89,
+      "time_unit": "ns"
+    },
+    {
+      "name": "BM_Two",
+      "iterations": 1000,
+      "real_time": 7,
+      "cpu_time": 72,
+      "time_unit": "ns"
+    },
+    {
+      "name": "short",
+      "run_type": "aggregate",
+      "iterations": 1000,
+      "real_time": 7,
+      "cpu_time": 75,
+      "time_unit": "ns"
+    },
+    {
+      "name": "short",
+      "run_type": "aggregate",
+      "iterations": 762,
+      "real_time": 4.54,
+      "cpu_time": 66.6,
+      "time_unit": "ns"
+    },
+    {
+      "name": "short",
+      "run_type": "iteration",
+      "iterations": 1000,
+      "real_time": 800,
+      "cpu_time": 1,
+      "time_unit": "ns"
+    },
+    {
+      "name": "medium",
+      "run_type": "iteration",
+      "iterations": 1200,
+      "real_time": 5,
+      "cpu_time": 53,
+      "time_unit": "ns"
+    }
+  ]
+}
diff --git a/tools/gbench/__init__.py b/tools/gbench/__init__.py
new file mode 100644
index 0000000..fce1a1a
--- /dev/null
+++ b/tools/gbench/__init__.py
@@ -0,0 +1,8 @@
+"""Google Benchmark tooling"""
+
+__author__ = 'Eric Fiselier'
+__email__ = 'eric@efcs.ca'
+__versioninfo__ = (0, 5, 0)  # numeric components, joined with '.' below
+__version__ = '.'.join(str(v) for v in __versioninfo__) + 'dev'  # '0.5.0dev'
+
+__all__ = []  # a star-import of this package exports no names
diff --git a/tools/gbench/report.py b/tools/gbench/report.py
new file mode 100644
index 0000000..bf29492
--- /dev/null
+++ b/tools/gbench/report.py
@@ -0,0 +1,903 @@
+"""report.py - Utilities for reporting statistics about benchmark results
+"""
+import copy
+import os
+import re
+import unittest
+
+from scipy.stats import mannwhitneyu
+
+
+class BenchmarkColor(object):
+    """Pairs a color name with a code string; formatting yields the code."""
+    def __init__(self, name, code):
+        self.name, self.code = name, code
+
+    def __repr__(self):
+        # Renders as the class name followed by the (name, code) tuple.
+        return '%s%r' % (type(self).__name__, (self.name, self.code))
+
+    def __format__(self, format):
+        return self.code  # the format spec is not consulted
+
+
+# Benchmark color palette; each non-empty code is an ANSI escape sequence.
+BC_NONE = BenchmarkColor('NONE', '')  # color_format swaps this in when color is off
+BC_MAGENTA = BenchmarkColor('MAGENTA', '\033[95m')
+BC_CYAN = BenchmarkColor('CYAN', '\033[96m')
+BC_OKBLUE = BenchmarkColor('OKBLUE', '\033[94m')
+BC_OKGREEN = BenchmarkColor('OKGREEN', '\033[32m')
+BC_HEADER = BenchmarkColor('HEADER', '\033[92m')
+BC_WARNING = BenchmarkColor('WARNING', '\033[93m')
+BC_WHITE = BenchmarkColor('WHITE', '\033[97m')
+BC_FAIL = BenchmarkColor('FAIL', '\033[91m')
+BC_ENDC = BenchmarkColor('ENDC', '\033[0m')
+BC_BOLD = BenchmarkColor('BOLD', '\033[1m')
+BC_UNDERLINE = BenchmarkColor('UNDERLINE', '\033[4m')
+
+UTEST_MIN_REPETITIONS = 2  # calc_utest returns no p-values below this count
+UTEST_OPTIMAL_REPETITIONS = 9  # Lowest reasonable number; more is better.
+UTEST_COL_NAME = "_pvalue"  # suffix print_utest appends to the benchmark name
+
+
+def color_format(use_color, fmt_str, *args, **kwargs):
+    """
+    Format 'fmt_str' with 'args' and 'kwargs' through str.format. When
+    'use_color' is False, every BenchmarkColor argument is swapped for
+    BC_NONE first, so it renders as the empty string.
+    """
+    assert isinstance(use_color, bool)
+    if use_color:
+        return fmt_str.format(*args, **kwargs)
+
+    def plain(value):
+        return BC_NONE if isinstance(value, BenchmarkColor) else value
+    return fmt_str.format(*[plain(arg) for arg in args],
+                          **{key: plain(arg) for key, arg in kwargs.items()})
+
+
+def find_longest_name(benchmark_list):
+    """
+    Return the length of the longest 'name' among the given benchmark
+    JSON objects. The result is never below 1, so an empty list (or one
+    holding only empty names) yields 1.
+    """
+    # Seed the candidates with 1 to give max() that floor.
+    name_lengths = [1]
+    name_lengths.extend(len(entry['name']) for entry in benchmark_list)
+    return max(name_lengths)
+
+
+def calculate_change(old_val, new_val):
+    """Return the relative change from old_val to new_val as a float."""
+    if old_val != 0:
+        # Usual case: the difference as a fraction of the old magnitude.
+        return float(new_val - old_val) / abs(old_val)
+    if new_val == 0:
+        return 0.0
+    # Zero baseline: divide by the mean of the two values instead.
+    return float(new_val - old_val) / (float(old_val + new_val) / 2)
+
+
+def filter_benchmark(json_orig, family, replacement=""):
+    """
+    Keep only the benchmarks whose name matches the regex 'family'; every
+    match in the name is replaced by 'replacement', inserted literally.
+    """
+    regex = re.compile(family)
+    filtered = {'benchmarks': []}
+    for be in json_orig['benchmarks']:
+        if regex.search(be['name']):
+            bench = copy.deepcopy(be)  # Do NOT modify the old name!
+            # A callable repl skips template parsing, so '\d' etc. is safe.
+            bench['name'] = regex.sub(lambda _m: replacement, bench['name'])
+            filtered['benchmarks'].append(bench)
+    return filtered
+
+
+def get_unique_benchmark_names(json):
+    """Return every distinct benchmark 'name', ordered by first appearance."""
+    seen = set()
+    uniqued = []
+    for bench in json['benchmarks']:
+        if bench['name'] not in seen:
+            seen.add(bench['name'])
+            uniqued.append(bench['name'])
+    return uniqued
+
+
+def intersect(list1, list2):
+    """
+    Return the elements of list1 that also occur in list2, in list1's
+    order; repeated elements of list1 are kept.
+    """
+    return list(filter(lambda item: item in list2, list1))
+
+
+def is_potentially_comparable_benchmark(x):  # True if x has all timing keys
+    return all(key in x for key in ('time_unit', 'real_time', 'cpu_time'))
+
+
+def partition_benchmarks(json1, json2):
+    """
+    Group the entries of both inputs by common benchmark name, in order.
+    Returns [lhs, rhs] pairs; entries without timing fields are left out.
+    """
+    json1_unique_names = get_unique_benchmark_names(json1)
+    json2_unique_names = get_unique_benchmark_names(json2)
+    names = intersect(json1_unique_names, json2_unique_names)
+    partitions = []
+    for name in names:
+        time_unit = None
+        # Pick the time unit from the first entry of the lhs benchmark.
+        # We should be careful not to crash with unexpected input.
+        for x in json1['benchmarks']:
+            if (x['name'] == name and is_potentially_comparable_benchmark(x)):
+                time_unit = x['time_unit']
+                break
+        if time_unit is None:
+            continue
+        def same_family(x):
+            # Check the keys first: a bare x['time_unit'] raises KeyError.
+            return (x['name'] == name and
+                    is_potentially_comparable_benchmark(x) and
+                    x['time_unit'] == time_unit)
+        lhs = [x for x in json1['benchmarks'] if same_family(x)]
+        rhs = [x for x in json2['benchmarks'] if same_family(x)]
+        partitions.append([lhs, rhs])
+    return partitions
+
+
+def extract_field(partition, field_name):
+    """Return [lhs, rhs] value lists of 'field_name'; sizes may differ."""
+    def pick(entries):
+        return [entry[field_name] for entry in entries]
+    return [pick(partition[0]), pick(partition[1])]
+
+
+def calc_utest(timings_cpu, timings_time):
+    """
+    Two-sided Mann-Whitney U test on the [lhs, rhs] cpu and real timings.
+    P-values are None if a sample has under UTEST_MIN_REPETITIONS items.
+    """
+    min_rep_cnt = min(len(sample)
+                      for pair in (timings_time, timings_cpu)
+                      for sample in (pair[0], pair[1]))
+    if min_rep_cnt < UTEST_MIN_REPETITIONS:
+        return False, None, None
+
+    def pvalue(lhs, rhs):
+        return mannwhitneyu(lhs, rhs, alternative='two-sided').pvalue
+    time_pvalue = pvalue(timings_time[0], timings_time[1])
+    cpu_pvalue = pvalue(timings_cpu[0], timings_cpu[1])
+    return (min_rep_cnt >= UTEST_OPTIMAL_REPETITIONS), cpu_pvalue, time_pvalue
+
+def print_utest(bc_name, utest, utest_alpha, first_col_width, use_color=True):
+    """
+    Format the U-test row for one benchmark as a list holding a single
+    string, or return [] when 'utest' carries no p-values at all.
+    """
+    optimal = utest['have_optimal_repetitions']
+    # Check if we failed miserably with minimum required repetitions for utest
+    if not optimal and utest['cpu_pvalue'] is None and utest['time_pvalue'] is None:
+        return []
+
+    def pvalue_color(pval):
+        # A p-value at or above alpha maps to BC_FAIL, otherwise BC_OKGREEN.
+        if pval >= utest_alpha:
+            return BC_FAIL
+        return BC_OKGREEN
+
+    dsc_color = BC_OKGREEN if optimal else BC_WARNING
+    dsc = "U Test, Repetitions: {} vs {}".format(
+        utest['nr_of_repetitions'], utest['nr_of_repetitions_other'])
+    if not optimal:
+        # We still got some results to show but issue a warning about it.
+        dsc += ". WARNING: Results unreliable! {}+ repetitions recommended.".format(
+            UTEST_OPTIMAL_REPETITIONS)
+    row_fmt = "{}{:<{}s}{endc}{}{:16.4f}{endc}{}{:16.4f}{endc}{}      {}"
+    row_name = "{}{}".format(bc_name, UTEST_COL_NAME)
+    time_pvalue, cpu_pvalue = utest['time_pvalue'], utest['cpu_pvalue']
+    return [color_format(use_color, row_fmt, BC_HEADER,
+                         row_name, first_col_width,
+                         pvalue_color(time_pvalue), time_pvalue,
+                         pvalue_color(cpu_pvalue), cpu_pvalue,
+                         dsc_color, dsc, endc=BC_ENDC)]
+
+
+def get_difference_report(json1, json2, utest=False):
+    """
+    Calculate the difference between each test of two benchmark runs
+    specified as 'json1' and 'json2'. Returns a list with one dict per
+    compared benchmark name, holding the paired measurements and, when
+    'utest' is True and p-values were computed, the U-test results.
+    """
+    assert utest is True or utest is False
+
+    diff_report = []
+    for partition in partition_benchmarks(json1, json2):
+        first = partition[0][0]
+        benchmark_name = first['name']
+        time_unit = first['time_unit']
+        measurements = []
+        utest_results = {}
+        # Repetition counts may differ; zip() pairs only the common prefix.
+        for bn, other_bench in zip(partition[0], partition[1]):
+            measurements.append({
+                'real_time': bn['real_time'],
+                'cpu_time': bn['cpu_time'],
+                'real_time_other': other_bench['real_time'],
+                'cpu_time_other': other_bench['cpu_time'],
+                'time': calculate_change(bn['real_time'], other_bench['real_time']),
+                'cpu': calculate_change(bn['cpu_time'], other_bench['cpu_time'])
+            })
+
+        # After processing the whole partition, if requested, do the U test.
+        if utest:
+            timings_cpu = extract_field(partition, 'cpu_time')
+            timings_time = extract_field(partition, 'real_time')
+            have_optimal_repetitions, cpu_pvalue, time_pvalue = calc_utest(
+                timings_cpu, timings_time)
+            # Compare against None: a p-value of exactly 0.0 is falsy but
+            # is still a computed result that must be reported.
+            if cpu_pvalue is not None and time_pvalue is not None:
+                utest_results = {
+                    'have_optimal_repetitions': have_optimal_repetitions,
+                    'cpu_pvalue': cpu_pvalue,
+                    'time_pvalue': time_pvalue,
+                    'nr_of_repetitions': len(timings_cpu[0]),
+                    'nr_of_repetitions_other': len(timings_cpu[1])
+                }
+
+        # Store only if we had any measurements for given benchmark.
+        # E.g. partition_benchmarks will filter out the benchmarks having
+        # time units which are not compatible with other time units in the
+        # benchmark suite.
+        if measurements:
+            run_type = first.get('run_type', '')
+            aggregate_name = (first.get('aggregate_name', '')
+                              if run_type == 'aggregate' else '')
+            diff_report.append({
+                'name': benchmark_name,
+                'measurements': measurements,
+                'time_unit': time_unit,
+                'run_type': run_type,
+                'aggregate_name': aggregate_name,
+                'utest': utest_results
+            })
+
+    return diff_report
+
+
+def print_difference_report(
+        json_diff_report,
+        include_aggregates_only=False,
+        utest=False,
+        utest_alpha=0.05,
+        use_color=True):
+    """
+    Render 'json_diff_report', a list of per-benchmark dicts carrying
+    'name', 'measurements' and 'utest', as a list of output lines: a
+    header, a dashed rule, one row per measurement and, when 'utest' is
+    True and the benchmark has U-test results, one extra U-test row.
+    With 'include_aggregates_only', entries whose 'run_type' is present
+    and not 'aggregate' are left out. Nothing is printed here.
+    """
+    assert utest is True or utest is False
+
+    def get_color(res):
+        # 'res' is a relative change: above 0.05 maps to BC_FAIL, at or
+        # below -0.07 to BC_CYAN, anything in between to BC_WHITE.
+        if res > 0.05:
+            return BC_FAIL
+        return BC_WHITE if res > -0.07 else BC_CYAN
+
+    # The first column fits the longest benchmark name (at least the
+    # 'Benchmark' title) plus the suffix used on U-test rows.
+    name_width = max(find_longest_name(json_diff_report), len('Benchmark'))
+    first_col_width = name_width + len(UTEST_COL_NAME)
+    first_line = "{:<{}s}Time             CPU      Time Old      Time New       CPU Old       CPU New".format(
+        'Benchmark', 12 + first_col_width)
+    output_strs = [first_line, '-' * len(first_line)]
+
+    fmt_str = "{}{:<{}s}{endc}{}{:+16.4f}{endc}{}{:+16.4f}{endc}{:14.0f}{:14.0f}{endc}{:14.0f}{:14.0f}"
+    for benchmark in json_diff_report:
+        # When only aggregates were asked for, skip every entry that
+        # carries a 'run_type' other than 'aggregate'.
+        only_aggregates = include_aggregates_only and 'run_type' in benchmark
+        if only_aggregates and benchmark['run_type'] != 'aggregate':
+            continue
+
+        for measurement in benchmark['measurements']:
+            row = color_format(
+                use_color,
+                fmt_str,
+                BC_HEADER, benchmark['name'], first_col_width,
+                get_color(measurement['time']), measurement['time'],
+                get_color(measurement['cpu']), measurement['cpu'],
+                measurement['real_time'], measurement['real_time_other'],
+                measurement['cpu_time'], measurement['cpu_time_other'],
+                endc=BC_ENDC)
+            output_strs.append(row)
+
+        # Once the measurement rows are in place, add the U-test row if
+        # it was requested and this benchmark has U-test results
+        # (benchmark['utest'] is non-empty).
+        if utest and benchmark['utest']:
+            utest_lines = print_utest(
+                benchmark['name'],
+                benchmark['utest'],
+                utest_alpha=utest_alpha,
+                first_col_width=first_col_width,
+                use_color=use_color)
+            output_strs.extend(utest_lines)
+
+    return output_strs
+
+
+###############################################################################
+# Unit tests
+
+
+class TestGetUniqueBenchmarkNames(unittest.TestCase):
+    """Checks get_unique_benchmark_names() against Inputs/test3_run0.json."""
+
+    def load_results(self):
+        """Parse and return the 'test3_run0.json' fixture."""
+        import json
+        here = os.path.dirname(os.path.realpath(__file__))
+        fixture = os.path.join(here, 'Inputs', 'test3_run0.json')
+        with open(fixture, 'r') as f:
+            return json.load(f)
+
+    def test_basic(self):
+        """The unique names come back in exactly the expected order."""
+        expect_lines = [
+            'BM_One',
+            'BM_Two',
+            'short',  # These two are not sorted
+            'medium',  # These two are not sorted
+        ]
+        output_lines = get_unique_benchmark_names(self.load_results())
+        print("\n")
+        print("\n".join(output_lines))
+        self.assertEqual(len(output_lines), len(expect_lines))
+        # The lengths are equal at this point, so zip() pairs every line.
+        for got, want in zip(output_lines, expect_lines):
+            self.assertEqual(want, got)
+
+
+class TestReportDifference(unittest.TestCase):
+    """Diffs the 'test1' fixture runs; checks the report and its printout."""
+
+    @classmethod
+    def setUpClass(cls):
+        """Compute the difference report once for all tests of the class."""
+        import json
+
+        def load_run(file_name):
+            # Fixtures are read from the 'Inputs' directory beside this file.
+            path = os.path.join(
+                os.path.dirname(os.path.realpath(__file__)),
+                'Inputs', file_name)
+            with open(path, 'r') as f:
+                return json.load(f)
+
+        first_run = load_run('test1_run1.json')
+        second_run = load_run('test1_run2.json')
+        cls.json_diff_report = get_difference_report(first_run, second_run)
+
+    def test_json_diff_report_pretty_printing(self):
+        """Every printed row splits into the seven expected columns."""
+        expect_lines = [
+            ['BM_SameTimes', '+0.0000', '+0.0000', '10', '10', '10', '10'],
+            ['BM_2xFaster', '-0.5000', '-0.5000', '50', '25', '50', '25'],
+            ['BM_2xSlower', '+1.0000', '+1.0000', '50', '100', '50', '100'],
+            ['BM_1PercentFaster', '-0.0100', '-0.0100', '100', '99', '100', '99'],
+            ['BM_1PercentSlower', '+0.0100', '+0.0100', '100', '101', '100', '101'],
+            ['BM_10PercentFaster', '-0.1000', '-0.1000', '100', '90', '100', '90'],
+            ['BM_10PercentSlower', '+0.1000', '+0.1000', '100', '110', '100', '110'],
+            ['BM_100xSlower', '+99.0000', '+99.0000',
+                '100', '10000', '100', '10000'],
+            ['BM_100xFaster', '-0.9900', '-0.9900',
+                '10000', '100', '10000', '100'],
+            ['BM_10PercentCPUToTime', '+0.1000',
+                '-0.1000', '100', '110', '100', '90'],
+            ['BM_ThirdFaster', '-0.3333', '-0.3334', '100', '67', '100', '67'],
+            ['BM_NotBadTimeUnit', '-0.9000', '+0.2000', '0', '0', '0', '1'],
+        ]
+        report = print_difference_report(
+            self.json_diff_report, use_color=False)
+        print("\n")
+        print("\n".join(report))
+        self.assertEqual(len(report[2:]), len(expect_lines))
+        for want, line in zip(expect_lines, report[2:]):
+            fields = list(filter(None, line.split(' ')))
+            self.assertEqual(len(fields), 7)
+            self.assertEqual(want, fields)
+
+    def test_json_diff_report_output(self):
+        """The report entries carry the expected names, units and values."""
+        expected_output = [
+            {
+                'name': 'BM_SameTimes',
+                'measurements': [{'time': 0.0000, 'cpu': 0.0000, 'real_time': 10, 'real_time_other': 10, 'cpu_time': 10, 'cpu_time_other': 10}],
+                'time_unit': 'ns',
+                'utest': {}
+            },
+            {
+                'name': 'BM_2xFaster',
+                'measurements': [{'time': -0.5000, 'cpu': -0.5000, 'real_time': 50, 'real_time_other': 25, 'cpu_time': 50, 'cpu_time_other': 25}],
+                'time_unit': 'ns',
+                'utest': {}
+            },
+            {
+                'name': 'BM_2xSlower',
+                'measurements': [{'time': 1.0000, 'cpu': 1.0000, 'real_time': 50, 'real_time_other': 100, 'cpu_time': 50, 'cpu_time_other': 100}],
+                'time_unit': 'ns',
+                'utest': {}
+            },
+            {
+                'name': 'BM_1PercentFaster',
+                'measurements': [{'time': -0.0100, 'cpu': -0.0100, 'real_time': 100, 'real_time_other': 98.9999999, 'cpu_time': 100, 'cpu_time_other': 98.9999999}],
+                'time_unit': 'ns',
+                'utest': {}
+            },
+            {
+                'name': 'BM_1PercentSlower',
+                'measurements': [{'time': 0.0100, 'cpu': 0.0100, 'real_time': 100, 'real_time_other': 101, 'cpu_time': 100, 'cpu_time_other': 101}],
+                'time_unit': 'ns',
+                'utest': {}
+            },
+            {
+                'name': 'BM_10PercentFaster',
+                'measurements': [{'time': -0.1000, 'cpu': -0.1000, 'real_time': 100, 'real_time_other': 90, 'cpu_time': 100, 'cpu_time_other': 90}],
+                'time_unit': 'ns',
+                'utest': {}
+            },
+            {
+                'name': 'BM_10PercentSlower',
+                'measurements': [{'time': 0.1000, 'cpu': 0.1000, 'real_time': 100, 'real_time_other': 110, 'cpu_time': 100, 'cpu_time_other': 110}],
+                'time_unit': 'ns',
+                'utest': {}
+            },
+            {
+                'name': 'BM_100xSlower',
+                'measurements': [{'time': 99.0000, 'cpu': 99.0000, 'real_time': 100, 'real_time_other': 10000, 'cpu_time': 100, 'cpu_time_other': 10000}],
+                'time_unit': 'ns',
+                'utest': {}
+            },
+            {
+                'name': 'BM_100xFaster',
+                'measurements': [{'time': -0.9900, 'cpu': -0.9900, 'real_time': 10000, 'real_time_other': 100, 'cpu_time': 10000, 'cpu_time_other': 100}],
+                'time_unit': 'ns',
+                'utest': {}
+            },
+            {
+                'name': 'BM_10PercentCPUToTime',
+                'measurements': [{'time': 0.1000, 'cpu': -0.1000, 'real_time': 100, 'real_time_other': 110, 'cpu_time': 100, 'cpu_time_other': 90}],
+                'time_unit': 'ns',
+                'utest': {}
+            },
+            {
+                'name': 'BM_ThirdFaster',
+                'measurements': [{'time': -0.3333, 'cpu': -0.3334, 'real_time': 100, 'real_time_other': 67, 'cpu_time': 100, 'cpu_time_other': 67}],
+                'time_unit': 'ns',
+                'utest': {}
+            },
+            {
+                'name': 'BM_NotBadTimeUnit',
+                'measurements': [{'time': -0.9000, 'cpu': 0.2000, 'real_time': 0.4, 'real_time_other': 0.04, 'cpu_time': 0.5, 'cpu_time_other': 0.6}],
+                'time_unit': 's',
+                'utest': {}
+            },
+        ]
+        self.assertEqual(len(self.json_diff_report), len(expected_output))
+        for actual, wanted in zip(self.json_diff_report, expected_output):
+            self.assertEqual(actual['name'], wanted['name'])
+            self.assertEqual(actual['time_unit'], wanted['time_unit'])
+            assert_utest(self, actual, wanted)
+            assert_measurements(self, actual, wanted)
+
+
+class TestReportDifferenceBetweenFamilies(unittest.TestCase):
+    """Diffs two benchmark families filtered out of one fixture run."""
+
+    @classmethod
+    def setUpClass(cls):
+        """Compute the difference report once for all tests of the class."""
+        import json
+        fixture = os.path.join(
+            os.path.dirname(os.path.realpath(__file__)),
+            'Inputs', 'test2_run.json')
+        with open(fixture, 'r') as f:
+            run = json.load(f)
+
+        # Two families are selected by regex. The third argument presumably
+        # replaces the matched name part (expected names contain '.').
+        lhs_family = filter_benchmark(run, "BM_Z.ro", ".")
+        rhs_family = filter_benchmark(run, "BM_O.e", ".")
+        cls.json_diff_report = get_difference_report(lhs_family, rhs_family)
+
+    def test_json_diff_report_pretty_printing(self):
+        """Every printed row splits into the seven expected columns."""
+        expect_lines = [
+            ['.', '-0.5000', '-0.5000', '10', '5', '10', '5'],
+            ['./4', '-0.5000', '-0.5000', '40', '20', '40', '20'],
+            ['Prefix/.', '-0.5000', '-0.5000', '20', '10', '20', '10'],
+            ['Prefix/./3', '-0.5000', '-0.5000', '30', '15', '30', '15'],
+        ]
+        report = print_difference_report(
+            self.json_diff_report, use_color=False)
+        print("\n")
+        print("\n".join(report))
+        self.assertEqual(len(report[2:]), len(expect_lines))
+        for want, line in zip(expect_lines, report[2:]):
+            fields = list(filter(None, line.split(' ')))
+            self.assertEqual(len(fields), 7)
+            self.assertEqual(want, fields)
+
+    def test_json_diff_report(self):
+        """The report entries carry the expected names, units and values."""
+        expected_output = [
+            {
+                'name': u'.',
+                'measurements': [{'time': -0.5, 'cpu': -0.5, 'real_time': 10, 'real_time_other': 5, 'cpu_time': 10, 'cpu_time_other': 5}],
+                'time_unit': 'ns',
+                'utest': {}
+            },
+            {
+                'name': u'./4',
+                'measurements': [{'time': -0.5, 'cpu': -0.5, 'real_time': 40, 'real_time_other': 20, 'cpu_time': 40, 'cpu_time_other': 20}],
+                'time_unit': 'ns',
+                'utest': {},
+            },
+            {
+                'name': u'Prefix/.',
+                'measurements': [{'time': -0.5, 'cpu': -0.5, 'real_time': 20, 'real_time_other': 10, 'cpu_time': 20, 'cpu_time_other': 10}],
+                'time_unit': 'ns',
+                'utest': {}
+            },
+            {
+                'name': u'Prefix/./3',
+                'measurements': [{'time': -0.5, 'cpu': -0.5, 'real_time': 30, 'real_time_other': 15, 'cpu_time': 30, 'cpu_time_other': 15}],
+                'time_unit': 'ns',
+                'utest': {}
+            }
+        ]
+        self.assertEqual(len(self.json_diff_report), len(expected_output))
+        for actual, wanted in zip(self.json_diff_report, expected_output):
+            self.assertEqual(actual['name'], wanted['name'])
+            self.assertEqual(actual['time_unit'], wanted['time_unit'])
+            assert_utest(self, actual, wanted)
+            assert_measurements(self, actual, wanted)
+
+
+class TestReportDifferenceWithUTest(unittest.TestCase):
+    """Diffs the 'test3' fixture runs with the U test turned on."""
+
+    @classmethod
+    def setUpClass(cls):
+        """Compute the difference report once for all tests of the class."""
+        import json
+
+        def load_run(file_name):
+            # Fixtures are read from the 'Inputs' directory beside this file.
+            path = os.path.join(
+                os.path.dirname(os.path.realpath(__file__)),
+                'Inputs', file_name)
+            with open(path, 'r') as f:
+                return json.load(f)
+
+        first_run = load_run('test3_run0.json')
+        second_run = load_run('test3_run1.json')
+        cls.json_diff_report = get_difference_report(
+            first_run, second_run, utest=True)
+
+    def test_json_diff_report_pretty_printing(self):
+        """Measurement rows and U test rows are printed as expected."""
+        expect_lines = [
+            ['BM_One', '-0.1000', '+0.1000', '10', '9', '100', '110'],
+            ['BM_Two', '+0.1111', '-0.0111', '9', '10', '90', '89'],
+            ['BM_Two', '-0.1250', '-0.1628', '8', '7', '86', '72'],
+            ['BM_Two_pvalue',
+             '0.6985',
+             '0.6985',
+             'U',
+             'Test,',
+             'Repetitions:',
+             '2',
+             'vs',
+             '2.',
+             'WARNING:',
+             'Results',
+             'unreliable!',
+             '9+',
+             'repetitions',
+             'recommended.'],
+            ['short', '-0.1250', '-0.0625', '8', '7', '80', '75'],
+            ['short', '-0.4325', '-0.1351', '8', '5', '77', '67'],
+            ['short_pvalue',
+             '0.7671',
+             '0.1489',
+             'U',
+             'Test,',
+             'Repetitions:',
+             '2',
+             'vs',
+             '3.',
+             'WARNING:',
+             'Results',
+             'unreliable!',
+             '9+',
+             'repetitions',
+             'recommended.'],
+            ['medium', '-0.3750', '-0.3375', '8', '5', '80', '53'],
+        ]
+        report = print_difference_report(
+            self.json_diff_report, utest=True, utest_alpha=0.05, use_color=False)
+        print("\n")
+        print("\n".join(report))
+        self.assertEqual(len(report[2:]), len(expect_lines))
+        for want, line in zip(expect_lines, report[2:]):
+            fields = list(filter(None, line.split(' ')))
+            self.assertEqual(want, fields)
+
+    def test_json_diff_report(self):
+        """The report entries carry the expected names, units and values."""
+        expected_output = [
+            {
+                'name': u'BM_One',
+                'measurements': [
+                    {'time': -0.1,
+                     'cpu': 0.1,
+                     'real_time': 10,
+                     'real_time_other': 9,
+                     'cpu_time': 100,
+                     'cpu_time_other': 110}
+                ],
+                'time_unit': 'ns',
+                'utest': {}
+            },
+            {
+                'name': u'BM_Two',
+                'measurements': [
+                    {'time': 0.1111111111111111,
+                     'cpu': -0.011111111111111112,
+                     'real_time': 9,
+                     'real_time_other': 10,
+                     'cpu_time': 90,
+                     'cpu_time_other': 89},
+                    {'time': -0.125, 'cpu': -0.16279069767441862, 'real_time': 8,
+                        'real_time_other': 7, 'cpu_time': 86, 'cpu_time_other': 72}
+                ],
+                'time_unit': 'ns',
+                'utest': {
+                    'have_optimal_repetitions': False, 'cpu_pvalue': 0.6985353583033387, 'time_pvalue': 0.6985353583033387
+                }
+            },
+            {
+                'name': u'short',
+                'measurements': [
+                    {'time': -0.125,
+                     'cpu': -0.0625,
+                     'real_time': 8,
+                     'real_time_other': 7,
+                     'cpu_time': 80,
+                     'cpu_time_other': 75},
+                    {'time': -0.4325,
+                     'cpu': -0.13506493506493514,
+                     'real_time': 8,
+                     'real_time_other': 4.54,
+                     'cpu_time': 77,
+                     'cpu_time_other': 66.6}
+                ],
+                'time_unit': 'ns',
+                'utest': {
+                    'have_optimal_repetitions': False, 'cpu_pvalue': 0.14891467317876572, 'time_pvalue': 0.7670968684102772
+                }
+            },
+            {
+                'name': u'medium',
+                'measurements': [
+                    {'time': -0.375,
+                     'cpu': -0.3375,
+                     'real_time': 8,
+                     'real_time_other': 5,
+                     'cpu_time': 80,
+                     'cpu_time_other': 53}
+                ],
+                'time_unit': 'ns',
+                'utest': {}
+            }
+        ]
+        self.assertEqual(len(self.json_diff_report), len(expected_output))
+        for actual, wanted in zip(self.json_diff_report, expected_output):
+            self.assertEqual(actual['name'], wanted['name'])
+            self.assertEqual(actual['time_unit'], wanted['time_unit'])
+            assert_utest(self, actual, wanted)
+            assert_measurements(self, actual, wanted)
+
+
+class TestReportDifferenceWithUTestWhileDisplayingAggregatesOnly(
+        unittest.TestCase):
+    """Diffs the 'test3' fixture runs with the U test turned on."""
+
+    @classmethod
+    def setUpClass(cls):
+        """Compute the difference report once for all tests of the class."""
+        import json
+
+        def load_run(file_name):
+            # Fixtures are read from the 'Inputs' directory beside this file.
+            path = os.path.join(
+                os.path.dirname(os.path.realpath(__file__)),
+                'Inputs', file_name)
+            with open(path, 'r') as f:
+                return json.load(f)
+
+        first_run = load_run('test3_run0.json')
+        second_run = load_run('test3_run1.json')
+        cls.json_diff_report = get_difference_report(
+            first_run, second_run, utest=True)
+
+    def test_json_diff_report_pretty_printing(self):
+        """Measurement rows and U test rows are printed as expected."""
+        expect_lines = [
+            ['BM_One', '-0.1000', '+0.1000', '10', '9', '100', '110'],
+            ['BM_Two', '+0.1111', '-0.0111', '9', '10', '90', '89'],
+            ['BM_Two', '-0.1250', '-0.1628', '8', '7', '86', '72'],
+            ['BM_Two_pvalue',
+             '0.6985',
+             '0.6985',
+             'U',
+             'Test,',
+             'Repetitions:',
+             '2',
+             'vs',
+             '2.',
+             'WARNING:',
+             'Results',
+             'unreliable!',
+             '9+',
+             'repetitions',
+             'recommended.'],
+            ['short', '-0.1250', '-0.0625', '8', '7', '80', '75'],
+            ['short', '-0.4325', '-0.1351', '8', '5', '77', '67'],
+            ['short_pvalue',
+             '0.7671',
+             '0.1489',
+             'U',
+             'Test,',
+             'Repetitions:',
+             '2',
+             'vs',
+             '3.',
+             'WARNING:',
+             'Results',
+             'unreliable!',
+             '9+',
+             'repetitions',
+             'recommended.'],
+            ['medium', '-0.3750', '-0.3375', '8', '5', '80', '53']
+        ]
+        # NOTE(review): include_aggregates_only is not passed, despite the name.
+        report = print_difference_report(
+            self.json_diff_report, utest=True, utest_alpha=0.05, use_color=False)
+        print("\n")
+        print("\n".join(report))
+        self.assertEqual(len(report[2:]), len(expect_lines))
+        for want, line in zip(expect_lines, report[2:]):
+            fields = list(filter(None, line.split(' ')))
+            self.assertEqual(want, fields)
+
+    def test_json_diff_report(self):
+        """The report entries carry the expected names, units and values."""
+        expected_output = [
+            {
+                'name': u'BM_One',
+                'measurements': [
+                    {'time': -0.1,
+                     'cpu': 0.1,
+                     'real_time': 10,
+                     'real_time_other': 9,
+                     'cpu_time': 100,
+                     'cpu_time_other': 110}
+                ],
+                'time_unit': 'ns',
+                'utest': {}
+            },
+            {
+                'name': u'BM_Two',
+                'measurements': [
+                    {'time': 0.1111111111111111,
+                     'cpu': -0.011111111111111112,
+                     'real_time': 9,
+                     'real_time_other': 10,
+                     'cpu_time': 90,
+                     'cpu_time_other': 89},
+                    {'time': -0.125, 'cpu': -0.16279069767441862, 'real_time': 8,
+                        'real_time_other': 7, 'cpu_time': 86, 'cpu_time_other': 72}
+                ],
+                'time_unit': 'ns',
+                'utest': {
+                    'have_optimal_repetitions': False, 'cpu_pvalue': 0.6985353583033387, 'time_pvalue': 0.6985353583033387
+                }
+            },
+            {
+                'name': u'short',
+                'measurements': [
+                    {'time': -0.125,
+                     'cpu': -0.0625,
+                     'real_time': 8,
+                     'real_time_other': 7,
+                     'cpu_time': 80,
+                     'cpu_time_other': 75},
+                    {'time': -0.4325,
+                     'cpu': -0.13506493506493514,
+                     'real_time': 8,
+                     'real_time_other': 4.54,
+                     'cpu_time': 77,
+                     'cpu_time_other': 66.6}
+                ],
+                'time_unit': 'ns',
+                'utest': {
+                    'have_optimal_repetitions': False, 'cpu_pvalue': 0.14891467317876572, 'time_pvalue': 0.7670968684102772
+                }
+            },
+            {
+                'name': u'medium',
+                'measurements': [
+                    {'real_time_other': 5,
+                     'cpu_time': 80,
+                     'time': -0.375,
+                     'real_time': 8,
+                     'cpu_time_other': 53,
+                     'cpu': -0.3375
+                    }
+                ],
+                'utest': {},
+                'time_unit': u'ns',
+                'aggregate_name': ''
+            }
+        ]
+        self.assertEqual(len(self.json_diff_report), len(expected_output))
+        for actual, wanted in zip(self.json_diff_report, expected_output):
+            self.assertEqual(actual['name'], wanted['name'])
+            self.assertEqual(actual['time_unit'], wanted['time_unit'])
+            assert_utest(self, actual, wanted)
+            assert_measurements(self, actual, wanted)
+
+
+def assert_utest(unittest_instance, lhs, rhs):
+    """
+    Assert that the 'utest' entries of 'lhs' and 'rhs' agree: p-values are
+    compared approximately, 'have_optimal_repetitions' exactly.
+    """
+    actual, wanted = lhs['utest'], rhs['utest']
+    if not actual:
+        # Nothing on the lhs side, so the rhs side must be equally empty.
+        unittest_instance.assertEqual(actual, wanted)
+        return
+    for key in ('cpu_pvalue', 'time_pvalue'):
+        unittest_instance.assertAlmostEqual(actual[key], wanted[key])
+    unittest_instance.assertEqual(
+        actual['have_optimal_repetitions'], wanted['have_optimal_repetitions'])
+
+
+def assert_measurements(unittest_instance, lhs, rhs):
+    """Assert equal measurement counts (zip() would hide a gap) and values."""
+    unittest_instance.assertEqual(len(lhs['measurements']), len(rhs['measurements']))
+    for m1, m2 in zip(lhs['measurements'], rhs['measurements']):
+        for exact in ('real_time', 'cpu_time'):
+            unittest_instance.assertEqual(m1[exact], m2[exact])
+        for computed in ('time', 'cpu'):  # calculated, so only almost equal
+            unittest_instance.assertAlmostEqual(m1[computed], m2[computed], places=4)
+
+
+if __name__ == '__main__':
+    unittest.main()  # run the test cases above when executed as a script
+
+# vim: tabstop=4 expandtab shiftwidth=4 softtabstop=4
+# kate: tab-width: 4; replace-tabs on; indent-width 4; tab-indents: off;
+# kate: indent-mode python; remove-trailing-spaces modified;
diff --git a/tools/gbench/util.py b/tools/gbench/util.py
new file mode 100644
index 0000000..661c4ba
--- /dev/null
+++ b/tools/gbench/util.py
@@ -0,0 +1,163 @@
+"""util.py - General utilities for running, loading, and processing benchmarks
+"""
+import json
+import os
+import tempfile
+import subprocess
+import sys
+
+# Input file type enumeration, as returned by classify_input_file()
+IT_Invalid = 0  # missing, not a file, or neither of the kinds below
+IT_JSON = 1  # the file content parses as JSON
+IT_Executable = 2  # the file starts with an executable's magic bytes
+
+_num_magic_bytes = 2 if sys.platform.startswith('win') else 4  # 'MZ' is 2 bytes; ELF/Mach-O magics are 4
+
+
+def is_executable_file(filename):
+    """
+    Return 'True' if 'filename' names an existing file whose leading bytes
+    are the magic number of the current platform's executable format:
+    Mach-O on darwin, EXE ('MZ') on Windows and ELF everywhere else.
+    """
+    if not os.path.isfile(filename):
+        return False
+    with open(filename, mode='rb') as f:
+        magic_bytes = f.read(_num_magic_bytes)
+    if sys.platform.startswith('win'):
+        return magic_bytes == b'MZ'
+    if sys.platform != 'darwin':
+        return magic_bytes == b'\x7FELF'
+    mach_o_magics = (
+        b'\xfe\xed\xfa\xce',  # MH_MAGIC
+        b'\xce\xfa\xed\xfe',  # MH_CIGAM
+        b'\xfe\xed\xfa\xcf',  # MH_MAGIC_64
+        b'\xcf\xfa\xed\xfe',  # MH_CIGAM_64
+        b'\xca\xfe\xba\xbe',  # FAT_MAGIC
+        b'\xbe\xba\xfe\xca'   # FAT_CIGAM
+    )
+    return magic_bytes in mach_o_magics
+
+
+def is_json_file(filename):
+    """
+    Returns 'True' if 'filename' names a file whose content parses as JSON,
+    'False' otherwise (e.g. a missing, unreadable or malformed file).
+    """
+    try:
+        with open(filename, 'r') as f:
+            json.load(f)
+    except Exception:
+        # Any ordinary failure means "not JSON"; Ctrl-C/exit still propagate.
+        return False
+    return True
+
+
+def classify_input_file(filename):
+    """
+    Classify 'filename' and return a tuple (type, msg). 'type' is one of the
+    IT_* constants; 'msg' is None unless 'type' is 'IT_Invalid', in which
+    case it is a human readable string representing the error.
+    """
+    if not os.path.exists(filename):
+        return IT_Invalid, "'%s' does not exist" % filename
+    if not os.path.isfile(filename):
+        return IT_Invalid, "'%s' does not name a file" % filename
+    # The executable probe runs first; the content is only tried as JSON
+    # when the file does not start with an executable's magic bytes.
+    if is_executable_file(filename):
+        return IT_Executable, None
+    if is_json_file(filename):
+        return IT_JSON, None
+    return IT_Invalid, (
+        "'%s' does not name a valid benchmark executable or JSON file"
+        % filename)
+
+
+def check_input_file(filename):
+    """
+    Return the classification of the file named by 'filename'. A file that
+    classifies as 'IT_Invalid' is fatal: the reason is printed and the
+    program exits with status 1.
+    """
+    kind, reason = classify_input_file(filename)
+    if kind == IT_Invalid:
+        print("Invalid input file: %s" % reason)
+        sys.exit(1)
+    return kind
+
+
+def find_benchmark_flag(prefix, benchmark_flags):
+    """
+    Look through 'benchmark_flags' for flags of the form `<prefix><arg>` and
+    return the `<arg>` of the last such flag, or None if there is none.
+    'prefix' has to start with '--' and end with '='; this is checked with
+    an assertion.
+    """
+    assert prefix.startswith('--') and prefix.endswith('=')
+    # Collect the argument of every matching flag; the last one wins.
+    args = [flag[len(prefix):]
+            for flag in benchmark_flags if flag.startswith(prefix)]
+    return args[-1] if args else None
+
+
+def remove_benchmark_flags(prefix, benchmark_flags):
+    """
+    Build a new list holding every entry of 'benchmark_flags' that does not
+    start with 'prefix' (which must look like '--<name>='), order preserved.
+    """
+    assert prefix.startswith('--') and prefix.endswith('=')
+    return list(filter(lambda flag: not flag.startswith(prefix), benchmark_flags))
+
+
+def load_benchmark_results(fname):
+    """
+    Parse the JSON benchmark output stored in the file 'fname' and return it.
+    REQUIRES: 'fname' names a file containing JSON benchmark output.
+    """
+    with open(fname, 'r') as results_file:
+        return json.loads(results_file.read())
+
+
+def run_benchmark(exe_name, benchmark_flags):
+    """
+    Run the benchmark executable 'exe_name' with 'benchmark_flags' directly
+    as a subprocess (preserving real time console output) and return the
+    JSON object it wrote. Without a '--benchmark_out=' flag the output goes
+    to a temporary file, which is removed again even if the run fails.
+    Exits the program with the benchmark's exit code if that is non-zero.
+    """
+    benchmark_flags = list(benchmark_flags)
+    output_name = find_benchmark_flag('--benchmark_out=', benchmark_flags)
+    is_temp_output = output_name is None
+    if is_temp_output:
+        thandle, output_name = tempfile.mkstemp()
+        os.close(thandle)
+        benchmark_flags.append('--benchmark_out=%s' % output_name)
+
+    try:
+        cmd = [exe_name] + benchmark_flags
+        print("RUNNING: %s" % ' '.join(cmd))
+        exitCode = subprocess.call(cmd)
+        if exitCode != 0:
+            print('TEST FAILED...')
+            sys.exit(exitCode)
+        return load_benchmark_results(output_name)
+    finally:
+        if is_temp_output:
+            os.unlink(output_name)
+
+
+def run_or_load_benchmark(filename, benchmark_flags):
+    """
+    Produce the benchmark results for 'filename': an executable benchmark
+    is run with 'benchmark_flags', a JSON output file is simply loaded.
+    An invalid input makes check_input_file() end the program; the final
+    ValueError is only reached for a type this function does not handle.
+    """
+    ftype = check_input_file(filename)
+    if ftype == IT_Executable:
+        return run_benchmark(filename, benchmark_flags)
+    if ftype == IT_JSON:
+        return load_benchmark_results(filename)
+    raise ValueError('Unknown file type %s' % ftype)
diff --git a/tools/requirements.txt b/tools/requirements.txt
new file mode 100644
index 0000000..3b3331b
--- /dev/null
+++ b/tools/requirements.txt
@@ -0,0 +1 @@
+scipy>=1.5.0
+\ No newline at end of file
diff --git a/tools/strip_asm.py b/tools/strip_asm.py
new file mode 100755
index 0000000..9030550
--- /dev/null
+++ b/tools/strip_asm.py
@@ -0,0 +1,151 @@
+#!/usr/bin/env python
+
+"""
+strip_asm.py - Cleanup ASM output for the specified file
+"""
+
+from argparse import ArgumentParser
+import sys
+import os
+import re
+
+def find_used_labels(asm):
+    found = set()
+    label_re = re.compile("\s*j[a-z]+\s+\.L([a-zA-Z0-9][a-zA-Z0-9_]*)")
+    for l in asm.splitlines():
+        m = label_re.match(l)
+        if m:
+            found.add('.L%s' % m.group(1))
+    return found
+
+
+def normalize_labels(asm):
+    decls = set()
+    label_decl = re.compile("^[.]{0,1}L([a-zA-Z0-9][a-zA-Z0-9_]*)(?=:)")
+    for l in asm.splitlines():
+        m = label_decl.match(l)
+        if m:
+            decls.add(m.group(0))
+    if len(decls) == 0:
+        return asm
+    needs_dot = next(iter(decls))[0] != '.'
+    if not needs_dot:
+        return asm
+    for ld in decls:
+        asm = re.sub("(^|\s+)" + ld + "(?=:|\s)", '\\1.' + ld, asm)
+    return asm
+
+
+def transform_labels(asm):
+    asm = normalize_labels(asm)
+    used_decls = find_used_labels(asm)
+    new_asm = ''
+    label_decl = re.compile("^\.L([a-zA-Z0-9][a-zA-Z0-9_]*)(?=:)")
+    for l in asm.splitlines():
+        m = label_decl.match(l)
+        if not m or m.group(0) in used_decls:
+            new_asm += l
+            new_asm += '\n'
+    return new_asm
+
+
+def is_identifier(tk):
+    if len(tk) == 0:
+        return False
+    first = tk[0]
+    if not first.isalpha() and first != '_':
+        return False
+    for i in range(1, len(tk)):
+        c = tk[i]
+        if not c.isalnum() and c != '_':
+            return False
+    return True
+
+def process_identifiers(l):
+    """
+    process_identifiers - process all identifiers and modify them to have
+    consistent names across all platforms; specifically across ELF and MachO.
+    For example, MachO inserts an additional understore at the beginning of
+    names. This function removes that.
+    """
+    parts = re.split(r'([a-zA-Z0-9_]+)', l)
+    new_line = ''
+    for tk in parts:
+        if is_identifier(tk):
+            if tk.startswith('__Z'):
+                tk = tk[1:]
+            elif tk.startswith('_') and len(tk) > 1 and \
+                    tk[1].isalpha() and tk[1] != 'Z':
+                tk = tk[1:]
+        new_line += tk
+    return new_line
+
+
+def process_asm(asm):
+    """
+    Strip the ASM of unwanted directives and lines
+    """
+    new_contents = ''
+    asm = transform_labels(asm)
+
+    # TODO: Add more things we want to remove
+    discard_regexes = [
+        re.compile("\s+\..*$"), # directive
+        re.compile("\s*#(NO_APP|APP)$"), #inline ASM
+        re.compile("\s*#.*$"), # comment line
+        re.compile("\s*\.globa?l\s*([.a-zA-Z_][a-zA-Z0-9$_.]*)"), #global directive
+        re.compile("\s*\.(string|asciz|ascii|[1248]?byte|short|word|long|quad|value|zero)"),
+    ]
+    keep_regexes = [
+
+    ]
+    fn_label_def = re.compile("^[a-zA-Z_][a-zA-Z0-9_.]*:")
+    for l in asm.splitlines():
+        # Remove Mach-O attribute
+        l = l.replace('@GOTPCREL', '')
+        add_line = True
+        for reg in discard_regexes:
+            if reg.match(l) is not None:
+                add_line = False
+                break
+        for reg in keep_regexes:
+            if reg.match(l) is not None:
+                add_line = True
+                break
+        if add_line:
+            if fn_label_def.match(l) and len(new_contents) != 0:
+                new_contents += '\n'
+            l = process_identifiers(l)
+            new_contents += l
+            new_contents += '\n'
+    return new_contents
+
+def main():
+    parser = ArgumentParser(
+        description='generate a stripped assembly file')
+    parser.add_argument(
+        'input', metavar='input', type=str, nargs=1,
+        help='An input assembly file')
+    parser.add_argument(
+        'out', metavar='output', type=str, nargs=1,
+        help='The output file')
+    args, unknown_args = parser.parse_known_args()
+    input = args.input[0]
+    output = args.out[0]
+    if not os.path.isfile(input):
+        print(("ERROR: input file '%s' does not exist") % input)
+        sys.exit(1)
+    contents = None
+    with open(input, 'r') as f:
+        contents = f.read()
+    new_contents = process_asm(contents)
+    with open(output, 'w') as f:
+        f.write(new_contents)
+
+
+# Run the command-line entry point only when executed as a script.
+if __name__ == '__main__':
+    main()
+
+# vim: tabstop=4 expandtab shiftwidth=4 softtabstop=4
+# kate: tab-width: 4; replace-tabs on; indent-width 4; tab-indents: off;
+# kate: indent-mode python; remove-trailing-spaces modified;
author	Android Build Coastguard Worker <android-build-coastguard-worker@google.com>	2023-02-10 17:17:42 +0000
committer	Android Build Coastguard Worker <android-build-coastguard-worker@google.com>	2023-02-10 17:17:42 +0000
commit	910e0feba4baf9c8306748c8c792ded480cedaa0 (patch)
tree	61f88d9a095a390cab484c109bd55ce293b117b0
parent	db1632fadc79df388a2a7248157c965ee40248a1 (diff)
parent	db4553b1a39ef8ef84a097dfa2e795c0a4df60d8 (diff)
download	google-benchmark-emu-33-release.tar.gz