[PoC] [WiP Draft] Performance gate prototype #1825

Draft · wants to merge 16 commits into base: master
33 changes: 28 additions & 5 deletions .github/workflows/tests.yaml
@@ -39,6 +39,10 @@ jobs:
           - "twine_check"
           - "daphne"
           - "no_optional_packages"
+          - "perf_asgi"
+          - "perf_hello"
+          - "perf_media"
+          - "perf_query"
           # TODO(kgriffs): Re-enable once hug has a chance to address
           #   breaking changes in Falcon 3.0
           # - "hug"
@@ -82,33 +86,52 @@ jobs:

       - name: Set up Python
         uses: actions/setup-python@v2.1.4
-        if: ${{ matrix.toxenv != 'py35' }}
+        if: ${{ matrix.toxenv != 'py35' && matrix.toxenv != 'perf_asgi' && matrix.toxenv != 'perf_hello' && matrix.toxenv != 'perf_media' && matrix.toxenv != 'perf_query' }}
         with:
           python-version: ${{ matrix.python-version }}

+      - name: Set up Python 3.5.2
+        if: ${{ matrix.toxenv == 'py35' }}
+        run: |
+          sudo apt-get update
+          sudo apt-get install -y build-essential curl python3.5 python3.5-dev
+          python3.5 --version
+
       - name: Set up Python 3.8
         uses: actions/setup-python@v2.1.4
         if: ${{ matrix.toxenv == 'py35' }}
         with:
           python-version: 3.8

-      - name: Set up Python 3.5.2
-        if: ${{ matrix.toxenv == 'py35' }}
+      - name: Set up Python 3.8 (Ubuntu 20.04 build)
+        if: ${{ matrix.toxenv == 'perf_asgi' || matrix.toxenv == 'perf_hello' || matrix.toxenv == 'perf_media' || matrix.toxenv == 'perf_query' }}
         run: |
           sudo apt-get update
-          sudo apt-get install -y build-essential curl python3.5 python3.5-dev
-          python3.5 --version
+          sudo apt-get install -y build-essential curl python3.8 python3.8-dev python3.8-distutils python-is-python3
+          curl --silent https://bootstrap.pypa.io/get-pip.py -o /tmp/get-pip.py
+          sudo python3.8 /tmp/get-pip.py
+          sudo pip install coverage tox

       - name: Install smoke test dependencies
         if: ${{ matrix.toxenv == 'py38_smoke' || matrix.toxenv == 'py38_smoke_cython' }}
         run: |
           sudo apt-get update
           sudo apt-get install -y libunwind-dev

+      - name: Install valgrind
+        if: ${{ matrix.toxenv == 'perf_asgi' || matrix.toxenv == 'perf_hello' || matrix.toxenv == 'perf_media' || matrix.toxenv == 'perf_query' }}
+        run: |
+          sudo apt-get update
+          sudo apt-get install -y valgrind
+
       - name: Install dependencies
+        if: ${{ matrix.toxenv != 'perf_asgi' && matrix.toxenv != 'perf_hello' && matrix.toxenv != 'perf_media' && matrix.toxenv != 'perf_query' }}
         run: |
           python -m pip install --upgrade pip
           pip install coverage tox

       - name: Print versions
         run: |
           python --version
           pip --version
           tox --version
3 changes: 3 additions & 0 deletions .gitignore
@@ -52,6 +52,9 @@ dash

 # System
 .DS_Store

+# Valgrind artefacts
+perf/cachegrind.out.*
+
 # VIM swap files
 .*.swp
49 changes: 49 additions & 0 deletions perf/BASELINE.yaml
@@ -0,0 +1,49 @@
cpython_38:
  asgi:
    expected:
      cost: 106660
      variation: 0.0001
    points:
      - 10000
      - 15000
      - 20000
      - 25000
    tolerance:
      - -0.002
      - +0.001
  hello:
    expected:
      cost: 75950
      variation: 0.0001
    points:
      - 10000
      - 15000
      - 20000
      - 25000
    tolerance:
      - -0.002
      - +0.001
  media:
    expected:
      cost: 198740
      variation: 0.0001
    points:
      - 5000
      - 7500
      - 10000
      - 12500
    tolerance:
      - -0.002
      - +0.001
  query:
    expected:
      cost: 182580
      variation: 0.0001
    points:
      - 5000
      - 7500
      - 10000
      - 12500
    tolerance:
      - -0.002
      - +0.001
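Each baseline entry feeds the checks in perf/conftest.py: the fitted cost per iteration must land within the expected cost scaled by the tolerance band. A minimal sketch of that arithmetic, using values from the cpython_38/asgi entry above (the check_cost helper is illustrative, not part of the PR):

```python
# Sketch: how a BASELINE.yaml entry maps to pass/fail bounds.
# Numbers are taken from the cpython_38/asgi entry; the helper
# name is illustrative only.
expected_cost = 106660
tolerance = (-0.002, +0.001)

lower = expected_cost * (1 + min(tolerance))  # below this: revise the baseline
upper = expected_cost * (1 + max(tolerance))  # above this: regression

def check_cost(measured):
    assert lower < measured < upper, 'measured cost outside tolerance band'

check_cost(106_700)  # within the band, so this passes
print(round(lower, 2), round(upper, 2))  # 106446.68 106766.66
```

Note how the band is asymmetric: a suspicious speed-up (below the lower bound) fails the gate too, prompting a baseline update rather than silently drifting.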
136 changes: 136 additions & 0 deletions perf/cachegrind.py
@@ -0,0 +1,136 @@
"""
As per the original author's recommendation, this script was simply copied from
https://github.com/pythonspeed/cachegrind-benchmarking @ 32d26691.

See also this awesome article by Itamar Turner-Trauring:
https://pythonspeed.com/articles/consistent-benchmarking-in-ci/.

The original file content follows below.

-------------------------------------------------------------------------------

Proof-of-concept: run a program under Cachegrind, combining all the various
metrics into one single performance metric.

Requires Python 3.

License: https://opensource.org/licenses/MIT

## Features

* Disables ASLR.
* Sets consistent cache sizes.
* Calculates a combined performance metric.

For more information see the detailed write up at:

https://pythonspeed.com/articles/consistent-benchmarking-in-ci/

## Usage

This script has no compatibility guarantees; I recommend copying it into your
repository. To use:

$ python3 cachegrind.py ./yourprogram --yourparam=yourvalues

If you're benchmarking Python, make sure to set PYTHONHASHSEED to a fixed value
(e.g. `export PYTHONHASHSEED=1234`). Other languages may have similar
requirements to reduce variability.

The last line printed will be a combined performance metric, but you can tweak
the script to extract more info, or use it as a library.

Copyright © 2020, Hyphenated Enterprises LLC.
"""

from typing import List, Dict
from subprocess import check_call, check_output
import sys
from tempfile import NamedTemporaryFile

ARCH = check_output(["uname", "-m"]).strip()


def run_with_cachegrind(args_list: List[str]) -> Dict[str, int]:
    """
    Run the given program and arguments under Cachegrind, and parse the
    Cachegrind output.

    For now we just ignore program output, and in general this is not robust.
    """
    temp_file = NamedTemporaryFile("r+")
    check_call([
        # Disable ASLR:
        "setarch",
        ARCH,
        "-R",
        "valgrind",
        "--tool=cachegrind",
        # Set some reasonable L1 and LL values, based on Haswell. You can set
        # your own; the important part is that they are consistent across runs,
        # instead of the default of copying from the current machine.
        "--I1=32768,8,64",
        "--D1=32768,8,64",
        "--LL=8388608,16,64",
        "--cachegrind-out-file=" + temp_file.name,
    ] + args_list)
    return parse_cachegrind_output(temp_file)


def parse_cachegrind_output(temp_file):
    # Parse the output file:
    lines = iter(temp_file)
    for line in lines:
        if line.startswith("events: "):
            header = line[len("events: "):].strip()
            break
    for line in lines:
        last_line = line
    assert last_line.startswith("summary: ")
    last_line = last_line[len("summary:"):].strip()
    return dict(zip(header.split(), [int(i) for i in last_line.split()]))


def get_counts(cg_results: Dict[str, int]) -> Dict[str, int]:
    """
    Given the result of run_with_cachegrind(), figure out the parameters we
    will use for the final estimate.

    We pretend there's no L2 since Cachegrind doesn't currently support it.

    Caveats: we're not including time to process instructions, only time to
    access instruction cache(s), so we're assuming time to fetch and run an
    instruction is the same as time to retrieve data if they're both to L1
    cache.
    """
    result = {}
    d = cg_results

    ram_hits = d["DLmr"] + d["DLmw"] + d["ILmr"]

    l3_hits = d["I1mr"] + d["D1mw"] + d["D1mr"] - ram_hits

    total_memory_rw = d["Ir"] + d["Dr"] + d["Dw"]
    l1_hits = total_memory_rw - l3_hits - ram_hits
    assert total_memory_rw == l1_hits + l3_hits + ram_hits

    result["l1"] = l1_hits
    result["l3"] = l3_hits
    result["ram"] = ram_hits

    return result


def combined_instruction_estimate(counts: Dict[str, int]) -> int:
    """
    Given the result of get_counts(), return an estimate of the total time to
    run.

    Multipliers were determined empirically, but some research suggests they're
    a reasonable approximation for cache time ratios. L3 is probably too low,
    but then we're not simulating L2...
    """
    return counts["l1"] + (5 * counts["l3"]) + (35 * counts["ram"])


if __name__ == "__main__":
    print(combined_instruction_estimate(get_counts(run_with_cachegrind(sys.argv[1:]))))
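The combined estimate weights each level of the memory hierarchy: an L1 hit counts as 1, an LL ("l3") hit as 5, and a RAM access as 35. A quick worked example with made-up counts:

```python
# Made-up counts for illustration; real values come from get_counts().
counts = {'l1': 1_000_000, 'l3': 20_000, 'ram': 1_000}

estimate = counts['l1'] + (5 * counts['l3']) + (35 * counts['ram'])
print(estimate)  # 1_000_000 + 100_000 + 35_000 = 1_135_000
```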
123 changes: 123 additions & 0 deletions perf/conftest.py
@@ -0,0 +1,123 @@
# Copyright 2020 by Vytautas Liuolia.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import math
import pathlib
import platform
import subprocess
import sys

import numpy
import pytest
import yaml

HERE = pathlib.Path(__file__).resolve().parent


def _platform():
    # TODO(vytas): Add support for Cython, PyPy etc.
    label = platform.python_implementation().lower()
    version = ''.join(platform.python_version_tuple()[:2])
    return f'{label}_{version}'


class Gauge:
    GAUGE_ENV = {
        'LC_ALL': 'en_US.UTF-8',
        'LANG': 'en_US.UTF-8',
        'PYTHONHASHSEED': '0',
        'PYTHONIOENCODING': 'utf-8',
    }

    def __init__(self, metric):
        with open(HERE / 'BASELINE.yaml', encoding='utf-8') as baseline:
            config = yaml.safe_load(baseline)

        platform_label = _platform()
        platform_spec = config.get(platform_label)
        assert platform_spec, (
            f'no performance baseline established for {platform_label} yet')

        self._metric = metric
        self._spec = platform_spec[metric]

    def _fit_data(self, iterations, times):
        # NOTE(vytas): Least-squares fitting solution straight from
        #   https://numpy.org/doc/stable/reference/generated/numpy.linalg.lstsq.html
        x = numpy.array(iterations, dtype=float)
        y = numpy.array(times, dtype=float)
        A = numpy.vstack([x, numpy.ones(len(x))]).T
        (cost, _), residuals, _, _ = numpy.linalg.lstsq(A, y, rcond=None)

        N = len(times)
        rmsd = math.sqrt(residuals / (N - 2))
        cv_rmsd = rmsd / numpy.mean(y)
        return (cost, cv_rmsd)

    def _measure_data_point(self, number):
        command = (
            sys.executable,
            'cachegrind.py',
            sys.executable,
            '-m',
            f'metrics.{self._metric}',
            str(number),
        )
        print('\n\nrunning cachegrind:', ' '.join(command), '\n')
        output = subprocess.check_output(command, cwd=HERE, env=self.GAUGE_ENV)
        output = output.decode().strip()
        print(f'\n{output}')

        return int(output.strip())

    def measure(self):
        iterations = self._spec['points']

        times = []
        for number in iterations:
            times.append(self._measure_data_point(number))

        cost, cv_rmsd = self._fit_data(iterations, times)
        print('\nestimated cost per iteration:', cost)
        print('estimated CV of RMSD:', cv_rmsd)

        expected_cost = self._spec['expected']['cost']
        expected_variation = self._spec['expected']['variation']
        tolerance = self._spec['tolerance']

        assert cost > expected_cost / 10, (
            'estimated cost per iteration is very low; is the metric broken?')
        assert cv_rmsd < expected_variation, (
            'cachegrind results vary too much between iterations')

        assert cost > expected_cost * (1 + min(tolerance)), (
            'too good! please revise the baseline if you optimized the code')
        assert cost < expected_cost * (1 + max(tolerance)), (
            'performance regression measured!')


def pytest_configure(config):
    config.addinivalue_line('markers', 'asgi: "asgi" performance metric')
    config.addinivalue_line('markers', 'hello: "hello" performance metric')
    config.addinivalue_line('markers', 'media: "media" performance metric')
    config.addinivalue_line('markers', 'query: "query" performance metric')


@pytest.fixture()
def gauge():
    def _method(metric):
        Gauge(metric).measure()

    return _method
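_fit_data() performs an ordinary least-squares line fit, so the slope is the cost per iteration while the intercept absorbs fixed interpreter startup overhead. A dependency-free sketch of the same computation on synthetic data (a hypothetical 100-cycle cost per iteration plus a 5000-cycle fixed overhead):

```python
# Pure-Python least-squares slope/intercept, mirroring what
# _fit_data() does with numpy.linalg.lstsq. Data is synthetic.
xs = [10000, 15000, 20000, 25000]  # iteration counts
ys = [100 * x + 5000 for x in xs]  # perfectly linear "timings"

n = len(xs)
mean_x = sum(xs) / n
mean_y = sum(ys) / n
cost = (
    sum((x - mean_x) * (y - mean_y) for x, y in zip(xs, ys))
    / sum((x - mean_x) ** 2 for x in xs)
)
intercept = mean_y - cost * mean_x
print(cost, intercept)  # 100.0 5000.0
```

With real cachegrind data the points will not sit exactly on a line; that is what the CV-of-RMSD assertion above guards against.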
Empty file added perf/metrics/__init__.py