Skip to content

Commit

Permalink
Merge pull request #106 from pythonspeed/105-cachegrind-benchmarks
Browse files Browse the repository at this point in the history
Add cachegrind-based benchmarks.
  • Loading branch information
itamarst authored Dec 29, 2020
2 parents 3f9e508 + 0b6249b commit be580f7
Show file tree
Hide file tree
Showing 47 changed files with 523 additions and 73 deletions.
1 change: 1 addition & 0 deletions .changelog/105.minor
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Added benchmarking system based on Cachegrind.
45 changes: 45 additions & 0 deletions .github/workflows/main.yml
Original file line number Diff line number Diff line change
Expand Up @@ -93,3 +93,48 @@ jobs:
. venv/bin/activate
twine check dist/*.whl
twine upload --repository pypi dist/*.whl
# Runs the Cachegrind-based benchmarks and, for pull requests, posts the diff
# of the checked-in results as a PR comment.
benchmarks:
  name: "Benchmarks"
  runs-on: "ubuntu-latest"

  defaults:
    run:
      # NOTE(review): a login shell is presumably needed so that the Conda
      # environment created by setup-miniconda is activated — confirm against
      # the setup-miniconda documentation.
      shell: bash -l {0}

  env:
    PYTHONFAULTHANDLER: "true"

  steps:
    - uses: "actions/checkout@v2"
      with:
        # We need tags to get the correct code version:
        fetch-depth: 0

    - uses: conda-incubator/setup-miniconda@v2
      with:
        auto-update-conda: false
        activate-environment: benchmark
        environment-file: benchmarks/conda-linux-64.lock

    - name: Benchmark
      env:
        # Passed through the environment rather than interpolated into the
        # script. COMMENTS_URL is empty for events that are not pull requests.
        COMMENTS_URL: ${{ github.event.pull_request.comments_url }}
        GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
      run: |
        set -euo pipefail
        ln -s "$CONDA_PREFIX" venv
        make
        make _fil-python
        make benchmark
        # Build a Markdown comment containing the results diff...
        echo "## Benchmark results" > benchmark.diff
        echo '```diff' >> benchmark.diff
        git diff --word-diff benchmarks/results/ >> benchmark.diff
        echo '```' >> benchmark.diff
        # ...and wrap it as the JSON payload {"body": "<markdown>"}.
        printf '{ "body": ' > benchmark.json
        jq -R -s . < benchmark.diff >> benchmark.json
        printf '}' >> benchmark.json
        cat benchmark.json
        # Without a comments URL curl has nothing to POST to and would fail
        # the job, so only post when the event is a pull request.
        if [ -n "${COMMENTS_URL:-}" ]; then
          curl \
            -X POST \
            "${COMMENTS_URL}" \
            -H "Content-Type: application/json" \
            -H "Authorization: Bearer ${GITHUB_TOKEN}" \
            --data @benchmark.json
        else
          echo "Not a pull request event; skipping the benchmark comment."
        fi
6 changes: 4 additions & 2 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,9 @@ build/
dist/
fil-result/
.eggs/
python-benchmarks/pymalloc.c
benchmarks/pymalloc.c
tests/test-scripts/pymalloc.c
filprofiler/_version.py
.ipynb_checkpoints
pip-wheel-metadata
pip-wheel-metadata
filprofiler/_fil-python
36 changes: 31 additions & 5 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,11 @@ build: target/release/libpymemprofile_api.a
python setup.py build_ext --inplace
python setup.py install_data

# Link the filprofiler C sources and the Rust static library into an
# executable at $(CONDA_PREFIX)/bin/_fil-python.
# Only necessary for benchmarks, only works with Python 3.8 for now.
# Phony because the file the recipe writes is not the target name, so Make
# cannot use timestamps to decide whether it is up to date.
# The first recipe line refuses to run when CONDA_PREFIX is empty; otherwise
# the output path would collapse to /bin/_fil-python.
.PHONY: _fil-python
_fil-python: filprofiler/*.c target/release/libpymemprofile_api.a
	@test -n "$(CONDA_PREFIX)" || { echo "CONDA_PREFIX is not set; activate the benchmark Conda environment first." >&2; exit 1; }
	gcc -std=c11 $(shell python3.8-config --cflags) -DFIL_SKIP_ALIGNED_ALLOC=1 -export-dynamic -flto -o ${CONDA_PREFIX}/bin/_fil-python $^ -lpython3.8 $(shell python3.8-config --ldflags)

# Build the release static library with Cargo. It is rebuilt when Cargo.lock,
# the memapi crate manifest, or any .rs file directly in memapi/src/ changes.
target/release/libpymemprofile_api.a: Cargo.lock memapi/Cargo.toml memapi/src/*.rs
	cargo build --release

Expand All @@ -33,10 +38,10 @@ test-python: build

# Compile the native helper modules used by the tests (Cython, C++, C and
# Fortran via f2py), then run py.test on tests/ with Rust backtraces enabled.
# The target has no prerequisites, so nothing else is built beforehand.
# NOTE(review): the python-benchmarks/ commands and the tests/test-scripts/
# commands are the same four steps against two directories; this looks like
# the before/after sides of a directory rename shown together — confirm which
# set belongs in the Makefile.
.PHONY: test-python-no-deps
test-python-no-deps:
	cythonize -3 -i python-benchmarks/pymalloc.pyx
	c++ -shared -fPIC -lpthread python-benchmarks/cpp.cpp -o python-benchmarks/cpp.so
	cc -shared -fPIC -lpthread python-benchmarks/malloc_on_thread_exit.c -o python-benchmarks/malloc_on_thread_exit.so
	cd python-benchmarks && python -m numpy.f2py -c fortran.f90 -m fortran
	cythonize -3 -i tests/test-scripts/pymalloc.pyx
	c++ -shared -fPIC -lpthread tests/test-scripts/cpp.cpp -o tests/test-scripts/cpp.so
	cc -shared -fPIC -lpthread tests/test-scripts/malloc_on_thread_exit.c -o tests/test-scripts/malloc_on_thread_exit.so
	cd tests/test-scripts && python -m numpy.f2py -c fortran.f90 -m fortran
	env RUST_BACKTRACE=1 py.test tests/

.PHONY: docker-image
Expand All @@ -53,7 +58,7 @@ manylinux-wheel:

.PHONY: clean
clean:
rm -f filprofiler/fil-python
rm -f filprofiler/_fil-python
rm -rf target
rm -rf filprofiler/*.so
rm -rf filprofiler/*.dylib
Expand All @@ -68,3 +73,24 @@ licenses:
# Regenerate the kernelspec data whenever the generator script changes:
# wipe the target's directory, then run the script (which presumably
# recreates data_kernelspec/kernel.json).
data_kernelspec/kernel.json: generate-kernelspec.py
	rm -rf $(@D)
	python $<

# Run the benchmarks, record the code version they were run against, and show
# how the checked-in results changed.
# The results glob is expanded by the shell, so only benchmarks whose
# benchmarks/results/<name>.json file already exists are run.
# Sub-makes are invoked through $(MAKE) so that command-line flags and the
# jobserver are passed down to them.
.PHONY: benchmark
benchmark: _fil-python
	# Possibly some cache warming is still necessary :(
	$(MAKE) benchmarks/results/*.json
	$(MAKE) benchmarks/results/*.json
	python setup.py --version > benchmarks/results/version.txt
	git diff --word-diff benchmarks/results/

# Run benchmarks/pystone.py under fil-profile. FIL_BENCHMARK is set to the
# target path (presumably where the measurements are written).
# Phony so the benchmark re-runs even though the results file already exists.
.PHONY: benchmarks/results/pystone.json
benchmarks/results/pystone.json:
	_RJEM_MALLOC_CONF=dirty_decay_ms:-1,muzzy_decay_ms:-1,abort_conf:true \
	FIL_NO_REPORT=1 \
	FIL_BENCHMARK=$@ \
	fil-profile run benchmarks/pystone.py

# Run benchmarks/lots-of-peaks.py under fil-profile. FIL_BENCHMARK is set to
# the target path (presumably where the measurements are written).
# Phony so the benchmark re-runs even though the results file already exists.
.PHONY: benchmarks/results/lots-of-peaks.json
benchmarks/results/lots-of-peaks.json:
	_RJEM_MALLOC_CONF=dirty_decay_ms:-1,muzzy_decay_ms:-1,abort_conf:true \
	FIL_NO_REPORT=1 \
	FIL_BENCHMARK=$@ \
	fil-profile run benchmarks/lots-of-peaks.py

# Compile the Cython allocation helper in place, then run
# benchmarks/multithreading.py with the argument 1 under fil-profile.
# FIL_BENCHMARK is set to the target path (presumably where the measurements
# are written).
# Phony so the benchmark re-runs even though the results file already exists.
.PHONY: benchmarks/results/multithreading-1.json
benchmarks/results/multithreading-1.json:
	cythonize -3 -i benchmarks/pymalloc.pyx
	_RJEM_MALLOC_CONF=dirty_decay_ms:-1,muzzy_decay_ms:-1,abort_conf:true \
	FIL_NO_REPORT=1 \
	FIL_BENCHMARK=$@ \
	fil-profile run benchmarks/multithreading.py 1
38 changes: 38 additions & 0 deletions benchmarks/conda-linux-64.lock
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
# platform: linux-64
# env_hash: d316fdb443d4fe1bffbc0234ee25973066694b344ca70b0d9e51b05bccf57b8b
@EXPLICIT
https://conda.anaconda.org/conda-forge/linux-64/_libgcc_mutex-0.1-conda_forge.tar.bz2#d7c89558ba9fa0495403155b64376d81
https://conda.anaconda.org/conda-forge/linux-64/_openmp_mutex-4.5-1_gnu.tar.bz2#561e277319a41d4f24f5c05a9ef63c04
https://conda.anaconda.org/conda-forge/linux-64/binutils-2.35.1-hdd6e379_1.tar.bz2#1a91ad01b06912768bc5d66631fc9b07
https://conda.anaconda.org/conda-forge/linux-64/binutils_impl_linux-64-2.35.1-h17ad2fc_1.tar.bz2#9902133effcb084679365a0789d483f3
https://conda.anaconda.org/conda-forge/linux-64/binutils_linux-64-2.35-hc3fd857_29.tar.bz2#17d622904723a89e59ac9251aa432078
https://conda.anaconda.org/conda-forge/linux-64/c-compiler-1.1.3-h7f98852_0.tar.bz2#1aaf091c2aec0e80c3b6ad93e2b75025
https://conda.anaconda.org/conda-forge/linux-64/ca-certificates-2020.12.5-ha878542_0.tar.bz2#7eb5d4ffeee663caa1635cd67071bc1b
https://conda.anaconda.org/conda-forge/linux-64/certifi-2020.12.5-py38h578d9bd_0.tar.bz2#a3daf84221215a6caa8a4efc9b04c773
https://conda.anaconda.org/conda-forge/linux-64/cython-0.29.15-py38h950e882_1.tar.bz2#1f3c0a4536fba9094b85918f6bfde736
https://conda.anaconda.org/conda-forge/linux-64/gcc_impl_linux-64-9.3.0-h28f5a38_17.tar.bz2#40235a65140e7f9a10716f555f7ce409
https://conda.anaconda.org/conda-forge/linux-64/gcc_linux-64-9.3.0-h7247604_29.tar.bz2#081695d5c895c0491ce4d8e8d4ade215
https://conda.anaconda.org/conda-forge/noarch/kernel-headers_linux-64-2.6.32-h77966d4_13.tar.bz2#182b3bbe97ca334be3ccb50b80810bb1
https://conda.anaconda.org/conda-forge/linux-64/ld_impl_linux-64-2.35.1-hed1e6ac_1.tar.bz2#a267624dbd39c49e945d29dca1538635
https://conda.anaconda.org/conda-forge/linux-64/libffi-3.3-h58526e2_2.tar.bz2#665369991d8dd290ac5ee92fce3e6bf5
https://conda.anaconda.org/conda-forge/linux-64/libgcc-devel_linux-64-9.3.0-hfd08b2a_17.tar.bz2#0134fffc3c28ccfef3d42ecd8671613a
https://conda.anaconda.org/conda-forge/linux-64/libgcc-ng-9.3.0-h5dbcf3e_17.tar.bz2#fc9f5adabc4d55cd4b491332adc413e0
https://conda.anaconda.org/conda-forge/linux-64/libgomp-9.3.0-h5dbcf3e_17.tar.bz2#8fd587013b9da8b52050268d50c12305
https://conda.anaconda.org/conda-forge/linux-64/libstdcxx-ng-9.3.0-h2ae2ef3_17.tar.bz2#342f3c931d0a3a209ab09a522469d20c
https://conda.anaconda.org/conda-forge/linux-64/ncurses-6.2-h58526e2_4.tar.bz2#509f2a21c4a09214cd737a480dfd80c9
https://conda.anaconda.org/conda-forge/linux-64/openssl-1.1.1i-h7f98852_0.tar.bz2#bc63956d32024c8e0853ff2ca7f2bf05
https://conda.anaconda.org/conda-forge/noarch/pip-20.3.3-pyhd8ed1ab_0.tar.bz2#c7e5c449e502567a995e4bd09c538faf
https://conda.anaconda.org/conda-forge/linux-64/python-3.8.6-hffdb5ce_4_cpython.tar.bz2#e43784498e6c5983eb80ab00e7ba1c7c
https://conda.anaconda.org/conda-forge/linux-64/python_abi-3.8-1_cp38.tar.bz2#8d05152d6fb3012b27a0e6fbcc14bea1
https://conda.anaconda.org/conda-forge/linux-64/readline-8.0-he28a2e2_2.tar.bz2#4d0ae8d473f863696088f76800ef9d38
https://conda.anaconda.org/conda-forge/linux-64/rust-1.48.0-h36c2ea0_0.tar.bz2#89ab86b925452d3285cc4005fe7c882d
https://conda.anaconda.org/conda-forge/noarch/rust-std-x86_64-unknown-linux-gnu-1.48.0-hc1431ca_0.tar.bz2#39758df04142487a78bf5d7ec0c1a1da
https://conda.anaconda.org/conda-forge/linux-64/setuptools-49.6.0-py38h924ce5b_2.tar.bz2#85f49978317e2f963ffd1d0989b16038
https://conda.anaconda.org/conda-forge/linux-64/sqlite-3.34.0-h74cdb3f_0.tar.bz2#0a83e21e8c1929cc9a1e21ebb2459fc5
https://conda.anaconda.org/conda-forge/noarch/sysroot_linux-64-2.12-h77966d4_13.tar.bz2#e411486a18c4f61c59083c5792c1ce3b
https://conda.anaconda.org/conda-forge/noarch/threadpoolctl-2.1.0-pyh5ca1d4c_0.tar.bz2#9eb32b63458380aa3804de519b6e5749
https://conda.anaconda.org/conda-forge/linux-64/tk-8.6.10-hed695b0_1.tar.bz2#7ef837cd455bd0f19f49b8b62d4cb568
https://conda.anaconda.org/conda-forge/linux-64/valgrind-3.15.0-he513fc3_0.tar.bz2#917c1af93e48c45c708f48d8b9d4d886
https://conda.anaconda.org/conda-forge/noarch/wheel-0.36.2-pyhd3deb0d_0.tar.bz2#768bfbe026426d0e76b377997d1f2b98
https://conda.anaconda.org/conda-forge/linux-64/xz-5.2.5-h516909a_1.tar.bz2#33f601066901f3e1a85af3522a8113f9
https://conda.anaconda.org/conda-forge/linux-64/zlib-1.2.11-h516909a_1010.tar.bz2#339cc5584e6d26bc73a875ba900028c3
10 changes: 10 additions & 0 deletions benchmarks/environment.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
name: unpinned
channels:
- conda-forge
dependencies:
- python=3.8
- cython=0.29.15
- threadpoolctl=2.1.0
- c-compiler=1.1.3
- valgrind=3.15.0
- rust=1.48.0
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import time
import sys

import numpy as np

Expand All @@ -10,7 +11,12 @@

start = time.time()
image = data.camera()
image = rescale(image, 4, anti_aliasing=True)

if len(sys.argv) > 1:
scale = int(sys.argv[1])
else:
scale = 4
image = rescale(image, scale, anti_aliasing=True)

shift = (-22.4, 13.32)
# The shift corresponds to the pixel offset relative to the reference image
Expand Down
79 changes: 79 additions & 0 deletions benchmarks/lots-of-peaks.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,79 @@
"""
Generate lots of callstacks with lots of new peaks.
Don't use loops in order to maximize callstacks.
"""

# Module-level list that h() appends to. It keeps every appended list alive,
# and its length is the stop condition for f()'s recursion.
L = []


def f():
    """Call g() twelve times, then recurse until L has at least 100,000 items.

    The calls are written out one per line rather than in a loop, in line
    with the module's goal of maximizing the number of distinct callstacks.
    Each call to f() adds 12 * 12 * 10 = 1,440 items to L before the check.
    """
    g()
    g()
    g()
    g()
    g()
    g()
    g()
    g()
    g()
    g()
    g()
    g()
    # Recursion instead of a loop, so we get more callstack IDs.
    if len(L) < 100_000:
        f()


def g():
    """Call h() twelve times, each from its own source line.

    The calls are deliberately not wrapped in a loop (see the module
    docstring).
    """
    h()
    h()
    h()
    h()
    h()
    h()
    h()
    h()
    h()
    h()
    h()
    h()


def h():
    """Append ten new empty lists to L, one statement group at a time.

    After each append, a throwaway list is created and immediately deleted.
    """
    # Increase allocated memory, and also deallocate some memory to trigger
    # check for new peaks()
    L.append(list())
    x = list()
    del x
    L.append(list())
    x = list()
    del x
    L.append(list())
    x = list()
    del x
    L.append(list())
    x = list()
    del x
    L.append(list())
    x = list()
    del x
    L.append(list())
    x = list()
    del x
    L.append(list())
    x = list()
    del x
    L.append(list())
    x = list()
    del x
    L.append(list())
    x = list()
    del x
    L.append(list())
    x = list()
    del x


# Start the recursive allocation workload only when run as a script.
if __name__ == "__main__":
    f()
24 changes: 24 additions & 0 deletions benchmarks/multithreading.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
import time
import sys

from threading import Thread
from pymalloc import lots_of_allocs


def main():
    """Run lots_of_allocs() on the requested number of threads and time it.

    The thread count is read from the first command-line argument. A count
    of exactly 1 runs the workload directly in the main thread; any other
    count creates that many threads, starts them all, then joins them all.
    The elapsed wall-clock time is printed at the end.
    """
    began = time.time()
    thread_count = int(sys.argv[1])
    if thread_count != 1:
        # Create every worker before starting any of them.
        workers = []
        for _ in range(thread_count):
            workers.append(Thread(target=lots_of_allocs))
        for worker in workers:
            worker.start()
        for worker in workers:
            worker.join()
    else:
        # A single "thread" just runs in the main thread.
        lots_of_allocs()
    print("Elapsed:", time.time() - began)


# Run the benchmark only when executed as a script; expects the thread count
# as the first command-line argument.
if __name__ == "__main__":
    main()
12 changes: 12 additions & 0 deletions benchmarks/pymalloc.pyx
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
from libc.stdlib cimport malloc, free
from libc.stdint cimport uint64_t

def lots_of_allocs():
    """Allocate, write to, and free a 16-byte buffer ten million times.

    The loop runs inside a ``nogil`` block. The value returned is read from
    the last buffer after it has been freed and is not meaningful; it exists
    only so the loop has an observable result.
    """
    cdef uint64_t i
    with nogil:
        for i in range(10000000):
            p = <uint64_t*>malloc(16)
            # NOTE(review): malloc's result is not checked for NULL before
            # this write.
            p[0] = 1
            free(p)
    # Garbage, but without this the compiler optimizes the whole loop out.
    # NOTE(review): this reads p after free(p); deliberate per the comment
    # above, but it is undefined behavior in C — confirm this is acceptable
    # for the benchmark.
    return p[0]
File renamed without changes.
File renamed without changes.
5 changes: 5 additions & 0 deletions benchmarks/results/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
# Benchmark results

The files in this directory are the results of running `make benchmark`.
They are the raw data from running Cachegrind, and an overall performance metric.
For more details on the calculation [see this article](https://pythonspeed.com/articles/consistent-benchmarking-in-ci/).
12 changes: 12 additions & 0 deletions benchmarks/results/lots-of-peaks.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
{
"D1mr": 5978523,
"D1mw": 3953746,
"DLmr": 2137435,
"DLmw": 2799987,
"Dr": 274430190,
"Dw": 110583840,
"I1mr": 5066143,
"ILmr": 11684,
"Ir": 1053389932,
"Overall": 1646870790
}
12 changes: 12 additions & 0 deletions benchmarks/results/multithreading-1.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
{
"D1mr": 3109150,
"D1mw": 438384,
"DLmr": 90549,
"DLmw": 106394,
"Dr": 1800370393,
"Dw": 653386563,
"I1mr": 2308884,
"ILmr": 4370,
"Ir": 6384366308,
"Overall": 8867588326
}
12 changes: 12 additions & 0 deletions benchmarks/results/pystone.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
{
"D1mr": 11404735,
"D1mw": 3128624,
"DLmr": 92190,
"DLmw": 100439,
"Dr": 503830192,
"Dw": 225400619,
"I1mr": 10615779,
"ILmr": 4477,
"Ir": 1420914043,
"Overall": 2256654586
}
1 change: 1 addition & 0 deletions benchmarks/results/version.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
0.13.2.dev66+gcb0c19b.d20201229
Loading

0 comments on commit be580f7

Please sign in to comment.