Skip to content

Commit

Permalink
[llvm] Add Csmith and llvm-stress benchmark generators.
Browse files Browse the repository at this point in the history
This adds two new datasets, csmith-v0 and llvm-stress-v0, that are
parametrized program generators. csmith-v0 uses Csmith to generate C99
programs that are then lowered to bitcode. llvm-stress-v0 generates
random LLVM-IR.

Both generators were developed to stress test compilers, so they have
an above-average chance that a generated benchmark will cause the
compiler to enter an unexpected state.

Issue #45.
  • Loading branch information
ChrisCummins committed Apr 23, 2021
1 parent 743f136 commit b514cba
Show file tree
Hide file tree
Showing 7 changed files with 480 additions and 0 deletions.
2 changes: 2 additions & 0 deletions compiler_gym/envs/llvm/datasets/BUILD
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,8 @@ py_library(
name = "datasets",
srcs = [
"__init__.py",
"csmith.py",
"llvm_stress.py",
"poj104.py",
],
visibility = ["//visibility:public"],
Expand Down
7 changes: 7 additions & 0 deletions compiler_gym/envs/llvm/datasets/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,8 @@
from typing import Iterable, Optional

from compiler_gym.datasets import Dataset, TarDatasetWithManifest
from compiler_gym.envs.llvm.datasets.csmith import CsmithBenchmark, CsmithDataset
from compiler_gym.envs.llvm.datasets.llvm_stress import LlvmStressDataset
from compiler_gym.envs.llvm.datasets.poj104 import POJ104Dataset, POJ104LegacyDataset
from compiler_gym.util.runfiles_path import site_data_path

Expand Down Expand Up @@ -200,8 +202,10 @@ def get_llvm_datasets(site_data_base: Optional[Path] = None) -> Iterable[Dataset
site_data_base = site_data_base or site_data_path("llvm-v0")

yield BlasDataset(site_data_base=site_data_base, sort_order=0)
yield CsmithDataset(site_data_base=site_data_base, sort_order=0)
yield GitHubDataset(site_data_base=site_data_base, sort_order=0)
yield LinuxDataset(site_data_base=site_data_base, sort_order=0)
yield LlvmStressDataset(site_data_base=site_data_base, sort_order=0)
yield MibenchDataset(site_data_base=site_data_base, sort_order=0)
yield NPBDataset(site_data_base=site_data_base, sort_order=0)
yield OpenCVDataset(site_data_base=site_data_base, sort_order=0)
Expand All @@ -212,9 +216,12 @@ def get_llvm_datasets(site_data_base: Optional[Path] = None) -> Iterable[Dataset

__all__ = [
"BlasDataset",
"CsmithDataset",
"CsmithBenchmark",
"get_llvm_datasets",
"GitHubDataset",
"LinuxDataset",
"LlvmStressDataset",
"MibenchDataset",
"NPBDataset",
"OpenCVDataset",
Expand Down
268 changes: 268 additions & 0 deletions compiler_gym/envs/llvm/datasets/csmith.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,268 @@
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
import io
import logging
import subprocess
import tarfile
import tempfile
from pathlib import Path
from threading import Lock
from typing import Iterable, List, Optional

from fasteners import InterProcessLock

from compiler_gym.datasets import Benchmark, BenchmarkSource, Dataset
from compiler_gym.datasets.benchmark import BenchmarkInitError, BenchmarkWithSource
from compiler_gym.datasets.dataset import DatasetInitError
from compiler_gym.envs.llvm.llvm_benchmark import ClangInvocation
from compiler_gym.util.decorators import memoized_property
from compiler_gym.util.download import download
from compiler_gym.util.runfiles_path import transient_cache_path
from compiler_gym.util.truncate import truncate

# Largest value (2^32 - 1) used as the bound for the --seed argument of csmith.
UINT_MAX = 0xFFFFFFFF


class CsmithBenchmark(BenchmarkWithSource):
    """A benchmark produced by Csmith that remembers the C source it came from."""

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # Set by create(); the C source as given to create().
        self._src = None

    @classmethod
    def create(cls, uri: str, bitcode: bytes, src: bytes) -> Benchmark:
        """Build a benchmark from in-memory bitcode and its originating C source.

        :param uri: The URI to assign to the benchmark.
        :param bitcode: The bitcode contents handed to ``from_file_contents()``.
        :param src: The C source that the bitcode was compiled from.
        :return: The newly constructed benchmark with its source attached.
        """
        instance = cls.from_file_contents(uri, bitcode)
        instance._src = src  # pylint: disable=protected-access
        return instance

    @memoized_property
    def sources(self) -> Iterable[BenchmarkSource]:
        """The benchmark sources: a single entry named ``source.c``."""
        only_source = BenchmarkSource(filename="source.c", contents=self._src)
        return [only_source]

    @property
    def source(self) -> str:
        """Return the single source file contents as a string."""
        raw = self._src
        return raw.decode("utf-8")


class CsmithDataset(Dataset):
    """A dataset which uses Csmith to generate programs.

    Csmith is a tool that can generate random conformant C99 programs. It is
    described in the publication:

        Yang, Xuejun, Yang Chen, Eric Eide, and John Regehr. "Finding and
        understanding bugs in C compilers." In Proceedings of the 32nd ACM
        SIGPLAN conference on Programming Language Design and Implementation
        (PLDI), pp. 283-294. 2011.

    For up-to-date information about Csmith, see:

        https://embed.cs.utah.edu/csmith/

    Note that Csmith is a tool that is used to find errors in compilers. As
    such, there is a higher likelihood that the benchmark cannot be used for an
    environment and that :meth:`env.reset()
    <compiler_gym.envs.CompilerEnv.reset>` will raise
    :class:`compiler_gym.datasets.BenchmarkInitError`.

    Installation
    ------------

    Using the CsmithDataset requires building the Csmith binary from source.
    This is done automatically on the first call to :code:`install()`. Building
    Csmith requires a working C++ toolchain. Install the required dependencies
    using: :code:`sudo apt install -y g++ m4` on Linux, or :code:`brew install
    m4` on macOS. :class:`DatasetInitError` is raised if compilation fails. See
    the `Csmith repo <https://github.com/csmith-project/csmith#install-csmith>`_
    for further details.
    """

    def __init__(self, site_data_base: Path, sort_order: int = 0):
        """Constructor.

        :param site_data_base: Forwarded to the :code:`Dataset` constructor.
        :param sort_order: Forwarded to the :code:`Dataset` constructor.
        """
        super().__init__(
            name="generator://csmith-v0",
            description="Random conformant C99 programs",
            references={
                "Paper": "http://web.cse.ohio-state.edu/~rountev.1/5343/pdf/pldi11.pdf",
                "Homepage": "https://embed.cs.utah.edu/csmith/",
            },
            license="BSD",
            site_data_base=site_data_base,
            sort_order=sort_order,
            benchmark_class=CsmithBenchmark,
        )
        self.csmith_path = self.site_data_path / "bin" / "csmith"
        csmith_include_dir = self.site_data_path / "include" / "csmith-2.3.0"

        # Cached result of the on-disk marker check; only ever flips to True.
        self._installed = False
        # Serializes builds between threads of this process ...
        self._build_lock = Lock()
        # ... and this file serializes builds between processes.
        self._build_lockfile = self.site_data_path / "build.LOCK"
        # Touched by _build_csmith() once the build has succeeded.
        self._build_markerfile = self.site_data_path / ".built"

        # The command that is used to compile an LLVM-IR bitcode file from a
        # Csmith input. Reads from stdin, writes to stdout.
        self.clang_compile_command: List[str] = ClangInvocation.from_c_file(
            "-",  # Read from stdin.
            copt=[
                "-xc",
                "-ferror-limit=1",  # Stop on first error.
                "-w",  # No warnings.
                f"-I{csmith_include_dir}",  # Include the Csmith headers.
            ],
        ).command(
            outpath="-"
        )  # Write to stdout.

    @property
    def installed(self) -> bool:
        """Whether the build marker file exists (cached once it is seen)."""
        # Fast path for repeated checks to 'installed' without a disk op.
        if not self._installed:
            self._installed = self._build_markerfile.is_file()
        return self._installed

    def install(self) -> None:
        """Download and build the Csmith binary.

        Does nothing if the build marker is already present. Otherwise the
        build runs while holding both the in-process lock and the
        inter-process lockfile.

        :raises DatasetInitError: If configuring or building Csmith fails.
        """
        if self.installed:
            return

        # The inter-process lock must be given the lockfile *path*; handing it
        # the threading.Lock object would lock an unrelated, garbage-named file.
        with self._build_lock, InterProcessLock(str(self._build_lockfile)):
            # Repeat the check to see if we have already installed the dataset
            # now that we have acquired the lock.
            if not self.installed:
                self.logger.info("Downloading and building Csmith")
                self._build_csmith(self.site_data_path, self.logger)

    @staticmethod
    def _run_build_step(command: List[str], cwd: Path, step: str) -> None:
        """Run one step of the Csmith build, raising on a non-zero exit status.

        :param command: The command line to execute.
        :param cwd: The directory to run the command in.
        :param step: Human-readable name of the step, used in the error text.
        :raises DatasetInitError: If the command exits with a non-zero status.
        :raises subprocess.TimeoutExpired: If the command runs for more than
            600 seconds. The child process is killed before this is raised.
        """
        result = subprocess.run(
            command,
            cwd=cwd,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            universal_newlines=True,
            timeout=600,
            check=False,
        )
        if result.returncode:
            raise DatasetInitError(
                "\n".join(
                    [
                        f"Failed to build Csmith from source, `{step}` failed.",
                        "You may be missing installation dependencies. Install them using:",
                        "  linux: `sudo apt install g++ m4`",
                        "  macOS: `brew install m4`",
                        "See https://github.com/csmith-project/csmith#install-csmith for more details",
                        f"--- Start `{step}` logs: ---\n",
                        result.stdout,
                        result.stderr,
                    ]
                )
            )

    @staticmethod
    def _build_csmith(install_root: Path, logger: logging.Logger):
        """Download, build, and install Csmith to the given directory.

        :param install_root: The ``--prefix`` that Csmith is installed to. A
            ``.built`` marker file is created here on success.
        :param logger: Logger used for progress messages.
        :raises DatasetInitError: If ``./configure`` or ``make install`` fails.
        """
        tar_data = io.BytesIO(
            download(
                urls=[
                    "https://github.com/csmith-project/csmith/archive/refs/tags/csmith-2.3.0.tar.gz",
                ],
                sha256="ba871c1e5a05a71ecd1af514fedba30561b16ee80b8dd5ba8f884eaded47009f",
            )
        )
        # Csmith uses a standard `configure` + `make install` build process.
        with tempfile.TemporaryDirectory(
            dir=transient_cache_path("."), prefix="csmith-"
        ) as d:
            with tarfile.open(fileobj=tar_data, mode="r:gz") as arc:
                arc.extractall(d)

            # The path of the extracted sources.
            src_dir = Path(d) / "csmith-csmith-2.3.0"

            logger.debug("Configuring Csmith at %s", d)
            CsmithDataset._run_build_step(
                ["./configure", f"--prefix={install_root}"], src_dir, "./configure"
            )

            logger.debug("Installing Csmith to %s", install_root)
            CsmithDataset._run_build_step(
                ["make", "-j", "install"], src_dir, "make install"
            )

        # Only reached when both steps succeeded; marks the install complete.
        (install_root / ".built").touch()

    @property
    def size(self) -> int:
        """The dataset size, reported as infinity."""
        # Actually 2^32 - 1, but practically infinite for all intents and
        # purposes.
        return float("inf")

    def benchmark_uris(self) -> Iterable[str]:
        """Lazily enumerate one URI per seed, of the form ``<name>/<seed>``."""
        return (f"{self.name}/{i}" for i in range(UINT_MAX))

    def benchmark(self, uri: Optional[str] = None) -> CsmithBenchmark:
        """Generate a C program with Csmith and compile it to a benchmark.

        :param uri: A URI whose final ``/``-separated component is the integer
            seed. If omitted, or no longer than the dataset name plus a
            trailing slash, a random seed is drawn from ``self.random``.
        :return: A benchmark holding the compiled output and the C source.
        :raises OSError: If Csmith exits with a non-zero status.
        :raises BenchmarkInitError: If clang fails to compile the program.
        :raises subprocess.TimeoutExpired: If either tool runs for more than
            300 seconds. The child process is killed before this is raised.
        """
        self.install()

        if uri is None or len(uri) <= len(self.name) + 1:
            seed = self.random.integers(UINT_MAX)
        else:
            seed = int(uri.split("/")[-1])

        # Generate the C source. stderr is captured (not discarded) so that it
        # can be reported if Csmith fails.
        self.logger.debug("Exec csmith --seed %d", seed)
        csmith = subprocess.run(
            [str(self.csmith_path), "--seed", str(seed)],
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            timeout=300,
            check=False,
        )
        if csmith.returncode:
            error = truncate(
                csmith.stderr.decode("utf-8", errors="replace"),
                max_lines=20,
                max_line_len=100,
            )
            raise OSError(f"Csmith failed with seed {seed}\nError: {error}")
        src = csmith.stdout

        # Compile to IR by feeding the source to clang over stdin.
        clang = subprocess.run(
            self.clang_compile_command,
            input=src,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            timeout=300,
            check=False,
        )
        if clang.returncode:
            compile_cmd = " ".join(self.clang_compile_command)
            error = truncate(
                clang.stderr.decode("utf-8", errors="replace"),
                max_lines=20,
                max_line_len=100,
            )
            raise BenchmarkInitError(
                f"Compilation job failed!\n"
                f"Csmith seed: {seed}\n"
                f"Command: {compile_cmd}\n"
                f"Error: {error}"
            )

        return self.benchmark_class.create(f"{self.name}/{seed}", clang.stdout, src)
80 changes: 80 additions & 0 deletions compiler_gym/envs/llvm/datasets/llvm_stress.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
import subprocess
from pathlib import Path
from typing import Iterable, Optional

from compiler_gym.datasets import Benchmark, Dataset
from compiler_gym.datasets.benchmark import BenchmarkInitError
from compiler_gym.third_party import llvm

# Largest value (2^32 - 1) used as the bound for the --seed argument of llvm-stress.
UINT_MAX = 0xFFFFFFFF


class LlvmStressDataset(Dataset):
    """A dataset which uses llvm-stress to generate programs.

    `llvm-stress <https://llvm.org/docs/CommandGuide/llvm-stress.html>`_ is a
    tool for generating random LLVM-IR files.

    This dataset forces reproducible results by setting the input seed to the
    generator. The benchmark's URI is the seed, e.g.
    "generator://llvm-stress-v0/10" is the benchmark generated by llvm-stress
    using seed 10. The total number of unique seeds is 2^32 - 1.

    Note that llvm-stress is a tool that is used to find errors in LLVM. As
    such, there is a higher likelihood that the benchmark cannot be used for an
    environment and that :meth:`env.reset()
    <compiler_gym.envs.CompilerEnv.reset>` will raise
    :class:`compiler_gym.datasets.BenchmarkInitError`.
    """

    def __init__(self, site_data_base: Path, sort_order: int = 0):
        """Constructor.

        :param site_data_base: Forwarded to the :code:`Dataset` constructor.
        :param sort_order: Forwarded to the :code:`Dataset` constructor.
        """
        super().__init__(
            name="generator://llvm-stress-v0",
            description="Randomly generated LLVM-IR",
            references={
                "Documentation": "https://llvm.org/docs/CommandGuide/llvm-stress.html"
            },
            license="Apache License v2.0 with LLVM Exceptions",
            site_data_base=site_data_base,
            sort_order=sort_order,
        )

    @property
    def size(self) -> int:
        """The dataset size, reported as infinity."""
        # Actually 2^32 - 1, but practically infinite for all intents and
        # purposes.
        return float("inf")

    def benchmark_uris(self) -> Iterable[str]:
        """Lazily enumerate one URI per seed, of the form ``<name>/<seed>``."""
        return (f"{self.name}/{i}" for i in range(UINT_MAX))

    def benchmark(self, uri: Optional[str] = None) -> Benchmark:
        """Generate random LLVM-IR with llvm-stress and assemble it to bitcode.

        :param uri: A URI whose final ``/``-separated component is the integer
            seed. If omitted, or no longer than the dataset name plus a
            trailing slash, a random seed is drawn from ``self.random``.
        :return: A benchmark built from the output of llvm-as.
        :raises BenchmarkInitError: If llvm-stress or llvm-as exits with a
            non-zero status.
        :raises subprocess.TimeoutExpired: If either tool runs for more than
            60 seconds. The child process is killed before this is raised.
        """
        if uri is None or len(uri) <= len(self.name) + 1:
            seed = self.random.integers(UINT_MAX)
        else:
            seed = int(uri.split("/")[-1])

        # Run llvm-stress to completion first so that its exit status is
        # actually known. With an un-waited Popen, `returncode` stays None and
        # a generator failure would go unnoticed.
        llvm_stress = subprocess.run(
            [str(llvm.llvm_stress_path()), f"--seed={seed}"],
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            timeout=60,
            check=False,
        )
        if llvm_stress.returncode:
            raise BenchmarkInitError("Failed to generate benchmark")

        # Feed the generated IR to llvm-as over stdin to assemble a bitcode.
        llvm_as = subprocess.run(
            [str(llvm.llvm_as_path()), "-"],
            input=llvm_stress.stdout,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            timeout=60,
            check=False,
        )
        if llvm_as.returncode:
            raise BenchmarkInitError("Failed to generate benchmark")

        return Benchmark.from_file_contents(f"{self.name}/{seed}", llvm_as.stdout)
Loading

0 comments on commit b514cba

Please sign in to comment.