Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[llvm] Add the AnghaBench dataset. #210

Merged
merged 1 commit into from
Apr 27, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions compiler_gym/envs/llvm/datasets/BUILD
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ py_library(
name = "datasets",
srcs = [
"__init__.py",
"anghabench.py",
"clgen.py",
"csmith.py",
"llvm_stress.py",
Expand Down
3 changes: 3 additions & 0 deletions compiler_gym/envs/llvm/datasets/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
from typing import Iterable, Optional

from compiler_gym.datasets import Dataset, TarDatasetWithManifest
from compiler_gym.envs.llvm.datasets.anghabench import AnghaBenchDataset
from compiler_gym.envs.llvm.datasets.clgen import CLgenDataset
from compiler_gym.envs.llvm.datasets.csmith import CsmithBenchmark, CsmithDataset
from compiler_gym.envs.llvm.datasets.llvm_stress import LlvmStressDataset
Expand Down Expand Up @@ -202,6 +203,7 @@ def __init__(self, site_data_base: Path, sort_order: int = 0):
def get_llvm_datasets(site_data_base: Optional[Path] = None) -> Iterable[Dataset]:
site_data_base = site_data_base or site_data_path("llvm-v0")

yield AnghaBenchDataset(site_data_base=site_data_base, sort_order=0)
yield BlasDataset(site_data_base=site_data_base, sort_order=0)
yield CLgenDataset(site_data_base=site_data_base, sort_order=0)
yield CsmithDataset(site_data_base=site_data_base, sort_order=0)
Expand All @@ -217,6 +219,7 @@ def get_llvm_datasets(site_data_base: Optional[Path] = None) -> Iterable[Dataset


__all__ = [
"AnghaBenchDataset",
"BlasDataset",
"CLgenDataset",
"CsmithDataset",
Expand Down
121 changes: 121 additions & 0 deletions compiler_gym/envs/llvm/datasets/anghabench.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,121 @@
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
import subprocess
import sys
from concurrent.futures import as_completed
from pathlib import Path

from compiler_gym.datasets import Benchmark, TarDatasetWithManifest
from compiler_gym.datasets.benchmark import BenchmarkWithSource
from compiler_gym.envs.llvm.llvm_benchmark import ClangInvocation
from compiler_gym.util import thread_pool
from compiler_gym.util.filesystem import atomic_file_write


class AnghaBenchDataset(TarDatasetWithManifest):
    """A dataset of C programs curated from GitHub source code.

    The dataset is from:

        da Silva, Anderson Faustino, Bruno Conde Kind, José Wesley de Souza
        Magalhaes, Jerônimo Nunes Rocha, Breno Campos Ferreira Guimaraes, and
        Fernando Magno Quinão Pereira. "ANGHABENCH: A Suite with One Million
        Compilable C Benchmarks for Code-Size Reduction." In 2021 IEEE/ACM
        International Symposium on Code Generation and Optimization (CGO),
        pp. 378-390. IEEE, 2021.

    And is available at:

        http://cuda.dcc.ufmg.br/angha/home

    Installation
    ------------

    The AnghaBench dataset consists of C functions that are compiled to LLVM-IR
    on-demand and cached. The first time each benchmark is used there is an
    overhead of compiling it from C to bitcode. This is a one-off cost.
    """

    def __init__(self, site_data_base: Path, sort_order: int = 0):
        """Constructor.

        :param site_data_base: The base directory under which installed files
            (manifest, unpacked tarball, cached bitcodes) are stored.
        :param sort_order: Optional value by which this dataset is ordered
            relative to other datasets.
        :raises KeyError: On platforms other than "darwin" or "linux", since
            only those two manifests are published.
        """
        # The manifest (the list of benchmark URIs) is platform specific, so
        # pick the URL/checksum pair for the current platform.
        manifest_url, manifest_sha256 = {
            "darwin": (
                "https://dl.fbaipublicfiles.com/compiler_gym/llvm_bitcodes-10.0.0-anghabench-v0-macos-manifest.bz2",
                "39464256405aacefdb7550a7f990c9c578264c132804eec3daac091fa3c21bd1",
            ),
            "linux": (
                "https://dl.fbaipublicfiles.com/compiler_gym/llvm_bitcodes-10.0.0-anghabench-v0-linux-manifest.bz2",
                "a038d25d39ee9472662a9704dfff19c9e3512ff6a70f1067af85c5cb3784b477",
            ),
        }[sys.platform]
        super().__init__(
            name="benchmark://anghabench-v0",
            description="Compile-only C/C++ functions extracted from GitHub",
            references={
                "Paper": "https://homepages.dcc.ufmg.br/~fernando/publications/papers/FaustinoCGO21.pdf",
                "Homepage": "http://cuda.dcc.ufmg.br/angha/",
            },
            license="Unknown. See: https://github.com/brenocfg/AnghaBench/issues/1",
            site_data_base=site_data_base,
            manifest_urls=[manifest_url],
            manifest_sha256=manifest_sha256,
            tar_urls=[
                "https://github.com/brenocfg/AnghaBench/archive/d8034ac8562b8c978376008f4b33df01b8887b19.tar.gz"
            ],
            tar_sha256="85d068e4ce44f2581e3355ee7a8f3ccb92568e9f5bd338bc3a918566f3aff42f",
            strip_prefix="AnghaBench-d8034ac8562b8c978376008f4b33df01b8887b19",
            tar_compression="gz",
            benchmark_file_suffix=".bc",
            sort_order=sort_order,
        )

    def benchmark(self, uri: str) -> Benchmark:
        """Return the benchmark identified by *uri*, compiling it on demand.

        :param uri: A URI of the form "benchmark://anghabench-v0/<name>".
        :return: A benchmark pairing the (possibly freshly compiled) bitcode
            with its C source file.
        :raises LookupError: If *uri* names no benchmark, or if the
            corresponding C source file does not exist.
        """
        self.install()

        # Strip the "benchmark://anghabench-v0/" prefix, leaving the path of
        # the benchmark relative to the unpacked dataset root.
        benchmark_name = uri[len(self.name) + 1 :]
        if not benchmark_name:
            raise LookupError(f"No benchmark specified: {uri}")

        # The absolute path of the file, without an extension.
        path_stem = self.dataset_root / benchmark_name

        bitcode_abspath = Path(f"{path_stem}.bc")
        c_file_abspath = Path(f"{path_stem}.c")

        # If the file does not exist, compile it on-demand.
        if not bitcode_abspath.is_file():
            if not c_file_abspath.is_file():
                raise LookupError(
                    f"Benchmark not found: {uri} (file not found: {c_file_abspath})"
                )

            # Compile into a temporary path that is atomically moved into
            # place, so a partially-written bitcode is never observed at the
            # final location.
            with atomic_file_write(bitcode_abspath) as tmp_path:
                compile_cmd = ClangInvocation.from_c_file(
                    c_file_abspath,
                    copt=[
                        "-ferror-limit=1",  # Stop on first error.
                        "-w",  # No warnings.
                    ],
                ).command(outpath=tmp_path)
                subprocess.check_call(compile_cmd, timeout=300)

        return BenchmarkWithSource.create(
            uri, bitcode_abspath, "function.c", c_file_abspath
        )

    def compile_all(self):
        """Compile every benchmark in the dataset, in parallel.

        Prints a single-line progress counter to stdout as compilation
        proceeds.
        """
        n = self.size
        executor = thread_pool.get_thread_pool_executor()
        # Since the dataset is lazily compiled, simply iterating over the full
        # set of URIs will compile everything. Do this in parallel.
        futures = (
            executor.submit(self.benchmark, uri) for uri in self.benchmark_uris()
        )
        for i, future in enumerate(as_completed(futures), start=1):
            future.result()  # Re-raise any error from the worker thread.
            print(
                f"\r\033[KCompiled {i} of {n} programs ({i/n:.1%} complete)",
                flush=True,
                end="",
            )
14 changes: 14 additions & 0 deletions tests/llvm/datasets/BUILD
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,20 @@
# LICENSE file in the root directory of this source tree.
load("@rules_python//python:defs.bzl", "py_test")

# Test for the AnghaBench dataset. Sharded, with a "long" timeout, because
# the test exercises many benchmarks that are compiled on demand.
py_test(
    name = "anghabench_test",
    timeout = "long",
    srcs = ["anghabench_test.py"],
    shard_count = 8,
    deps = [
        "//compiler_gym/envs/llvm",
        "//compiler_gym/envs/llvm/datasets",
        "//tests:test_main",
        "//tests/pytest_plugins:common",
        "//tests/pytest_plugins:llvm",
    ],
)

py_test(
name = "clgen_test",
timeout = "moderate",
Expand Down
71 changes: 71 additions & 0 deletions tests/llvm/datasets/anghabench_test.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,71 @@
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
"""Tests for the AnghaBench dataset."""
import sys
from itertools import islice
from pathlib import Path

import gym
import pytest

import compiler_gym.envs.llvm # noqa register environments
from compiler_gym.envs.llvm import LlvmEnv
from compiler_gym.envs.llvm.datasets import AnghaBenchDataset
from tests.pytest_plugins.common import skip_on_ci
from tests.test_main import main

pytest_plugins = ["tests.pytest_plugins.common", "tests.pytest_plugins.llvm"]


@pytest.fixture(scope="module")
def anghabench_dataset() -> AnghaBenchDataset:
    """Module-scoped fixture yielding the anghabench-v0 dataset.

    A throwaway environment is constructed only to look up the dataset, and is
    closed before the dataset is handed to tests.
    """
    env = gym.make("llvm-v0")
    try:
        dataset = env.datasets["anghabench-v0"]
    finally:
        env.close()
    yield dataset


def test_anghabench_size(anghabench_dataset: AnghaBenchDataset):
if sys.platform == "darwin":
assert anghabench_dataset.size == 1042908
else:
assert anghabench_dataset.size == 1042976


def test_missing_benchmark_name(anghabench_dataset: AnghaBenchDataset, mocker):
    """benchmark() raises LookupError when the URI names no benchmark."""
    # Mock install() so that on CI it doesn't download and unpack the tarfile.
    mocker.patch.object(anghabench_dataset, "install")

    # Both the bare dataset URI and the trailing-slash form are rejected, and
    # each call still goes through install() first.
    bad_uris = ["benchmark://anghabench-v0", "benchmark://anghabench-v0/"]
    for expected_calls, bad_uri in enumerate(bad_uris, start=1):
        with pytest.raises(
            LookupError, match=f"^No benchmark specified: {bad_uri}$"
        ):
            anghabench_dataset.benchmark(bad_uri)
        assert anghabench_dataset.install.call_count == expected_calls


@skip_on_ci
@pytest.mark.parametrize("index", range(250))
def test_anghabench_random_select(
    env: LlvmEnv, anghabench_dataset: AnghaBenchDataset, index: int, tmpwd: Path
):
    """Select the index'th benchmark, load it, and export its C source."""
    uri_iter = anghabench_dataset.benchmark_uris()
    uri = next(islice(uri_iter, index, None))
    bm = anghabench_dataset.benchmark(uri)
    env.reset(benchmark=bm)

    assert bm.source
    bm.write_sources_to_directory(tmpwd)
    assert (tmpwd / "function.c").is_file()


# Allow running this test module directly; delegates to the project test
# runner (tests.test_main.main).
if __name__ == "__main__":
    main()