More compile stats #5130

Merged · 19 commits · Jun 4, 2024
135 changes: 110 additions & 25 deletions ydb/ci/build_bloat/main.py
@@ -2,6 +2,7 @@

import argparse
import json
from functools import partial
import os
import shutil
from concurrent.futures import ProcessPoolExecutor
@@ -10,7 +11,9 @@


def sanitize_path(path: str, base_src_dir: str) -> str:
home_dir = os.environ["HOME"]
ya_build_path_chunk = ".ya/build/build_root"
ya_tools_path_chunk = ".ya/tools"
if ya_build_path_chunk in path:
# strip the path prefix before .ya
path = path[path.find(ya_build_path_chunk) :]
@@ -19,10 +22,19 @@ def sanitize_path(path: str, base_src_dir: str) -> str:
splitted = path.split(os.sep)
del splitted[3:5]
path = os.sep.join(splitted)
elif ya_tools_path_chunk in path:
# strip the path prefix before .ya
path = path[path.find(ya_tools_path_chunk) :]

# remove the temporary node dir name
splitted = path.split(os.sep)
del splitted[3]
path = os.sep.join(splitted)
else:
if not base_src_dir.endswith("/"):
base_src_dir += "/"
path = path.removeprefix(base_src_dir)
path = path.removeprefix(home_dir)

return "src/" + path

@@ -110,7 +122,6 @@ def build_include_tree(path: str, build_output_dir: str, base_src_dir: str) -> l
result = []

for time_stamp, ev, path, duration in include_events:

if current_includes_stack:
last_path = current_includes_stack[-1]
prev = path_to_time.get(last_path, 0)
@@ -190,68 +201,109 @@ def generate_cpp_bloat(build_output_dir: str, result_dir: str, base_src_dir: str
return tree


def main(build_output_dir, html_output):
tree = generate_time_bloat(build_output_dir=build_output_dir)
def parse_includes(trace_path: str, base_src_dir: str) -> tuple[list[tuple[int, str]], dict]:
print("Processing includes in {}".format(trace_path))

shutil.copytree("html", html_output, dirs_exist_ok=True)

with open(os.path.join(html_output, "bloat.json"), "w") as f:
f.write("var kTree = ")
json.dump(tree, f, indent=4)


def parse_includes(path: str) -> list[tuple[int, str]]:
print("Processing includes in {}".format(path))

with open(path) as f:
with open(trace_path) as f:
obj = json.load(f)

cpp_file = None
include_events = [] # (time, +-1, path)

for event in obj["traceEvents"]:
if event["name"] == "Source":
path = event["args"]["detail"]
path = sanitize_path(path, base_src_dir)
time_stamp = event["ts"]
duration = event["dur"]
include_events.append((time_stamp, +1, path))
include_events.append((time_stamp + duration, -1, path))

include_events.sort(key=lambda event: (event[0], -event[1]))
if event["name"] == "OptModule":
cpp_file = event["args"]["detail"]

path_to_time = {}
current_includes_stack = []
last_time_stamp = None
last_time_stamp = 0
time_breakdown = {} # header/cpp -> (header -> (cnt, total time))

if cpp_file is None:
print("Can't determine cpp file for {}".format(trace_path))
return path_to_time, time_breakdown

include_events.sort(key=lambda event: (event[0], -event[1]))
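# NB: the -event[1] tiebreaker sorts "open" (+1) events before "close" (-1) events at equal timestamps, so zero-duration includes still open before they close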

cpp_file = sanitize_path(cpp_file, base_src_dir)
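# seed the stack with the translation unit itself, so every top-level header has a parent to charge its time to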
current_includes_stack = [(cpp_file, 0)]
for time_stamp, ev, path in include_events:
if current_includes_stack:
last_path = current_includes_stack[-1]
last_path, _ = current_includes_stack[-1]
prev = path_to_time.get(last_path, 0)
path_to_time[last_path] = prev + (time_stamp - last_time_stamp) / 1000 / 1000

# add compile breakdown for itself
if last_path not in time_breakdown:
time_breakdown[last_path] = {}

if last_path not in time_breakdown[last_path]:
time_breakdown[last_path][last_path] = [0, 0]

time_breakdown[last_path][last_path][0] = 1  # NB: set to 1, not incremented: a file counts itself only once
time_breakdown[last_path][last_path][1] += (time_stamp - last_time_stamp) / 1000 / 1000

if ev == 1:
current_includes_stack.append(path)
current_includes_stack.append((path, time_stamp))
else:
assert current_includes_stack[-1] == path
current_path, include_ts = current_includes_stack[-1]
assert current_path == path
current_includes_stack.pop()
parent_path = current_includes_stack[-1][0]
if parent_path not in time_breakdown:
time_breakdown[parent_path] = {}

if current_path not in time_breakdown[parent_path]:
time_breakdown[parent_path][current_path] = [0, 0]

time_breakdown[parent_path][current_path][0] += 1
time_breakdown[parent_path][current_path][1] += (time_stamp - include_ts) / 1000 / 1000

last_time_stamp = time_stamp

return path_to_time
return path_to_time, time_breakdown
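
For reference, a hand-written sketch of the clang -ftime-trace input this function parses. The values are invented, but the event names ("Source" per include, "OptModule" for the translation unit), the microsecond ts/dur fields, and the args.detail layout follow clang's trace format:

example_trace = {
    "traceEvents": [
        # one "Source" event per #include; nested includes have nested [ts, ts + dur) spans
        {"name": "Source", "ts": 1200, "dur": 3500,
         "args": {"detail": "/home/user/src/util/generic/string.h"}},
        # names the .cpp file being compiled
        {"name": "OptModule", "ts": 90000, "dur": 250000,
         "args": {"detail": "ydb/core/tablet/tablet.cpp"}},
    ]
}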


def generate_header_bloat(build_output_dir: str, result_dir: str, base_src_dir: str) -> dict:
time_trace_paths = gather_time_traces(build_output_dir)

path_to_stat = {} # header path -> (total_duration, count)
total_time_breakdown = {} # header/cpp path -> (header -> (inclusion count, time spent))
with ProcessPoolExecutor() as executor:
res = executor.map(parse_includes, time_trace_paths)
for fn_res in res:
for path, duration in fn_res.items():
path = sanitize_path(path, base_src_dir)
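# bind base_src_dir up front, since executor.map supplies only the trace path on each call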
fn = partial(parse_includes, base_src_dir=base_src_dir)
res = executor.map(fn, time_trace_paths)
for path_to_time, time_breakdown in res:
for path, duration in path_to_time.items():
if path not in path_to_stat:
path_to_stat[path] = [0, 0]
path_to_stat[path][0] += duration
path_to_stat[path][1] += 1

for path in time_breakdown:
if path not in total_time_breakdown:
total_time_breakdown[path] = {}

for subpath in time_breakdown[path]:
if subpath not in total_time_breakdown[path]:
total_time_breakdown[path][subpath] = [0, 0]

total_time_breakdown[path][subpath][0] += time_breakdown[path][subpath][0]
total_time_breakdown[path][subpath][1] += time_breakdown[path][subpath][1]

for path in total_time_breakdown:
print("*** {}".format(path))
for subpath in total_time_breakdown[path]:
count, total_time_s = total_time_breakdown[path][subpath]
print(" {} -> total {:.2f}s (included {} times)".format(subpath, total_time_s, count))
print("")

result = []

for path, (duration, cnt) in path_to_stat.items():
@@ -260,13 +312,46 @@ def generate_header_bloat(build_output_dir: str, result_dir: str, base_src_dir:

tree = {}

headers_compile_duration = []

for duration, cnt, path in result:
path_chunks = path.split(os.sep)
path_chunks[-1] = path_chunks[-1] + " (total {} times)".format(cnt)
path_chunks_count = len(path_chunks)
chunks = list(zip(path_chunks, (path_chunks_count - 1) * ["dir"] + ["h"]))
add_to_tree(chunks, int(duration * 1000), tree)
print("{} -> {:.2f}s (aggregated {} times)".format(path, duration, cnt))
headers_compile_duration.append(
{
"path": path,
"inclusion_count": cnt,
"mean_compilation_time_s": duration / cnt,
}
)

time_breakdown = {}

for path in total_time_breakdown:
one_file_breakdown = []
for subpath in total_time_breakdown[path]:
inclusion_count, total_s = total_time_breakdown[path][subpath]
one_file_breakdown.append(
{
"path": subpath,
"inclusion_count": inclusion_count,
"total_time_s": total_s,
}
)
one_file_breakdown.sort(key=lambda val: -val["total_time_s"])
time_breakdown[path] = one_file_breakdown

human_readable_output = {
"headers_compile_duration": headers_compile_duration,
"time_breakdown": time_breakdown,
}

with open(os.path.join(result_dir, "output.json"), "w") as f:
json.dump(human_readable_output, f, indent=4)

propogate_area(tree)
enrich_names_with_sec(tree)
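
The shape of the resulting output.json follows directly from the code above; a small invented example:

{
    "headers_compile_duration": [
        {"path": "src/util/generic/string.h",
         "inclusion_count": 42,
         "mean_compilation_time_s": 0.031}
    ],
    "time_breakdown": {
        "src/ydb/core/tablet/tablet.cpp": [
            {"path": "src/util/generic/string.h",
             "inclusion_count": 1,
             "total_time_s": 0.03}
        ]
    }
}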
60 changes: 58 additions & 2 deletions ydb/ci/build_bloat/ydb_upload.py
@@ -29,15 +29,23 @@
"id",
"git_commit_message",
"path",
"sub_path",
]

DATETIME_COLUMNS = [
"git_commit_time",
]

UINT64_COLUMNS = []
UINT64_COLUMNS = [
"inclusion_count",
]

DOUBLE_COLUMNS = ["total_compilation_time_s", "compilation_time_s"]
DOUBLE_COLUMNS = [
"total_compilation_time_s",
"compilation_time_s",
"mean_compilation_time_s",
"total_time_s",
]

ALL_COLUMNS = UTF8_COLUMNS + DATETIME_COLUMNS + UINT64_COLUMNS

@@ -125,8 +133,12 @@ def main():
with open(os.path.join(args.html_dir_cpp, "output.json")) as f:
cpp_stats = json.load(f)

with open(os.path.join(args.html_dir_headers, "output.json")) as f:
header_stats = json.load(f)

rows = []

# upload into cpp_compile_time
for entry in cpp_stats["cpp_compilation_times"]:
path = entry["path"]
time_s = entry["time_s"]
@@ -142,6 +154,7 @@
DATABASE_PATH + "/code-agility/cpp_compile_time", rows, generate_column_types(row)
)

# upload into total_compile_time
row = copy.copy(common_parameters)
row["id"] = str(uuid.uuid4())
row["total_compilation_time_s"] = cpp_stats["total_compilation_time"]
@@ -150,6 +163,49 @@
DATABASE_PATH + "/code-agility/total_compile_time", [row], generate_column_types(row)
)
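
generate_column_types is defined earlier in this file and is untouched by this diff. Purely as a reading aid, a hedged sketch of the mapping it needs to produce, assuming the ydb Python SDK's BulkUpsertColumns builder; the function below is a guess, not code from this PR:

import ydb

def generate_column_types_sketch(row: dict) -> "ydb.BulkUpsertColumns":
    # map each column present in the row to its YDB type via the *_COLUMNS lists above
    columns = ydb.BulkUpsertColumns()
    for name in row:
        if name in UTF8_COLUMNS:
            columns.add_column(name, ydb.OptionalType(ydb.PrimitiveType.Utf8))
        elif name in DATETIME_COLUMNS:
            columns.add_column(name, ydb.OptionalType(ydb.PrimitiveType.Datetime))
        elif name in UINT64_COLUMNS:
            columns.add_column(name, ydb.OptionalType(ydb.PrimitiveType.Uint64))
        elif name in DOUBLE_COLUMNS:
            columns.add_column(name, ydb.OptionalType(ydb.PrimitiveType.Double))
    return columns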

# upload into headers_impact
rows = []
for entry in header_stats["headers_compile_duration"]:
path = entry["path"]
inclusion_count = entry["inclusion_count"]
mean_compilation_time_s = entry["mean_compilation_time_s"]
row = copy.copy(common_parameters)
row["id"] = str(uuid.uuid4())
row["path"] = sanitize_str(path)
row["mean_compilation_time_s"] = mean_compilation_time_s
row["inclusion_count"] = inclusion_count
rows.append(copy.copy(row))

if rows:
row = rows[0]
driver.table_client.bulk_upsert(
DATABASE_PATH + "/code-agility/headers_impact", rows, generate_column_types(row)
)

# upload into compile_breakdown
rows = []
for path in header_stats["time_breakdown"]:
entry = header_stats["time_breakdown"][path]
for sub_entry in entry:
sub_path = sub_entry["path"]
inclusion_count = sub_entry["inclusion_count"]
total_time_s = sub_entry["total_time_s"]

row = copy.copy(common_parameters)
row["id"] = str(uuid.uuid4())
row["path"] = path
row["sub_path"] = sub_path
row["inclusion_count"] = inclusion_count
row["total_time_s"] = total_time_s

rows.append(copy.copy(row))

if rows:
row = rows[0]
driver.table_client.bulk_upsert(
DATABASE_PATH + "/code-agility/compile_breakdown", rows, generate_column_types(row)
)


if __name__ == "__main__":
exit(main())