diff --git a/ydb/ci/build_bloat/main.py b/ydb/ci/build_bloat/main.py
index 2e0f831c0d83..317bb7c12ba3 100755
--- a/ydb/ci/build_bloat/main.py
+++ b/ydb/ci/build_bloat/main.py
@@ -2,6 +2,7 @@
 
 import argparse
 import json
+from functools import partial
 import os
 import shutil
 from concurrent.futures import ProcessPoolExecutor
@@ -10,7 +11,9 @@
 
 
 def sanitize_path(path: str, base_src_dir: str) -> str:
+    home_dir = os.environ["HOME"]
     ya_build_path_chunk = ".ya/build/build_root"
+    ya_tools_path_chunk = ".ya/tools"
     if ya_build_path_chunk in path:
         # remove path to before .ya
         path = path[path.find(ya_build_path_chunk) :]
@@ -19,10 +22,19 @@ def sanitize_path(path: str, base_src_dir: str) -> str:
         splitted = path.split(os.sep)
         del splitted[3:5]
         path = os.sep.join(splitted)
+    elif ya_tools_path_chunk in path:
+        # remove path to before .ya
+        path = path[path.find(ya_tools_path_chunk) :]
+
+        # remove temporary nodes dir names
+        splitted = path.split(os.sep)
+        del splitted[3]
+        path = os.sep.join(splitted)
     else:
         if not base_src_dir.endswith("/"):
             base_src_dir += "/"
         path = path.removeprefix(base_src_dir)
+        path = path.removeprefix(home_dir)
 
     return "src/" + path
 
@@ -110,7 +122,6 @@ def build_include_tree(path: str, build_output_dir: str, base_src_dir: str) -> l
     result = []
 
     for time_stamp, ev, path, duration in include_events:
-
         if current_includes_stack:
             last_path = current_includes_stack[-1]
             prev = path_to_time.get(last_path, 0)
@@ -190,68 +201,109 @@ def generate_cpp_bloat(build_output_dir: str, result_dir: str, base_src_dir: str
     return tree
 
 
-def main(build_output_dir, html_output):
-    tree = generate_time_bloat(build_output_dir=build_output_dir)
+def parse_includes(trace_path: str, base_src_dir: str) -> tuple[dict, dict]:
+    print("Processing includes in {}".format(trace_path))
 
-    shutil.copytree("html", html_output, dirs_exist_ok=True)
-
-    with open(os.path.join(html_output, "bloat.json"), "w") as f:
-        f.write("var kTree = ")
-        json.dump(tree, f, indent=4)
-
-
-def parse_includes(path: str) -> list[tuple[int, str]]:
-    print("Processing includes in {}".format(path))
-
-    with open(path) as f:
+    with open(trace_path) as f:
         obj = json.load(f)
 
+    cpp_file = None
     include_events = []  # (time, +-1, path)
 
    for event in obj["traceEvents"]:
        if event["name"] == "Source":
            path = event["args"]["detail"]
+            path = sanitize_path(path, base_src_dir)
            time_stamp = event["ts"]
            duration = event["dur"]
            include_events.append((time_stamp, +1, path))
            include_events.append((time_stamp + duration, -1, path))
 
-    include_events.sort(key=lambda event: (event[0], -event[1]))
+        if event["name"] == "OptModule":
+            cpp_file = event["args"]["detail"]
 
     path_to_time = {}
-    current_includes_stack = []
-    last_time_stamp = None
+    last_time_stamp = 0
+    time_breakdown = {}  # header/cpp -> (header -> (cnt, total time))
 
+    if cpp_file is None:
+        print("Can't determine cpp file for {}".format(trace_path))
+        return path_to_time, time_breakdown
+
+    include_events.sort(key=lambda event: (event[0], -event[1]))
+
+    cpp_file = sanitize_path(cpp_file, base_src_dir)
+    current_includes_stack = [(cpp_file, 0)]
     for time_stamp, ev, path in include_events:
         if current_includes_stack:
-            last_path = current_includes_stack[-1]
+            last_path, _ = current_includes_stack[-1]
             prev = path_to_time.get(last_path, 0)
             path_to_time[last_path] = prev + (time_stamp - last_time_stamp) / 1000 / 1000
 
+            # add compile breakdown for itself
+            if last_path not in time_breakdown:
+                time_breakdown[last_path] = {}
+
+            if last_path not in time_breakdown[last_path]:
+                time_breakdown[last_path][last_path] = [0, 0]
+
+            time_breakdown[last_path][last_path][0] = 1  # NB: just 1, the self entry is not an inclusion count
+            time_breakdown[last_path][last_path][1] += (time_stamp - last_time_stamp) / 1000 / 1000
+
         if ev == 1:
-            current_includes_stack.append(path)
+            current_includes_stack.append((path, time_stamp))
         else:
-            assert current_includes_stack[-1] == path
+            current_path, include_ts = current_includes_stack[-1]
+            assert current_path == path
             current_includes_stack.pop()
+            parent_path = current_includes_stack[-1][0]
+            if parent_path not in time_breakdown:
+                time_breakdown[parent_path] = {}
+
+            if current_path not in time_breakdown[parent_path]:
+                time_breakdown[parent_path][current_path] = [0, 0]
+
+            time_breakdown[parent_path][current_path][0] += 1
+            time_breakdown[parent_path][current_path][1] += (time_stamp - include_ts) / 1000 / 1000
+
         last_time_stamp = time_stamp
 
-    return path_to_time
+    return path_to_time, time_breakdown
 
 
 def generate_header_bloat(build_output_dir: str, result_dir: str, base_src_dir: str) -> dict:
     time_trace_paths = gather_time_traces(build_output_dir)
 
     path_to_stat = {}  # header path -> (total_duration, count)
+    total_time_breakdown = {}  # header/cpp path -> (header -> (inclusion count, time spent))
     with ProcessPoolExecutor() as executor:
-        res = executor.map(parse_includes, time_trace_paths)
-        for fn_res in res:
-            for path, duration in fn_res.items():
-                path = sanitize_path(path, base_src_dir)
+        fn = partial(parse_includes, base_src_dir=base_src_dir)
+        res = executor.map(fn, time_trace_paths)
+        for path_to_time, time_breakdown in res:
+            for path, duration in path_to_time.items():
                 if path not in path_to_stat:
                     path_to_stat[path] = [0, 0]
                 path_to_stat[path][0] += duration
                 path_to_stat[path][1] += 1
 
+            for path in time_breakdown:
+                if path not in total_time_breakdown:
+                    total_time_breakdown[path] = {}
+
+                for subpath in time_breakdown[path]:
+                    if subpath not in total_time_breakdown[path]:
+                        total_time_breakdown[path][subpath] = [0, 0]
+
+                    total_time_breakdown[path][subpath][0] += time_breakdown[path][subpath][0]
+                    total_time_breakdown[path][subpath][1] += time_breakdown[path][subpath][1]
+
+    for path in total_time_breakdown:
+        print("*** {}".format(path))
+        for subpath in total_time_breakdown[path]:
+            count, total_time_s = total_time_breakdown[path][subpath]
+            print(" {} -> total {:.2f}s (included {} times)".format(subpath, total_time_s, count))
+        print("")
+
     result = []
 
     for path, (duration, cnt) in path_to_stat.items():
@@ -260,6 +312,8 @@ def generate_header_bloat(build_output_dir: str, result_dir: str, base_src_dir:
 
     tree = {}
 
+    headers_compile_duration = []
+
     for duration, cnt, path in result:
         path_chunks = path.split(os.sep)
         path_chunks[-1] = path_chunks[-1] + " (total {} times)".format(cnt)
@@ -267,6 +321,37 @@ def generate_header_bloat(build_output_dir: str, result_dir: str, base_src_dir:
         chunks = list(zip(path_chunks, (path_chunks_count - 1) * ["dir"] + ["h"]))
         add_to_tree(chunks, int(duration * 1000), tree)
         print("{} -> {:.2f}s (aggregated {} times)".format(path, duration, cnt))
+        headers_compile_duration.append(
+            {
+                "path": path,
+                "inclusion_count": cnt,
+                "mean_compilation_time_s": duration / cnt,
+            }
+        )
+
+    time_breakdown = {}
+
+    for path in total_time_breakdown:
+        one_file_breakdown = []
+        for subpath in total_time_breakdown[path]:
+            inclusion_count, total_s = total_time_breakdown[path][subpath]
+            one_file_breakdown.append(
+                {
+                    "path": subpath,
+                    "inclusion_count": inclusion_count,
+                    "total_time_s": total_s,
+                }
+            )
+        one_file_breakdown.sort(key=lambda val: -val["total_time_s"])
-val["total_time_s"]) + time_breakdown[path] = one_file_breakdown + + human_readable_output = { + "headers_compile_duration": headers_compile_duration, + "time_breakdown": time_breakdown, + } + + with open(os.path.join(result_dir, "output.json"), "w") as f: + json.dump(human_readable_output, f, indent=4) propogate_area(tree) enrich_names_with_sec(tree) diff --git a/ydb/ci/build_bloat/ydb_upload.py b/ydb/ci/build_bloat/ydb_upload.py index c4f235c68dde..0ac50cb10a3a 100755 --- a/ydb/ci/build_bloat/ydb_upload.py +++ b/ydb/ci/build_bloat/ydb_upload.py @@ -29,15 +29,23 @@ "id", "git_commit_message", "path", + "sub_path", ] DATETIME_COLUMNS = [ "git_commit_time", ] -UINT64_COLUMNS = [] +UINT64_COLUMNS = [ + "inclusion_count", +] -DOUBLE_COLUMNS = ["total_compilation_time_s", "compilation_time_s"] +DOUBLE_COLUMNS = [ + "total_compilation_time_s", + "compilation_time_s", + "mean_compilation_time_s", + "total_time_s", +] ALL_COLUMNS = UTF8_COLUMNS + DATETIME_COLUMNS + UINT64_COLUMNS @@ -125,8 +133,12 @@ def main(): with open(os.path.join(args.html_dir_cpp, "output.json")) as f: cpp_stats = json.load(f) + with open(os.path.join(args.html_dir_headers, "output.json")) as f: + header_stats = json.load(f) + rows = [] + # upload into cpp_compile_time for entry in cpp_stats["cpp_compilation_times"]: path = entry["path"] time_s = entry["time_s"] @@ -142,6 +154,7 @@ def main(): DATABASE_PATH + "/code-agility/cpp_compile_time", rows, generate_column_types(row) ) + # upload into total_compile_time row = copy.copy(common_parameters) row["id"] = str(uuid.uuid4()) row["total_compilation_time_s"] = cpp_stats["total_compilation_time"] @@ -150,6 +163,49 @@ def main(): DATABASE_PATH + "/code-agility/total_compile_time", [row], generate_column_types(row) ) + # upload into headers_impact + rows = [] + for entry in header_stats["headers_compile_duration"]: + path = entry["path"] + inclusion_count = entry["inclusion_count"] + mean_compilation_time_s = entry["mean_compilation_time_s"] + row = copy.copy(common_parameters) + row["id"] = str(uuid.uuid4()) + row["path"] = sanitize_str(path) + row["mean_compilation_time_s"] = mean_compilation_time_s + row["inclusion_count"] = inclusion_count + rows.append(copy.copy(row)) + + if rows: + row = rows[0] + driver.table_client.bulk_upsert( + DATABASE_PATH + "/code-agility/headers_impact", rows, generate_column_types(row) + ) + + # upload into compile_breakdown + rows = [] + for path in header_stats["time_breakdown"]: + entry = header_stats["time_breakdown"][path] + for sub_entry in entry: + sub_path = sub_entry["path"] + inclusion_count = sub_entry["inclusion_count"] + total_time_s = sub_entry["total_time_s"] + + row = copy.copy(common_parameters) + row["id"] = str(uuid.uuid4()) + row["path"] = path + row["sub_path"] = sub_path + row["inclusion_count"] = inclusion_count + row["total_time_s"] = total_time_s + + rows.append(copy.copy(row)) + + if rows: + row = rows[0] + driver.table_client.bulk_upsert( + DATABASE_PATH + "/code-agility/compile_breakdown", rows, generate_column_types(row) + ) + if __name__ == "__main__": exit(main())