From d6e4134f03fd5362062c62e44cd07f7b4ae2fbc5 Mon Sep 17 00:00:00 2001 From: Maxim Yurchuk Date: Tue, 28 May 2024 09:04:01 +0000 Subject: [PATCH 01/16] ydb upload --- ydb/ci/build_bloat/main.py | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/ydb/ci/build_bloat/main.py b/ydb/ci/build_bloat/main.py index fa8f606933aa..9f8dd2b40342 100755 --- a/ydb/ci/build_bloat/main.py +++ b/ydb/ci/build_bloat/main.py @@ -144,7 +144,7 @@ def gather_time_traces(build_output_dir: str) -> list[str]: return time_trace_paths -def generate_cpp_bloat(build_output_dir: str) -> dict: +def generate_cpp_bloat(build_output_dir: str, result_dir: str) -> dict: time_trace_paths = gather_time_traces(build_output_dir) result = [] @@ -157,6 +157,8 @@ def generate_cpp_bloat(build_output_dir: str) -> dict: result.sort() tree = {"name": "/"} + + cpp_compilation_times_ms = [] for duration, path, time_trace_path in result: splitted = path.split(os.sep) @@ -222,7 +224,7 @@ def parse_includes(path: str) -> list[tuple[int, str]]: return path_to_time -def generate_header_bloat(build_output_dir: str) -> dict: +def generate_header_bloat(build_output_dir: str, result_dir: str) -> dict: time_trace_paths = gather_time_traces(build_output_dir) path_to_stat = {} # header path -> (total_duration, count) @@ -299,11 +301,14 @@ def main(): if args.html_dir_cpp: actions.append(("header build time impact", generate_header_bloat, args.html_dir_headers)) + current_script_dir = os.path.dirname(os.path.realpath(__file__)) + html_dir = os.path.join(current_script_dir, "html") + for description, fn, output_path in actions: print("Performing '{}'".format(description)) - tree = fn(args.build_dir) + tree = fn(args.build_dir, output_path) - shutil.copytree("html", output_path, dirs_exist_ok=True) + shutil.copytree(html_dir, output_path, dirs_exist_ok=True) with open(os.path.join(output_path, "bloat.json"), "w") as f: f.write("var kTree = ") json.dump(tree, f, indent=4) From 56956838bc7149bae1536592bbd525950acf9fd7 Mon Sep 17 00:00:00 2001 From: Maxim Yurchuk Date: Tue, 28 May 2024 11:11:38 +0000 Subject: [PATCH 02/16] fix --- ydb/ci/build_bloat/main.py | 32 ++++++++++++++++++++++---------- 1 file changed, 22 insertions(+), 10 deletions(-) diff --git a/ydb/ci/build_bloat/main.py b/ydb/ci/build_bloat/main.py index 9f8dd2b40342..dcdaef9273b3 100755 --- a/ydb/ci/build_bloat/main.py +++ b/ydb/ci/build_bloat/main.py @@ -10,14 +10,6 @@ def sanitize_path(path: str, base_dir: str) -> str: - prefixes_to_remove = [ - base_dir, - os.path.abspath(base_dir), - ] - - for prefix in prefixes_to_remove: - path = path.removeprefix(prefix) - ya_build_path_chunk = ".ya/build/build_root" if ya_build_path_chunk in path: # remove path to before .ya @@ -27,8 +19,15 @@ def sanitize_path(path: str, base_dir: str) -> str: splitted = path.split(os.sep) del splitted[3:5] path = os.sep.join(splitted) + else: + # dirty hack: remove all before ydb (repo name) including ydb + ydb_repo_chunk = "ydb/" + if ydb_repo_chunk in path: + # remove path to before ydb with ydb + path = path[path.find(ydb_repo_chunk) + len(ydb_repo_chunk) :] + - return "root" + "/" + path + return "ydb/" + path def get_compile_duration_and_cpp_path(time_trace_path: str) -> tuple[float, str, str]: @@ -158,7 +157,7 @@ def generate_cpp_bloat(build_output_dir: str, result_dir: str) -> dict: tree = {"name": "/"} - cpp_compilation_times_ms = [] + cpp_compilation_times = [] for duration, path, time_trace_path in result: splitted = path.split(os.sep) @@ -169,7 +168,20 @@ def generate_cpp_bloat(build_output_dir: str, result_dir: str) -> dict: additional_chunks = list(zip(inc_path, "h" * len(inc_path))) add_to_tree(chunks + additional_chunks, inc_duration / 1000, tree) print("{} -> {:.2f}s".format(path, duration)) + cpp_compilation_times.append({ + "path": path, + "time_s": duration, + }) + os.makedirs(result_dir, exist_ok=True) + + human_readable_output = { + "cpp_compilation_times": cpp_compilation_times, + } + + with open(os.path.join(result_dir, "output.json"), "w") as f: + json.dump(human_readable_output, f, indent=4) + propogate_area(tree) enrich_names_with_sec(tree) From fca353fcebd6e573f53fd4e9685a7ca8d8312024 Mon Sep 17 00:00:00 2001 From: Maxim Yurchuk Date: Tue, 28 May 2024 13:37:28 +0000 Subject: [PATCH 03/16] wip --- ydb/ci/build_bloat/main.py | 3 + ydb/ci/build_bloat/ydb_upload.py | 130 +++++++++++++++++++++++++++++++ 2 files changed, 133 insertions(+) create mode 100755 ydb/ci/build_bloat/ydb_upload.py diff --git a/ydb/ci/build_bloat/main.py b/ydb/ci/build_bloat/main.py index dcdaef9273b3..73a032f7b0fc 100755 --- a/ydb/ci/build_bloat/main.py +++ b/ydb/ci/build_bloat/main.py @@ -158,6 +158,7 @@ def generate_cpp_bloat(build_output_dir: str, result_dir: str) -> dict: tree = {"name": "/"} cpp_compilation_times = [] + total_compilation_time = 0.0 for duration, path, time_trace_path in result: splitted = path.split(os.sep) @@ -172,10 +173,12 @@ def generate_cpp_bloat(build_output_dir: str, result_dir: str) -> dict: "path": path, "time_s": duration, }) + total_compilation_time += duration os.makedirs(result_dir, exist_ok=True) human_readable_output = { + "total_compilation_time": total_compilation_time, "cpp_compilation_times": cpp_compilation_times, } diff --git a/ydb/ci/build_bloat/ydb_upload.py b/ydb/ci/build_bloat/ydb_upload.py new file mode 100755 index 000000000000..477e47c09054 --- /dev/null +++ b/ydb/ci/build_bloat/ydb_upload.py @@ -0,0 +1,130 @@ +#!/usr/bin/env python3 + +import copy +import datetime +import json +import os +import ydb +import uuid +import subprocess + +DATABASE_PATH = "/ru-central1/b1ggceeul2pkher8vhb6/etn6d1qbals0c29ho4lf" + +FROM_ENV_COLUMNS = [ + "github_head_ref", + "github_workflow", + "github_workflow_ref", + "github_sha", + "github_repository", + "github_event_name", + "github_ref_type", + "github_ref_name", + "github_ref", +] + +UTF8_COLUMNS = FROM_ENV_COLUMNS + [ + "id", + "git_commit_message", + "path", + "build_preset", +] + +DATETIME_COLUMNS = [ + "git_commit_time", +] + +UINT64_COLUMNS = [ +] + +DOUBLE_COLUMNS = [ + "compilation_time_s" +] + +ALL_COLUMNS = UTF8_COLUMNS + DATETIME_COLUMNS + UINT64_COLUMNS + + +def sanitize_str(s): + return s or "N\\A" + # YDB SDK expects bytes for 'String' columns + if s is None: + s = "N\A" + return s.encode("utf-8") + + +def main(): + if "CI_YDB_SERVICE_ACCOUNT_KEY_FILE_CREDENTIALS" not in os.environ: + print("Env variable CI_YDB_SERVICE_ACCOUNT_KEY_FILE_CREDENTIALS is missing, skipping") + return 1 + + # Do not set up 'real' variable from gh workflows because it interfere with ydb tests + # So, set up it locally + os.environ["YDB_SERVICE_ACCOUNT_KEY_FILE_CREDENTIALS"] = os.environ["CI_YDB_SERVICE_ACCOUNT_KEY_FILE_CREDENTIALS"] + + with ydb.Driver( + endpoint="grpcs://ydb.serverless.yandexcloud.net:2135", + database=DATABASE_PATH, + credentials=ydb.credentials_from_env_variables() + ) as driver: + driver.wait(timeout=10, fail_fast=True) + session = ydb.retry_operation_sync( + lambda: driver.table_client.session().create() + ) + + column_types = ydb.BulkUpsertColumns() + for type_ in UTF8_COLUMNS: + column_types = column_types.add_column(type_, ydb.PrimitiveType.Utf8) + for type_ in UINT64_COLUMNS: + column_types = column_types.add_column(type_, ydb.PrimitiveType.Uint64) + for type_ in DATETIME_COLUMNS: + column_types = column_types.add_column(type_, ydb.PrimitiveType.Datetime) + for type_ in DOUBLE_COLUMNS: + column_types = column_types.add_column(type_, ydb.PrimitiveType.Double) + + build_preset = os.environ.get("build_preset", None) + github_sha = os.environ.get("GITHUB_SHA", None) + + if github_sha is not None: + git_commit_time_bytes = subprocess.check_output( + ["git", "show", "--no-patch", "--format=%cI", github_sha] + ) + git_commit_message_bytes = subprocess.check_output( + ["git", "log", "--format=%s", "-n", "1", github_sha] + ) + git_commit_time = datetime.datetime.fromisoformat( + git_commit_time_bytes.decode("utf-8").strip() + ) + git_commit_message = git_commit_message_bytes.decode("utf-8").strip() + git_commit_time_unix = int(git_commit_time.timestamp()) + else: + git_commit_time = None + git_commit_message = None + git_commit_time_unix = 0 + + common_parameters = { + "build_preset": sanitize_str(build_preset), + "git_commit_time": git_commit_time_unix, + "git_commit_message": sanitize_str(git_commit_message), + } + + for column in FROM_ENV_COLUMNS: + value = os.environ.get(column.upper(), None) + common_parameters[column] = sanitize_str(value) + + with open("html_cpp_impact/output.json") as f: + cpp_stats = json.load(f) + + rows = [] + + for entry in cpp_stats["cpp_compilation_times"]: + path = entry["path"] + time_s = entry["time_s"] + parameters["path"] = sanitize_str(path) + parameters["compilation_time_s"] = time_s + parameters["id"] = str(uuid.uuid4()) + rows.append(copy.copy(parameters)) + + driver.table_client.bulk_upsert(DATABASE_PATH + "/cpp_compile_time", rows, column_types) + + +if __name__ == "__main__": + exit(main()) From 0d915ce11130e7d549d2e99e977f54d87ed3227b Mon Sep 17 00:00:00 2001 From: Maxim Yurchuk Date: Tue, 28 May 2024 14:44:34 +0000 Subject: [PATCH 04/16] wip --- .github/actions/build_analytics/action.yml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.github/actions/build_analytics/action.yml b/.github/actions/build_analytics/action.yml index 2813fb4b62ac..8d0d4bde4098 100644 --- a/.github/actions/build_analytics/action.yml +++ b/.github/actions/build_analytics/action.yml @@ -33,8 +33,11 @@ runs: # FIXME: target name may be not the same as dir name export TARGET_NAME=`basename ${{ inputs.build_target }}` export TARGET_DIR=${{ inputs.build_target }} + export build_preset="${{ inputs.build_preset }}" + export build_target="${{ inputs.build_target }}" ./ya tool bloat --linker-map $TARGET_DIR/$TARGET_NAME.map.lld --input $TARGET_DIR/$TARGET_NAME --save-html ya_bloat_html ./ydb/ci/build_bloat/main.py --build-dir . --html-dir-cpp html_cpp_impact --html-dir-headers html_headers_impact + ./ydb/ci/build_bloat/upload_ydb.py --html-dir-cpp html_cpp_impact --html-dir-headers html_headers_impact - name: Upload results shell: bash From eefc12117634403bb3cfdb3ed79731cc062def48 Mon Sep 17 00:00:00 2001 From: Maxim Yurchuk Date: Tue, 28 May 2024 14:45:16 +0000 Subject: [PATCH 05/16] wip --- ydb/ci/build_bloat/ydb_upload.py | 41 ++++++++++++++++++++++---------- 1 file changed, 29 insertions(+), 12 deletions(-) diff --git a/ydb/ci/build_bloat/ydb_upload.py b/ydb/ci/build_bloat/ydb_upload.py index 477e47c09054..bd0e02df9832 100755 --- a/ydb/ci/build_bloat/ydb_upload.py +++ b/ydb/ci/build_bloat/ydb_upload.py @@ -37,6 +37,7 @@ ] DOUBLE_COLUMNS = [ + "total_compilation_time_s", "compilation_time_s" ] @@ -45,10 +46,21 @@ def sanitize_str(s): return s or "N\\A" - # YDB SDK expects bytes for 'String' columns - if s is None: - s = "N\A" - return s.encode("utf-8") + +def generate_column_types(row): + column_types = ydb.BulkUpsertColumns() + for column_name in row: + if column_name in UTF8_COLUMNS: + column_types = column_types.add_column(column_name, ydb.PrimitiveType.Utf8) + elif column_name in UINT64_COLUMNS: + column_types = column_types.add_column(column_name, ydb.PrimitiveType.Uint64) + elif column_name in DOUBLE_COLUMNS: + column_types = column_types.add_column(column_name, ydb.PrimitiveType.Double) + elif column_name in DATETIME_COLUMNS: + column_types = column_types.add_column(column_name, ydb.PrimitiveType.Datetime) + else: + assert False + return column_types def main(): @@ -66,9 +78,6 @@ def main(): credentials=ydb.credentials_from_env_variables() ) as driver: driver.wait(timeout=10, fail_fast=True) - session = ydb.retry_operation_sync( - lambda: driver.table_client.session().create() - ) column_types = ydb.BulkUpsertColumns() for type_ in UTF8_COLUMNS: @@ -118,13 +127,21 @@ def main(): for entry in cpp_stats["cpp_compilation_times"]: path = entry["path"] time_s = entry["time_s"] - parameters["path"] = sanitize_str(path) - parameters["compilation_time_s"] = time_s - parameters["id"] = str(uuid.uuid4()) - rows.append(copy.copy(parameters)) + row = copy.copy(common_parameters) + row["path"] = sanitize_str(path) + row["compilation_time_s"] = time_s + row["id"] = str(uuid.uuid4()) + rows.append(copy.copy(row)) + + if rows: + row = rows[0] + driver.table_client.bulk_upsert(DATABASE_PATH + "/cpp_compile_time", rows, generate_column_types(row)) - driver.table_client.bulk_upsert(DATABASE_PATH + "/cpp_compile_time", rows, column_types) + row = copy.copy(common_parameters) + row["id"] = str(uuid.uuid4()) + row["total_compilation_time_s"] = cpp_stats["total_compilation_time"] + driver.table_client.bulk_upsert(DATABASE_PATH + "/total_compile_time", [row], generate_column_types(row)) if __name__ == "__main__": exit(main()) From 62f32deafcfae312db43efb1d81dd084068f146a Mon Sep 17 00:00:00 2001 From: Maxim Yurchuk Date: Tue, 28 May 2024 16:09:14 +0000 Subject: [PATCH 06/16] fix --- ydb/ci/build_bloat/ydb_upload.py | 65 ++++++++++++++------------------ 1 file changed, 29 insertions(+), 36 deletions(-) diff --git a/ydb/ci/build_bloat/ydb_upload.py b/ydb/ci/build_bloat/ydb_upload.py index bd0e02df9832..41ab38ad2cdc 100755 --- a/ydb/ci/build_bloat/ydb_upload.py +++ b/ydb/ci/build_bloat/ydb_upload.py @@ -11,35 +11,32 @@ DATABASE_PATH = "/ru-central1/b1ggceeul2pkher8vhb6/etn6d1qbals0c29ho4lf" FROM_ENV_COLUMNS = [ - "github_head_ref", - "github_workflow", - "github_workflow_ref", - "github_sha", - "github_repository", - "github_event_name", - "github_ref_type", - "github_ref_name", - "github_ref", + "GITHUB_HEAD_REF", + "GITHUB_WORKFLOW", + "GITHUB_WORKFLOW_REF", + "GITHUB_SHA", + "GITHUB_REPOSITORY", + "GITHUB_EVENT_NAME", + "GITHUB_REF_TYPE", + "GITHUB_REF_NAME", + "GITHUB_REF", + "build_preset", + "build_target", ] -UTF8_COLUMNS = FROM_ENV_COLUMNS + [ +UTF8_COLUMNS = [val.lower() for val in FROM_ENV_COLUMNS] + [ "id", "git_commit_message", "path", - "build_preset", ] DATETIME_COLUMNS = [ "git_commit_time", ] -UINT64_COLUMNS = [ -] +UINT64_COLUMNS = [] -DOUBLE_COLUMNS = [ - "total_compilation_time_s", - "compilation_time_s" -] +DOUBLE_COLUMNS = ["total_compilation_time_s", "compilation_time_s"] ALL_COLUMNS = UTF8_COLUMNS + DATETIME_COLUMNS + UINT64_COLUMNS @@ -47,6 +44,7 @@ def sanitize_str(s): return s or "N\\A" + def generate_column_types(row): column_types = ydb.BulkUpsertColumns() for column_name in row: @@ -67,15 +65,15 @@ def main(): if "CI_YDB_SERVICE_ACCOUNT_KEY_FILE_CREDENTIALS" not in os.environ: print("Env variable CI_YDB_SERVICE_ACCOUNT_KEY_FILE_CREDENTIALS is missing, skipping") return 1 - - # Do not set up 'real' variable from gh workflows because it interfere with ydb tests + + # Do not set up 'real' variable from gh workflows because it interfere with ydb tests # So, set up it locally os.environ["YDB_SERVICE_ACCOUNT_KEY_FILE_CREDENTIALS"] = os.environ["CI_YDB_SERVICE_ACCOUNT_KEY_FILE_CREDENTIALS"] with ydb.Driver( endpoint="grpcs://ydb.serverless.yandexcloud.net:2135", database=DATABASE_PATH, - credentials=ydb.credentials_from_env_variables() + credentials=ydb.credentials_from_env_variables(), ) as driver: driver.wait(timeout=10, fail_fast=True) @@ -93,22 +91,16 @@ def main(): github_sha = os.environ.get("GITHUB_SHA", None) if github_sha is not None: - git_commit_time_bytes = subprocess.check_output( - ["git", "show", "--no-patch", "--format=%cI", github_sha] - ) - git_commit_message_bytes = subprocess.check_output( - ["git", "log", "--format=%s", "-n", "1", github_sha] - ) - git_commit_time = datetime.datetime.fromisoformat( - git_commit_time_bytes.decode("utf-8").strip() - ) + git_commit_time_bytes = subprocess.check_output(["git", "show", "--no-patch", "--format=%cI", github_sha]) + git_commit_message_bytes = subprocess.check_output(["git", "log", "--format=%s", "-n", "1", github_sha]) + git_commit_time = datetime.datetime.fromisoformat(git_commit_time_bytes.decode("utf-8").strip()) git_commit_message = git_commit_message_bytes.decode("utf-8").strip() git_commit_time_unix = int(git_commit_time.timestamp()) else: git_commit_time = None git_commit_message = None git_commit_time_unix = 0 - + common_parameters = { "build_preset": sanitize_str(build_preset), "git_commit_time": git_commit_time_unix, @@ -116,8 +108,8 @@ def main(): } for column in FROM_ENV_COLUMNS: - value = os.environ.get(column.upper(), None) - common_parameters[column] = sanitize_str(value) + value = os.environ.get(column, None) + common_parameters[column.lower()] = sanitize_str(value) with open("html_cpp_impact/output.json") as f: cpp_stats = json.load(f) @@ -132,16 +124,17 @@ def main(): row["compilation_time_s"] = time_s row["id"] = str(uuid.uuid4()) rows.append(copy.copy(row)) - + if rows: row = rows[0] - driver.table_client.bulk_upsert(DATABASE_PATH + "/cpp_compile_time", rows, generate_column_types(row)) - + driver.table_client.bulk_upsert(DATABASE_PATH + "/code-agility/cpp_compile_time", rows, generate_column_types(row)) + row = copy.copy(common_parameters) row["id"] = str(uuid.uuid4()) row["total_compilation_time_s"] = cpp_stats["total_compilation_time"] - driver.table_client.bulk_upsert(DATABASE_PATH + "/total_compile_time", [row], generate_column_types(row)) + driver.table_client.bulk_upsert(DATABASE_PATH + "/code-agility/total_compile_time", [row], generate_column_types(row)) + if __name__ == "__main__": exit(main()) From c6eccf63a674a35b52c31f1652a44614df1bfe3a Mon Sep 17 00:00:00 2001 From: Maxim Yurchuk Date: Tue, 28 May 2024 16:16:08 +0000 Subject: [PATCH 07/16] argparse --- ydb/ci/build_bloat/ydb_upload.py | 23 ++++++++++++++++++++++- 1 file changed, 22 insertions(+), 1 deletion(-) diff --git a/ydb/ci/build_bloat/ydb_upload.py b/ydb/ci/build_bloat/ydb_upload.py index 41ab38ad2cdc..f2e45de0ecf6 100755 --- a/ydb/ci/build_bloat/ydb_upload.py +++ b/ydb/ci/build_bloat/ydb_upload.py @@ -1,5 +1,6 @@ #!/usr/bin/env python3 +import argparse import copy import datetime import json @@ -61,7 +62,27 @@ def generate_column_types(row): return column_types +def parse_args(): + parser = argparse.ArgumentParser() + parser.add_argument( + "-c", + "--html-dir-cpp", + required=True, + help="Path to treemap view of compilation times", + ) + parser.add_argument( + "-i", + "--html-dir-headers", + required=False, + default="html_headers_impact", + help="Path to treemap view of headers impact on cpp compilation", + ) + return parser.parse_args() + + def main(): + args = parse_args() + if "CI_YDB_SERVICE_ACCOUNT_KEY_FILE_CREDENTIALS" not in os.environ: print("Env variable CI_YDB_SERVICE_ACCOUNT_KEY_FILE_CREDENTIALS is missing, skipping") return 1 @@ -111,7 +132,7 @@ def main(): value = os.environ.get(column, None) common_parameters[column.lower()] = sanitize_str(value) - with open("html_cpp_impact/output.json") as f: + with open(os.path.join(args.html_dir_cpp, "output.json")) as f: cpp_stats = json.load(f) rows = [] From bd9ff7482f4be8543f1c999507a6f9a3748c85ca Mon Sep 17 00:00:00 2001 From: Maxim Yurchuk Date: Wed, 29 May 2024 16:21:46 +0000 Subject: [PATCH 08/16] fix --- .github/actions/build_analytics/action.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/actions/build_analytics/action.yml b/.github/actions/build_analytics/action.yml index 8d0d4bde4098..b6a4e105ecc9 100644 --- a/.github/actions/build_analytics/action.yml +++ b/.github/actions/build_analytics/action.yml @@ -37,7 +37,7 @@ runs: export build_target="${{ inputs.build_target }}" ./ya tool bloat --linker-map $TARGET_DIR/$TARGET_NAME.map.lld --input $TARGET_DIR/$TARGET_NAME --save-html ya_bloat_html ./ydb/ci/build_bloat/main.py --build-dir . --html-dir-cpp html_cpp_impact --html-dir-headers html_headers_impact - ./ydb/ci/build_bloat/upload_ydb.py --html-dir-cpp html_cpp_impact --html-dir-headers html_headers_impact + ./ydb/ci/build_bloat/ydb_upload.py --html-dir-cpp html_cpp_impact --html-dir-headers html_headers_impact - name: Upload results shell: bash From f6770f2453bec4b7793294da74acf807c3009cdf Mon Sep 17 00:00:00 2001 From: Maxim Yurchuk Date: Wed, 29 May 2024 16:46:54 +0000 Subject: [PATCH 09/16] remove uneeded --- ydb/ci/build_bloat/ydb_upload.py | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/ydb/ci/build_bloat/ydb_upload.py b/ydb/ci/build_bloat/ydb_upload.py index f2e45de0ecf6..8b1dc6b10501 100755 --- a/ydb/ci/build_bloat/ydb_upload.py +++ b/ydb/ci/build_bloat/ydb_upload.py @@ -98,16 +98,6 @@ def main(): ) as driver: driver.wait(timeout=10, fail_fast=True) - column_types = ydb.BulkUpsertColumns() - for type_ in UTF8_COLUMNS: - column_types = column_types.add_column(type_, ydb.PrimitiveType.Utf8) - for type_ in UINT64_COLUMNS: - column_types = column_types.add_column(type_, ydb.PrimitiveType.Uint64) - for type_ in DATETIME_COLUMNS: - column_types = column_types.add_column(type_, ydb.PrimitiveType.Datetime) - for type_ in DOUBLE_COLUMNS: - column_types = column_types.add_column(type_, ydb.PrimitiveType.Double) - build_preset = os.environ.get("build_preset", None) github_sha = os.environ.get("GITHUB_SHA", None) From 0c059e7cfd962c3b109272df8e95407f45c3c3c2 Mon Sep 17 00:00:00 2001 From: Maxim Yurchuk Date: Thu, 30 May 2024 11:27:40 +0000 Subject: [PATCH 10/16] fixes --- ydb/ci/build_bloat/main.py | 34 +++++++++++++++++----------------- 1 file changed, 17 insertions(+), 17 deletions(-) diff --git a/ydb/ci/build_bloat/main.py b/ydb/ci/build_bloat/main.py index 73a032f7b0fc..e1a33c9397c7 100755 --- a/ydb/ci/build_bloat/main.py +++ b/ydb/ci/build_bloat/main.py @@ -9,7 +9,7 @@ HEADER_COMPILE_TIME_TO_SHOW = 0.5 # sec -def sanitize_path(path: str, base_dir: str) -> str: +def sanitize_path(path: str, base_src_dir: str) -> str: ya_build_path_chunk = ".ya/build/build_root" if ya_build_path_chunk in path: # remove path to before .ya @@ -20,14 +20,11 @@ def sanitize_path(path: str, base_dir: str) -> str: del splitted[3:5] path = os.sep.join(splitted) else: - # dirty hack: remove all before ydb (repo name) including ydb - ydb_repo_chunk = "ydb/" - if ydb_repo_chunk in path: - # remove path to before ydb with ydb - path = path[path.find(ydb_repo_chunk) + len(ydb_repo_chunk) :] + if not base_src_dir.endswith("/"): + base_src_dir += "/" + path = path.removeprefix(base_src_dir) - - return "ydb/" + path + return "src/" + path def get_compile_duration_and_cpp_path(time_trace_path: str) -> tuple[float, str, str]: @@ -90,7 +87,7 @@ def enrich_names_with_sec(tree): tree["name"] = tree["name"] + " " + "{:_} ms".format(tree["data"]["$area"]) -def build_include_tree(path: str, build_output_dir: str) -> list: +def build_include_tree(path: str, build_output_dir: str, base_src_dir: str) -> list: with open(path) as f: obj = json.load(f) @@ -120,11 +117,11 @@ def build_include_tree(path: str, build_output_dir: str) -> list: path_to_time[last_path] = prev + (time_stamp - last_time_stamp) / 1000 / 1000 if ev == 1: - current_includes_stack.append(sanitize_path(path, build_output_dir)) + current_includes_stack.append(sanitize_path(path, base_src_dir)) if duration > HEADER_COMPILE_TIME_TO_SHOW * 1000 * 1000: result.append((current_includes_stack[:], duration)) else: - assert current_includes_stack[-1] == sanitize_path(path, build_output_dir) + assert current_includes_stack[-1] == sanitize_path(path, base_src_dir) current_includes_stack.pop() last_time_stamp = time_stamp @@ -143,14 +140,14 @@ def gather_time_traces(build_output_dir: str) -> list[str]: return time_trace_paths -def generate_cpp_bloat(build_output_dir: str, result_dir: str) -> dict: +def generate_cpp_bloat(build_output_dir: str, result_dir: str, base_src_dir: str) -> dict: time_trace_paths = gather_time_traces(build_output_dir) result = [] with ProcessPoolExecutor() as executor: res = executor.map(get_compile_duration_and_cpp_path, time_trace_paths) for duration, path, time_trace_path in res: - path = sanitize_path(path, base_dir=build_output_dir) + path = sanitize_path(path, base_src_dir) result.append((duration, path, time_trace_path)) result.sort() @@ -164,7 +161,7 @@ def generate_cpp_bloat(build_output_dir: str, result_dir: str) -> dict: splitted = path.split(os.sep) chunks = list(zip(splitted, (len(splitted) - 1) * ["dir"] + ["cpp"])) add_to_tree(chunks, int(duration * 1000), tree) - include_tree = build_include_tree(time_trace_path, build_output_dir) + include_tree = build_include_tree(time_trace_path, build_output_dir, base_src_dir) for inc_path, inc_duration in include_tree: additional_chunks = list(zip(inc_path, "h" * len(inc_path))) add_to_tree(chunks + additional_chunks, inc_duration / 1000, tree) @@ -239,7 +236,7 @@ def parse_includes(path: str) -> list[tuple[int, str]]: return path_to_time -def generate_header_bloat(build_output_dir: str, result_dir: str) -> dict: +def generate_header_bloat(build_output_dir: str, result_dir: str, base_src_dir: str) -> dict: time_trace_paths = gather_time_traces(build_output_dir) path_to_stat = {} # header path -> (total_duration, count) @@ -247,7 +244,7 @@ def generate_header_bloat(build_output_dir: str, result_dir: str) -> dict: res = executor.map(parse_includes, time_trace_paths) for fn_res in res: for path, duration in fn_res.items(): - path = sanitize_path(path, build_output_dir) + path = sanitize_path(path, base_src_dir) if path not in path_to_stat: path_to_stat[path] = [0, 0] path_to_stat[path][0] += duration @@ -317,11 +314,14 @@ def main(): actions.append(("header build time impact", generate_header_bloat, args.html_dir_headers)) current_script_dir = os.path.dirname(os.path.realpath(__file__)) + base_src_dir = os.path.normpath(os.path.join(current_script_dir, "../../..")) + # check we a in root of source tree + assert os.path.isfile(os.path.join(base_src_dir, "AUTHORS")) html_dir = os.path.join(current_script_dir, "html") for description, fn, output_path in actions: print("Performing '{}'".format(description)) - tree = fn(args.build_dir, output_path) + tree = fn(args.build_dir, output_path, base_src_dir) shutil.copytree(html_dir, output_path, dirs_exist_ok=True) with open(os.path.join(output_path, "bloat.json"), "w") as f: From 3ee9bbc29b1c855094fcfe5d7f4b586404eee755 Mon Sep 17 00:00:00 2001 From: Maxim Yurchuk Date: Thu, 30 May 2024 15:29:28 +0000 Subject: [PATCH 11/16] black --- ydb/ci/build_bloat/main.py | 16 +++++++++------- ydb/ci/build_bloat/ydb_upload.py | 8 ++++++-- 2 files changed, 15 insertions(+), 9 deletions(-) diff --git a/ydb/ci/build_bloat/main.py b/ydb/ci/build_bloat/main.py index e1a33c9397c7..2e0f831c0d83 100755 --- a/ydb/ci/build_bloat/main.py +++ b/ydb/ci/build_bloat/main.py @@ -153,8 +153,8 @@ def generate_cpp_bloat(build_output_dir: str, result_dir: str, base_src_dir: str result.sort() tree = {"name": "/"} - - cpp_compilation_times = [] + + cpp_compilation_times = [] total_compilation_time = 0.0 for duration, path, time_trace_path in result: @@ -166,10 +166,12 @@ def generate_cpp_bloat(build_output_dir: str, result_dir: str, base_src_dir: str additional_chunks = list(zip(inc_path, "h" * len(inc_path))) add_to_tree(chunks + additional_chunks, inc_duration / 1000, tree) print("{} -> {:.2f}s".format(path, duration)) - cpp_compilation_times.append({ - "path": path, - "time_s": duration, - }) + cpp_compilation_times.append( + { + "path": path, + "time_s": duration, + } + ) total_compilation_time += duration os.makedirs(result_dir, exist_ok=True) @@ -181,7 +183,7 @@ def generate_cpp_bloat(build_output_dir: str, result_dir: str, base_src_dir: str with open(os.path.join(result_dir, "output.json"), "w") as f: json.dump(human_readable_output, f, indent=4) - + propogate_area(tree) enrich_names_with_sec(tree) diff --git a/ydb/ci/build_bloat/ydb_upload.py b/ydb/ci/build_bloat/ydb_upload.py index 8b1dc6b10501..c4f235c68dde 100755 --- a/ydb/ci/build_bloat/ydb_upload.py +++ b/ydb/ci/build_bloat/ydb_upload.py @@ -138,13 +138,17 @@ def main(): if rows: row = rows[0] - driver.table_client.bulk_upsert(DATABASE_PATH + "/code-agility/cpp_compile_time", rows, generate_column_types(row)) + driver.table_client.bulk_upsert( + DATABASE_PATH + "/code-agility/cpp_compile_time", rows, generate_column_types(row) + ) row = copy.copy(common_parameters) row["id"] = str(uuid.uuid4()) row["total_compilation_time_s"] = cpp_stats["total_compilation_time"] - driver.table_client.bulk_upsert(DATABASE_PATH + "/code-agility/total_compile_time", [row], generate_column_types(row)) + driver.table_client.bulk_upsert( + DATABASE_PATH + "/code-agility/total_compile_time", [row], generate_column_types(row) + ) if __name__ == "__main__": From 29603a20caabcd8a6caf425a8764de474f63093b Mon Sep 17 00:00:00 2001 From: Maxim Yurchuk Date: Mon, 3 Jun 2024 13:30:15 +0000 Subject: [PATCH 12/16] wip --- ydb/ci/build_bloat/main.py | 110 ++++++++++++++++++++++++++++++------- 1 file changed, 89 insertions(+), 21 deletions(-) diff --git a/ydb/ci/build_bloat/main.py b/ydb/ci/build_bloat/main.py index 2e0f831c0d83..0cba745fc9f2 100755 --- a/ydb/ci/build_bloat/main.py +++ b/ydb/ci/build_bloat/main.py @@ -2,6 +2,7 @@ import argparse import json +from functools import partial import os import shutil from concurrent.futures import ProcessPoolExecutor @@ -10,7 +11,9 @@ def sanitize_path(path: str, base_src_dir: str) -> str: + ya_build_path_chunk = ".ya/build/build_root" + ya_tools_path_chunk = ".ya/tools" if ya_build_path_chunk in path: # remove path to before .ya path = path[path.find(ya_build_path_chunk) :] @@ -19,6 +22,14 @@ def sanitize_path(path: str, base_src_dir: str) -> str: splitted = path.split(os.sep) del splitted[3:5] path = os.sep.join(splitted) + elif ya_tools_path_chunk in path: + # remove path to before .ya + path = path[path.find(ya_tools_path_chunk) :] + + # remove temporary nodes dir names + splitted = path.split(os.sep) + del splitted[3] + path = os.sep.join(splitted) else: if not base_src_dir.endswith("/"): base_src_dir += "/" @@ -190,68 +201,95 @@ def generate_cpp_bloat(build_output_dir: str, result_dir: str, base_src_dir: str return tree -def main(build_output_dir, html_output): - tree = generate_time_bloat(build_output_dir=build_output_dir) - - shutil.copytree("html", html_output, dirs_exist_ok=True) - - with open(os.path.join(html_output, "bloat.json"), "w") as f: - f.write("var kTree = ") - json.dump(tree, f, indent=4) - - -def parse_includes(path: str) -> list[tuple[int, str]]: +def parse_includes(path: str, base_src_dir: str) -> tuple[list[tuple[int, str]], dict]: print("Processing includes in {}".format(path)) with open(path) as f: obj = json.load(f) + cpp_file = "N\\A" include_events = [] # (time, +-1, path) for event in obj["traceEvents"]: if event["name"] == "Source": path = event["args"]["detail"] + path = sanitize_path(path, base_src_dir) time_stamp = event["ts"] duration = event["dur"] include_events.append((time_stamp, +1, path)) include_events.append((time_stamp + duration, -1, path)) + if event["name"] == "OptModule": + cpp_file = event["args"]["detail"] + include_events.sort(key=lambda event: (event[0], -event[1])) path_to_time = {} - current_includes_stack = [] - last_time_stamp = None + current_includes_stack = [(cpp_file, 0)] + last_time_stamp = 0 + time_parts = {} # header/cpp -> (header -> (cnt, total time)) for time_stamp, ev, path in include_events: if current_includes_stack: - last_path = current_includes_stack[-1] + last_path, _ = current_includes_stack[-1] prev = path_to_time.get(last_path, 0) path_to_time[last_path] = prev + (time_stamp - last_time_stamp) / 1000 / 1000 if ev == 1: - current_includes_stack.append(path) + current_includes_stack.append((path, time_stamp)) else: - assert current_includes_stack[-1] == path + current_path, include_ts = current_includes_stack[-1] + assert current_path == path current_includes_stack.pop() + parent_path = current_includes_stack[-1][0] + if parent_path not in time_parts: + time_parts[parent_path] = {} + + if current_path not in time_parts[parent_path]: + time_parts[parent_path][current_path] = [0, 0] + + time_parts[parent_path][current_path][0] += 1 + time_parts[parent_path][current_path][1] += (time_stamp - include_ts) / 1000 / 1000 + last_time_stamp = time_stamp - return path_to_time + return path_to_time, time_parts def generate_header_bloat(build_output_dir: str, result_dir: str, base_src_dir: str) -> dict: time_trace_paths = gather_time_traces(build_output_dir) path_to_stat = {} # header path -> (total_duration, count) + total_time_parts = {} # header/cpp path -> (header -> (inclusion count, time spend) ) with ProcessPoolExecutor() as executor: - res = executor.map(parse_includes, time_trace_paths) - for fn_res in res: - for path, duration in fn_res.items(): - path = sanitize_path(path, base_src_dir) + fn = partial(parse_includes, base_src_dir=base_src_dir) + res = executor.map(fn, time_trace_paths) + for path_to_time, time_parts in res: + for path, duration in path_to_time.items(): if path not in path_to_stat: path_to_stat[path] = [0, 0] path_to_stat[path][0] += duration path_to_stat[path][1] += 1 + for path in time_parts: + if path not in total_time_parts: + total_time_parts[path] = {} + + for subpath in time_parts[path]: + if subpath not in total_time_parts[path]: + total_time_parts[path][subpath] = [0, 0] + + total_time_parts[path][subpath][0] += time_parts[path][subpath][0] + total_time_parts[path][subpath][1] += time_parts[path][subpath][1] + + for path in total_time_parts: + print("*** {}".format(path)) + for subpath in total_time_parts[path]: + count, total_time_ms = total_time_parts[path][subpath] + print(" {} -> total {:.2f}s (included {} times)".format(subpath, total_time_ms, count)) + print("") + + result = [] for path, (duration, cnt) in path_to_stat.items(): @@ -260,6 +298,8 @@ def generate_header_bloat(build_output_dir: str, result_dir: str, base_src_dir: tree = {} + headers_compile_duration = [] + for duration, cnt, path in result: path_chunks = path.split(os.sep) path_chunks[-1] = path_chunks[-1] + " (total {} times)".format(cnt) @@ -267,6 +307,34 @@ def generate_header_bloat(build_output_dir: str, result_dir: str, base_src_dir: chunks = list(zip(path_chunks, (path_chunks_count - 1) * ["dir"] + ["h"])) add_to_tree(chunks, int(duration * 1000), tree) print("{} -> {:.2f}s (aggregated {} times)".format(path, duration, cnt)) + headers_compile_duration.append({ + "path": path, + "inclusion_count": cnt, + "mean_compilation_time_s": duration / cnt, + }) + + time_parts = {} + + for path in total_time_parts: + time_part = [] + for subpath in total_time_parts[path]: + inclusion_count, total_s = total_time_parts[path][subpath] + time_part.append({ + "path": subpath, + "inclusion_count": inclusion_count, + "total_time_s": total_s, + }) + time_part.sort(key=lambda val: -val["total_time_s"]) + time_parts[path] = time_part + + + human_readable_output = { + "headers_compile_duration": headers_compile_duration, + "time_parts": time_parts, + } + + with open(os.path.join(result_dir, "output.json"), "w") as f: + json.dump(human_readable_output, f, indent=4) propogate_area(tree) enrich_names_with_sec(tree) From cb668065ed6d2de1fb611003d1c04001c6230af6 Mon Sep 17 00:00:00 2001 From: Maxim Yurchuk Date: Mon, 3 Jun 2024 13:34:41 +0000 Subject: [PATCH 13/16] Put build analytics results in GITHUB_STEP_SUMMARY --- .github/actions/build_analytics/action.yml | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/.github/actions/build_analytics/action.yml b/.github/actions/build_analytics/action.yml index b6a4e105ecc9..1576a8018899 100644 --- a/.github/actions/build_analytics/action.yml +++ b/.github/actions/build_analytics/action.yml @@ -44,11 +44,11 @@ runs: run: | set -ex s3cmd sync -r --acl-public --stats --no-progress --no-mime-magic --guess-mime-type --no-check-md5 "ya_bloat_html/" "$S3_BUCKET_PATH/ya_bloat_html/" - echo "ya_bloat=$S3_URL_PREFIX/ya_bloat_html/index.html" >> $GITHUB_OUTPUT - + echo '[ya bloat tool]($S3_URL_PREFIX/ya_bloat_html/index.html)' >> $GITHUB_STEP_SUMMARY + s3cmd sync -r --acl-public --stats --no-progress --no-mime-magic --guess-mime-type --no-check-md5 "html_cpp_impact/" "$S3_BUCKET_PATH/html_cpp_impact/" - echo "cpp_impact=$S3_URL_PREFIX/html_cpp_impact/index.html" >> $GITHUB_OUTPUT - + echo '[cpp compilation time]($S3_URL_PREFIX/html_cpp_impact/index.html)' >> $GITHUB_STEP_SUMMARY + s3cmd sync -r --acl-public --stats --no-progress --no-mime-magic --guess-mime-type --no-check-md5 "html_headers_impact/" "$S3_BUCKET_PATH/html_headers_impact/" - echo "headers_impact=$S3_URL_PREFIX/html_headers_impact/index.html" >> $GITHUB_OUTPUT + echo '[headers impact]($S3_URL_PREFIX/html_headers_impact/index.html)' >> $GITHUB_STEP_SUMMARY From 06b94d514dc2224e584771378608b8b00e865da1 Mon Sep 17 00:00:00 2001 From: Maxim Yurchuk Date: Mon, 3 Jun 2024 17:02:35 +0000 Subject: [PATCH 14/16] wip --- ydb/ci/build_bloat/main.py | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/ydb/ci/build_bloat/main.py b/ydb/ci/build_bloat/main.py index 0cba745fc9f2..2a0baba340c5 100755 --- a/ydb/ci/build_bloat/main.py +++ b/ydb/ci/build_bloat/main.py @@ -9,9 +9,8 @@ HEADER_COMPILE_TIME_TO_SHOW = 0.5 # sec - def sanitize_path(path: str, base_src_dir: str) -> str: - + home_dir = os.environ["HOME"] ya_build_path_chunk = ".ya/build/build_root" ya_tools_path_chunk = ".ya/tools" if ya_build_path_chunk in path: @@ -34,6 +33,7 @@ def sanitize_path(path: str, base_src_dir: str) -> str: if not base_src_dir.endswith("/"): base_src_dir += "/" path = path.removeprefix(base_src_dir) + path = path.removeprefix(home_dir) return "src/" + path @@ -201,13 +201,13 @@ def generate_cpp_bloat(build_output_dir: str, result_dir: str, base_src_dir: str return tree -def parse_includes(path: str, base_src_dir: str) -> tuple[list[tuple[int, str]], dict]: - print("Processing includes in {}".format(path)) +def parse_includes(trace_path: str, base_src_dir: str) -> tuple[list[tuple[int, str]], dict]: + print("Processing includes in {}".format(trace_path)) - with open(path) as f: + with open(trace_path) as f: obj = json.load(f) - cpp_file = "N\\A" + cpp_file = None include_events = [] # (time, +-1, path) for event in obj["traceEvents"]: @@ -221,13 +221,16 @@ def parse_includes(path: str, base_src_dir: str) -> tuple[list[tuple[int, str]], if event["name"] == "OptModule": cpp_file = event["args"]["detail"] - + include_events.sort(key=lambda event: (event[0], -event[1])) path_to_time = {} current_includes_stack = [(cpp_file, 0)] last_time_stamp = 0 time_parts = {} # header/cpp -> (header -> (cnt, total time)) + if cpp_file is None: + print("Can't determine cpp file for {}".format(trace_path)) + return path_to_time, time_parts for time_stamp, ev, path in include_events: if current_includes_stack: From 743d47e85208c0e6af06662918fbd1e6f82edd97 Mon Sep 17 00:00:00 2001 From: Maxim Yurchuk Date: Tue, 4 Jun 2024 08:40:23 +0000 Subject: [PATCH 15/16] rename --- ydb/ci/build_bloat/main.py | 64 +++++++++++++++++++------------------- 1 file changed, 32 insertions(+), 32 deletions(-) diff --git a/ydb/ci/build_bloat/main.py b/ydb/ci/build_bloat/main.py index 2a0baba340c5..f8c10e231816 100755 --- a/ydb/ci/build_bloat/main.py +++ b/ydb/ci/build_bloat/main.py @@ -227,10 +227,10 @@ def parse_includes(trace_path: str, base_src_dir: str) -> tuple[list[tuple[int, path_to_time = {} current_includes_stack = [(cpp_file, 0)] last_time_stamp = 0 - time_parts = {} # header/cpp -> (header -> (cnt, total time)) + time_breakdown = {} # header/cpp -> (header -> (cnt, total time)) if cpp_file is None: print("Can't determine cpp file for {}".format(trace_path)) - return path_to_time, time_parts + return path_to_time, time_breakdown for time_stamp, ev, path in include_events: if current_includes_stack: @@ -245,50 +245,50 @@ def parse_includes(trace_path: str, base_src_dir: str) -> tuple[list[tuple[int, assert current_path == path current_includes_stack.pop() parent_path = current_includes_stack[-1][0] - if parent_path not in time_parts: - time_parts[parent_path] = {} + if parent_path not in time_breakdown: + time_breakdown[parent_path] = {} - if current_path not in time_parts[parent_path]: - time_parts[parent_path][current_path] = [0, 0] + if current_path not in time_breakdown[parent_path]: + time_breakdown[parent_path][current_path] = [0, 0] - time_parts[parent_path][current_path][0] += 1 - time_parts[parent_path][current_path][1] += (time_stamp - include_ts) / 1000 / 1000 - + time_breakdown[parent_path][current_path][0] += 1 + time_breakdown[parent_path][current_path][1] += (time_stamp - include_ts) / 1000 / 1000 + last_time_stamp = time_stamp - return path_to_time, time_parts + return path_to_time, time_breakdown def generate_header_bloat(build_output_dir: str, result_dir: str, base_src_dir: str) -> dict: time_trace_paths = gather_time_traces(build_output_dir) path_to_stat = {} # header path -> (total_duration, count) - total_time_parts = {} # header/cpp path -> (header -> (inclusion count, time spend) ) + total_time_breakdown = {} # header/cpp path -> (header -> (inclusion count, time spend) ) with ProcessPoolExecutor() as executor: fn = partial(parse_includes, base_src_dir=base_src_dir) res = executor.map(fn, time_trace_paths) - for path_to_time, time_parts in res: + for path_to_time, time_breakdown in res: for path, duration in path_to_time.items(): if path not in path_to_stat: path_to_stat[path] = [0, 0] path_to_stat[path][0] += duration path_to_stat[path][1] += 1 - for path in time_parts: - if path not in total_time_parts: - total_time_parts[path] = {} + for path in time_breakdown: + if path not in total_time_breakdown: + total_time_breakdown[path] = {} - for subpath in time_parts[path]: - if subpath not in total_time_parts[path]: - total_time_parts[path][subpath] = [0, 0] + for subpath in time_breakdown[path]: + if subpath not in total_time_breakdown[path]: + total_time_breakdown[path][subpath] = [0, 0] - total_time_parts[path][subpath][0] += time_parts[path][subpath][0] - total_time_parts[path][subpath][1] += time_parts[path][subpath][1] + total_time_breakdown[path][subpath][0] += time_breakdown[path][subpath][0] + total_time_breakdown[path][subpath][1] += time_breakdown[path][subpath][1] - for path in total_time_parts: + for path in total_time_breakdown: print("*** {}".format(path)) - for subpath in total_time_parts[path]: - count, total_time_ms = total_time_parts[path][subpath] + for subpath in total_time_breakdown[path]: + count, total_time_ms = total_time_breakdown[path][subpath] print(" {} -> total {:.2f}s (included {} times)".format(subpath, total_time_ms, count)) print("") @@ -316,24 +316,24 @@ def generate_header_bloat(build_output_dir: str, result_dir: str, base_src_dir: "mean_compilation_time_s": duration / cnt, }) - time_parts = {} + time_breakdown = {} - for path in total_time_parts: - time_part = [] - for subpath in total_time_parts[path]: - inclusion_count, total_s = total_time_parts[path][subpath] - time_part.append({ + for path in total_time_breakdown: + one_file_breakdown = [] + for subpath in total_time_breakdown[path]: + inclusion_count, total_s = total_time_breakdown[path][subpath] + one_file_breakdown.append({ "path": subpath, "inclusion_count": inclusion_count, "total_time_s": total_s, }) - time_part.sort(key=lambda val: -val["total_time_s"]) - time_parts[path] = time_part + one_file_breakdown.sort(key=lambda val: -val["total_time_s"]) + time_breakdown[path] = one_file_breakdown human_readable_output = { "headers_compile_duration": headers_compile_duration, - "time_parts": time_parts, + "time_breakdown": time_breakdown, } with open(os.path.join(result_dir, "output.json"), "w") as f: From d0bbe8b8dfa744127e182840949ec27b2bc80301 Mon Sep 17 00:00:00 2001 From: Maxim Yurchuk Date: Tue, 4 Jun 2024 13:17:32 +0000 Subject: [PATCH 16/16] more --- ydb/ci/build_bloat/main.py | 54 +++++++++++++++++----------- ydb/ci/build_bloat/ydb_upload.py | 60 ++++++++++++++++++++++++++++++-- 2 files changed, 92 insertions(+), 22 deletions(-) diff --git a/ydb/ci/build_bloat/main.py b/ydb/ci/build_bloat/main.py index f8c10e231816..317bb7c12ba3 100755 --- a/ydb/ci/build_bloat/main.py +++ b/ydb/ci/build_bloat/main.py @@ -9,6 +9,7 @@ HEADER_COMPILE_TIME_TO_SHOW = 0.5 # sec + def sanitize_path(path: str, base_src_dir: str) -> str: home_dir = os.environ["HOME"] ya_build_path_chunk = ".ya/build/build_root" @@ -121,7 +122,6 @@ def build_include_tree(path: str, build_output_dir: str, base_src_dir: str) -> l result = [] for time_stamp, ev, path, duration in include_events: - if current_includes_stack: last_path = current_includes_stack[-1] prev = path_to_time.get(last_path, 0) @@ -222,22 +222,34 @@ def parse_includes(trace_path: str, base_src_dir: str) -> tuple[list[tuple[int, if event["name"] == "OptModule": cpp_file = event["args"]["detail"] - include_events.sort(key=lambda event: (event[0], -event[1])) - path_to_time = {} - current_includes_stack = [(cpp_file, 0)] last_time_stamp = 0 time_breakdown = {} # header/cpp -> (header -> (cnt, total time)) + if cpp_file is None: print("Can't determine cpp file for {}".format(trace_path)) return path_to_time, time_breakdown + include_events.sort(key=lambda event: (event[0], -event[1])) + + cpp_file = sanitize_path(cpp_file, base_src_dir) + current_includes_stack = [(cpp_file, 0)] for time_stamp, ev, path in include_events: if current_includes_stack: last_path, _ = current_includes_stack[-1] prev = path_to_time.get(last_path, 0) path_to_time[last_path] = prev + (time_stamp - last_time_stamp) / 1000 / 1000 + # add compile breakdown for itself + if last_path not in time_breakdown: + time_breakdown[last_path] = {} + + if last_path not in time_breakdown[last_path]: + time_breakdown[last_path][last_path] = [0, 0] + + time_breakdown[last_path][last_path][0] = 1 # NB: just 1 + time_breakdown[last_path][last_path][1] += (time_stamp - last_time_stamp) / 1000 / 1000 + if ev == 1: current_includes_stack.append((path, time_stamp)) else: @@ -247,13 +259,13 @@ def parse_includes(trace_path: str, base_src_dir: str) -> tuple[list[tuple[int, parent_path = current_includes_stack[-1][0] if parent_path not in time_breakdown: time_breakdown[parent_path] = {} - + if current_path not in time_breakdown[parent_path]: time_breakdown[parent_path][current_path] = [0, 0] - + time_breakdown[parent_path][current_path][0] += 1 time_breakdown[parent_path][current_path][1] += (time_stamp - include_ts) / 1000 / 1000 - + last_time_stamp = time_stamp return path_to_time, time_breakdown @@ -284,7 +296,7 @@ def generate_header_bloat(build_output_dir: str, result_dir: str, base_src_dir: total_time_breakdown[path][subpath][0] += time_breakdown[path][subpath][0] total_time_breakdown[path][subpath][1] += time_breakdown[path][subpath][1] - + for path in total_time_breakdown: print("*** {}".format(path)) for subpath in total_time_breakdown[path]: @@ -292,7 +304,6 @@ def generate_header_bloat(build_output_dir: str, result_dir: str, base_src_dir: print(" {} -> total {:.2f}s (included {} times)".format(subpath, total_time_ms, count)) print("") - result = [] for path, (duration, cnt) in path_to_stat.items(): @@ -310,11 +321,13 @@ def generate_header_bloat(build_output_dir: str, result_dir: str, base_src_dir: chunks = list(zip(path_chunks, (path_chunks_count - 1) * ["dir"] + ["h"])) add_to_tree(chunks, int(duration * 1000), tree) print("{} -> {:.2f}s (aggregated {} times)".format(path, duration, cnt)) - headers_compile_duration.append({ - "path": path, - "inclusion_count": cnt, - "mean_compilation_time_s": duration / cnt, - }) + headers_compile_duration.append( + { + "path": path, + "inclusion_count": cnt, + "mean_compilation_time_s": duration / cnt, + } + ) time_breakdown = {} @@ -322,15 +335,16 @@ def generate_header_bloat(build_output_dir: str, result_dir: str, base_src_dir: one_file_breakdown = [] for subpath in total_time_breakdown[path]: inclusion_count, total_s = total_time_breakdown[path][subpath] - one_file_breakdown.append({ - "path": subpath, - "inclusion_count": inclusion_count, - "total_time_s": total_s, - }) + one_file_breakdown.append( + { + "path": subpath, + "inclusion_count": inclusion_count, + "total_time_s": total_s, + } + ) one_file_breakdown.sort(key=lambda val: -val["total_time_s"]) time_breakdown[path] = one_file_breakdown - human_readable_output = { "headers_compile_duration": headers_compile_duration, "time_breakdown": time_breakdown, diff --git a/ydb/ci/build_bloat/ydb_upload.py b/ydb/ci/build_bloat/ydb_upload.py index c4f235c68dde..0ac50cb10a3a 100755 --- a/ydb/ci/build_bloat/ydb_upload.py +++ b/ydb/ci/build_bloat/ydb_upload.py @@ -29,15 +29,23 @@ "id", "git_commit_message", "path", + "sub_path", ] DATETIME_COLUMNS = [ "git_commit_time", ] -UINT64_COLUMNS = [] +UINT64_COLUMNS = [ + "inclusion_count", +] -DOUBLE_COLUMNS = ["total_compilation_time_s", "compilation_time_s"] +DOUBLE_COLUMNS = [ + "total_compilation_time_s", + "compilation_time_s", + "mean_compilation_time_s", + "total_time_s", +] ALL_COLUMNS = UTF8_COLUMNS + DATETIME_COLUMNS + UINT64_COLUMNS @@ -125,8 +133,12 @@ def main(): with open(os.path.join(args.html_dir_cpp, "output.json")) as f: cpp_stats = json.load(f) + with open(os.path.join(args.html_dir_headers, "output.json")) as f: + header_stats = json.load(f) + rows = [] + # upload into cpp_compile_time for entry in cpp_stats["cpp_compilation_times"]: path = entry["path"] time_s = entry["time_s"] @@ -142,6 +154,7 @@ def main(): DATABASE_PATH + "/code-agility/cpp_compile_time", rows, generate_column_types(row) ) + # upload into total_compile_time row = copy.copy(common_parameters) row["id"] = str(uuid.uuid4()) row["total_compilation_time_s"] = cpp_stats["total_compilation_time"] @@ -150,6 +163,49 @@ def main(): DATABASE_PATH + "/code-agility/total_compile_time", [row], generate_column_types(row) ) + # upload into headers_impact + rows = [] + for entry in header_stats["headers_compile_duration"]: + path = entry["path"] + inclusion_count = entry["inclusion_count"] + mean_compilation_time_s = entry["mean_compilation_time_s"] + row = copy.copy(common_parameters) + row["id"] = str(uuid.uuid4()) + row["path"] = sanitize_str(path) + row["mean_compilation_time_s"] = mean_compilation_time_s + row["inclusion_count"] = inclusion_count + rows.append(copy.copy(row)) + + if rows: + row = rows[0] + driver.table_client.bulk_upsert( + DATABASE_PATH + "/code-agility/headers_impact", rows, generate_column_types(row) + ) + + # upload into compile_breakdown + rows = [] + for path in header_stats["time_breakdown"]: + entry = header_stats["time_breakdown"][path] + for sub_entry in entry: + sub_path = sub_entry["path"] + inclusion_count = sub_entry["inclusion_count"] + total_time_s = sub_entry["total_time_s"] + + row = copy.copy(common_parameters) + row["id"] = str(uuid.uuid4()) + row["path"] = path + row["sub_path"] = sub_path + row["inclusion_count"] = inclusion_count + row["total_time_s"] = total_time_s + + rows.append(copy.copy(row)) + + if rows: + row = rows[0] + driver.table_client.bulk_upsert( + DATABASE_PATH + "/code-agility/compile_breakdown", rows, generate_column_types(row) + ) + if __name__ == "__main__": exit(main())