From 862bfe04b7104f1c8fe26832a3a3812ff787ecd5 Mon Sep 17 00:00:00 2001
From: gmega
Date: Thu, 9 Jan 2025 19:03:55 -0300
Subject: [PATCH] feat: add split log source command to CLI

---
 benchmarks/cli.py | 73 +++++++++++++++++++++++++++++++++++++++++------
 1 file changed, 65 insertions(+), 8 deletions(-)

diff --git a/benchmarks/cli.py b/benchmarks/cli.py
index 5691445..7030f3d 100644
--- a/benchmarks/cli.py
+++ b/benchmarks/cli.py
@@ -16,6 +16,11 @@
 )
 from benchmarks.deluge.config import DelugeExperimentConfig
 from benchmarks.deluge.logging import DelugeTorrentDownload
+from benchmarks.logging.sources import (
+    VectorFlatFileSource,
+    FSOutputManager,
+    split_logs_in_source,
+)
 
 config_parser = ConfigParser()
 config_parser.register(DelugeExperimentConfig)
@@ -55,7 +60,7 @@ def cmd_describe(args):
     print(config_parser.experiment_types[args.type].schema_json(indent=2))
 
 
-def cmd_logs(log: Path, output: Path):
+def cmd_parse_single_log(log: Path, output: Path):
     if not log.exists():
         print(f"Log file {log} does not exist.")
         sys.exit(-1)
@@ -77,6 +82,31 @@ def output_factory(event_type: str, format: LogSplitterFormats):
         splitter.split(log_parser.parse(istream))
 
 
+def cmd_parse_log_source(group_id: str, source_file: Path, output_dir: Path):
+    if not source_file.exists():
+        print(f"Log source file {source_file} does not exist.")
+        sys.exit(-1)
+
+    if not output_dir.parent.exists():
+        print(f"Folder {output_dir.parent} does not exist.")
+        sys.exit(-1)
+
+    output_dir.mkdir(exist_ok=True)
+
+    with (
+        source_file.open("r", encoding="utf-8") as istream,
+        FSOutputManager(output_dir) as output_manager,
+    ):
+        log_source = VectorFlatFileSource(app_name="codex-benchmarks", file=istream)
+        split_logs_in_source(
+            log_source,
+            log_parser,
+            output_manager,
+            group_id,
+            formats=[(DECLogEntry, LogSplitterFormats.jsonl)],
+        )
+
+
 def _parse_config(config: Path) -> Dict[str, ExperimentBuilder[Experiment]]:
     if not config.exists():
         print(f"Config file {config} does not exist.")
@@ -123,10 +153,10 @@ def main():
     run_cmd.add_argument("experiment", type=str, help="Name of the experiment to run.")
     run_cmd.set_defaults(func=lambda args: cmd_run(_parse_config(args.config), args))
 
-    describe = commands.add_parser(
+    describe_cmd = commands.add_parser(
         "describe", help="Shows the JSON schema for the various experiment types."
     )
-    describe.add_argument(
+    describe_cmd.add_argument(
         "type",
         type=str,
         help="Type of the experiment to describe.",
@@ -134,12 +164,39 @@ def main():
         nargs="?",
     )
 
-    describe.set_defaults(func=cmd_describe)
+    describe_cmd.set_defaults(func=cmd_describe)
 
-    logs = commands.add_parser("logs", help="Parse logs.")
-    logs.add_argument("log", type=Path, help="Path to the log file.")
-    logs.add_argument("output_dir", type=Path, help="Path to an output folder.")
-    logs.set_defaults(func=lambda args: cmd_logs(args.log, args.output_dir))
+    logs_cmd = commands.add_parser("logs", help="Parse logs.")
+    log_subcommands = logs_cmd.add_subparsers(required=True)
+
+    single_log_cmd = log_subcommands.add_parser(
+        "single", help="Parse a single log file."
+    )
+    single_log_cmd.add_argument("log", type=Path, help="Path to the log file.")
+    single_log_cmd.add_argument(
+        "output_dir", type=Path, help="Path to an output folder."
+    )
+    single_log_cmd.set_defaults(
+        func=lambda args: cmd_parse_single_log(args.log, args.output_dir)
+    )
+
+    log_source_cmd = log_subcommands.add_parser(
+        "source", help="Parse logs from a log source."
+    )
+    log_source_cmd.add_argument(
+        "source_file", type=Path, help="Vector log file to parse from."
+    )
+    log_source_cmd.add_argument(
+        "output_dir", type=Path, help="Path to an output folder."
+    )
+    log_source_cmd.add_argument(
+        "group_id", type=str, help="ID of experiment group to parse."
+    )
+    log_source_cmd.set_defaults(
+        func=lambda args: cmd_parse_log_source(
+            args.group_id, args.source_file, args.output_dir
+        )
+    )
 
     args = parser.parse_args()
 
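Usage sketch for the reworked "logs" command. The entry point shown below
(`python -m benchmarks.cli`) is an assumption about how cli.py is normally
invoked; the positional argument order follows the add_argument calls in this
patch, and the file names are placeholders:

    # parse a single raw log file into per-event-type outputs
    python -m benchmarks.cli logs single <log_file> <output_dir>

    # split a Vector flat-file log source for one experiment group
    python -m benchmarks.cli logs source <source_file> <output_dir> <group_id>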