Skip to content

Commit

Permalink
Add SQLFLUFF linter for clickhouse queries (#6209)
Browse files Browse the repository at this point in the history
Add a SQLFluff linter that uses the ClickHouse dialect for linting
ClickHouse queries.
  • Loading branch information
Camyll authored Jan 25, 2025
1 parent cd68c38 commit f4c813d
Show file tree
Hide file tree
Showing 4 changed files with 186 additions and 8 deletions.
19 changes: 19 additions & 0 deletions .lintrunner.toml
Original file line number Diff line number Diff line change
Expand Up @@ -357,6 +357,25 @@ init_command = [
]
is_formatter = true

[[linter]]
code = 'SQLFLUFF'
# TODO: broaden to all ClickHouse queries once they pass the linter:
# include_patterns = ['torchci/clickhouse_queries/**/*.sql']
include_patterns = ['torchci/clickhouse_queries/workflow_load/query.sql']
exclude_patterns = [
]
command = [
'python3',
'tools/linter/adapters/sqlfluff_linter.py',
'@{{PATHSFILE}}',
]
init_command = [
'python3',
'tools/linter/adapters/pip_init.py',
'--dry-run={{DRYRUN}}',
'sqlfluff==3.3.0',
]
is_formatter = true

[[linter]]
code = 'RUSTFMT'
include_patterns = ['**/*.rs']
Expand Down
2 changes: 2 additions & 0 deletions .sqlfluff
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
[sqlfluff]
exclude_rules = capitalisation.functions
156 changes: 156 additions & 0 deletions tools/linter/adapters/sqlfluff_linter.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,156 @@
import argparse
import concurrent.futures
import json
import logging
import os
import re
import subprocess
import tempfile
import time
from enum import Enum
from typing import List, NamedTuple, Optional, Pattern

from isort.api import _tmp_file


# Linter identifier reported in every LintMessage; must match the
# `code = 'SQLFLUFF'` entry in .lintrunner.toml.
LINTER_CODE = "SQLFLUFF"


class LintSeverity(str, Enum):
    """Severity levels understood by the lintrunner JSON protocol."""

    ERROR = "error"
    WARNING = "warning"
    ADVICE = "advice"
    DISABLED = "disabled"


class LintMessage(NamedTuple):
    """A single lint result, emitted as one JSON object (via ``_asdict``)."""

    # Path of the offending file; None for tool-level failures.
    path: Optional[str]
    line: Optional[int]
    char: Optional[int]
    # Linter identifier (LINTER_CODE).
    code: str
    severity: LintSeverity
    # Short rule/category name, e.g. "format" or "command-failed".
    name: str
    # Full original file contents, when a replacement is suggested.
    original: Optional[str]
    # Proposed new file contents; lintrunner can apply this as a fix.
    replacement: Optional[str]
    description: Optional[str]


RESULTS_RE: Pattern[str] = re.compile(
r"""(?mx)
^
(?P<file>.*?):
(?P<line>\d+):
(?P<char>\d+):
\s(?P<message>.*)
\s(?P<code>\[.*\])
$
"""
)


def run_command(
    args: List[str],
) -> "subprocess.CompletedProcess[bytes]":
    """Run *args* as a subprocess with stdout/stderr captured as bytes.

    The command line is logged at debug level before execution, and the
    wall-clock duration is logged afterwards even if the call raises.
    """
    logging.debug("$ %s", " ".join(args))
    began = time.monotonic()
    try:
        return subprocess.run(args, capture_output=True)
    finally:
        elapsed_ms = (time.monotonic() - began) * 1000
        logging.debug("took %dms", elapsed_ms)


def check_file(
    filename: str,
) -> List[LintMessage]:
    """Run ``sqlfluff format`` on *filename* and report a suggested rewrite.

    ClickHouse query parameters such as ``{repo: String}`` are not valid
    SQL, so braces are quoted (``'{...}'``) before formatting and unquoted
    afterwards.

    Returns an empty list when the file is already formatted, a single
    WARNING message carrying the formatted text as ``replacement`` when it
    is not, or a single ERROR message if sqlfluff could not be executed.
    """
    with open(filename, "r") as f:
        original = f.read()
    # Quote ClickHouse query parameters so sqlfluff can parse the file.
    original_edited = original.replace("{", "'{").replace("}", "}'")

    # mkstemp + explicit unlink is portable: NamedTemporaryFile cannot be
    # reopened by name on Windows while the original handle is still open.
    fd, tmp_path = tempfile.mkstemp(suffix=".sql")
    try:
        with os.fdopen(fd, "w") as tmp:
            tmp.write(original_edited)
        try:
            proc = run_command(
                [
                    "sqlfluff",
                    "format",
                    "--dialect",
                    "clickhouse",
                    tmp_path,
                ]
            )
        except OSError as err:
            # sqlfluff missing or not executable — report, don't crash.
            return [
                LintMessage(
                    path=None,
                    line=None,
                    char=None,
                    code=LINTER_CODE,
                    severity=LintSeverity.ERROR,
                    name="command-failed",
                    original=None,
                    replacement=None,
                    description=(f"Failed due to {err.__class__.__name__}:\n{err}"),
                )
            ]

        with open(tmp_path, "r") as f:
            # Undo the brace quoting applied above.
            replacement = f.read().replace("'{", "{").replace("}'", "}")
    finally:
        os.unlink(tmp_path)

    if original == replacement:
        return []
    # NOTE(review): proc.returncode is deliberately not checked here — the
    # formatted-file diff is the signal and sqlfluff's stdout is surfaced as
    # the description. Confirm unfixable violations are acceptable to skip.
    return [
        LintMessage(
            path=filename,
            line=None,
            char=None,
            code=LINTER_CODE,
            severity=LintSeverity.WARNING,
            name="format",
            original=original,
            replacement=replacement,
            description=proc.stdout.decode("utf-8"),
        )
    ]


def main() -> None:
    """Lint the files named on the command line, printing JSON results.

    lintrunner invokes this adapter with an ``@pathsfile`` argument (one
    path per line, enabled by ``fromfile_prefix_chars``). Each lint
    message is printed as a single JSON object per line on stdout.
    """
    parser = argparse.ArgumentParser(
        # Plain string: was an f-string with no placeholders (F541).
        description="sqlfluff format linter for sql queries.",
        fromfile_prefix_chars="@",
    )
    parser.add_argument(
        "filenames",
        nargs="+",
        help="paths to lint",
    )

    args = parser.parse_args()

    # Files are independent, so lint them in parallel on a thread pool.
    with concurrent.futures.ThreadPoolExecutor(
        max_workers=os.cpu_count(),
        thread_name_prefix="Thread",
    ) as executor:
        futures = {
            executor.submit(
                check_file,
                filename,
            ): filename
            for filename in args.filenames
        }
        for future in concurrent.futures.as_completed(futures):
            try:
                for lint_message in future.result():
                    print(json.dumps(lint_message._asdict()), flush=True)
            except Exception:
                # Identify which file failed before propagating.
                logging.critical('Failed at "%s".', futures[future])
                raise


# Entry point when lintrunner executes this adapter as a script.
if __name__ == "__main__":
    main()
17 changes: 9 additions & 8 deletions torchci/clickhouse_queries/workflow_load/query.sql
Original file line number Diff line number Diff line change
Expand Up @@ -4,18 +4,19 @@ SELECT
workflow.created_at
) AS granularity_bucket,
workflow.name,
COUNT(*) as count
COUNT(*) AS count
FROM
default.workflow_run workflow final
default.workflow_run workflow FINAL
WHERE
-- optimization to make query faster
workflow.id in (
select id from materialized_views.workflow_run_by_created_at
where created_at >= {startTime: DateTime64(9)}
AND created_at <= {stopTime: DateTime64(9)}
workflow.id IN (
SELECT id FROM materialized_views.workflow_run_by_created_at
WHERE
created_at >= {startTime: DateTime64(9)}
AND created_at <= {stopTime: DateTime64(9)}
)
-- re check for final
and workflow.created_at >= {startTime: DateTime64(9)}
AND workflow.created_at >= {startTime: DateTime64(9)}
AND workflow.created_at <= {stopTime: DateTime64(9)}
AND workflow.name IN (
'pull',
Expand All @@ -30,7 +31,7 @@ WHERE
'rocm',
'inductor-rocm'
)
AND workflow.repository.'full_name' like {repo: String}
AND workflow.repository.'full_name' LIKE {repo: String}
GROUP BY
granularity_bucket,
workflow.name
Expand Down

0 comments on commit f4c813d

Please sign in to comment.