From 6639930d0e303ec808f1cf85ae086ecfed666e41 Mon Sep 17 00:00:00 2001 From: Ted Conbeer Date: Fri, 22 Nov 2024 14:14:23 -0700 Subject: [PATCH 1/3] fix: support databricks type hint comments --- src/sqlfmt/comment.py | 14 ++- src/sqlfmt/merger.py | 6 ++ .../unformatted/134_databricks_type_hints.sql | 102 ++++++++++++++++++ .../test_general_formatting.py | 1 + 4 files changed, 120 insertions(+), 3 deletions(-) create mode 100644 tests/data/unformatted/134_databricks_type_hints.sql diff --git a/src/sqlfmt/comment.py b/src/sqlfmt/comment.py index 165d7f2..a995bcd 100644 --- a/src/sqlfmt/comment.py +++ b/src/sqlfmt/comment.py @@ -24,14 +24,18 @@ def __str__(self) -> str: without preceding whitespace, with a single space between the marker and the comment text. """ - if self.is_multiline or self.formatting_disabled: - return f"{self.token.token}" + if ( + self.is_multiline + or self.formatting_disabled + or self.is_databricks_type_hint + ): + return self.token.token else: marker, comment_text = self._comment_parts() if comment_text: return f"{marker} {comment_text}" else: - return f"{marker}" + return marker def __len__(self) -> int: return len(str(self)) @@ -85,6 +89,10 @@ def is_multiline(self) -> bool: def is_c_style(self) -> bool: return self.token.token.startswith("/*") + @property + def is_databricks_type_hint(self) -> bool: + return self.token.token.startswith("/*+") + @property def is_inline(self) -> bool: return not self.is_standalone and not self.is_multiline and not self.is_c_style diff --git a/src/sqlfmt/merger.py b/src/sqlfmt/merger.py index 643676d..9736ce1 100644 --- a/src/sqlfmt/merger.py +++ b/src/sqlfmt/merger.py @@ -77,6 +77,12 @@ def _extract_components( raise CannotMergeException( "Can't merge lines with inline comments and other comments" ) + elif any( + [comment.is_databricks_type_hint for comment in line.comments] + ): + raise CannotMergeException( + "Can't merge lines with a databricks type hint comment" + ) elif ( len(line.comments) == 1 and len(line.nodes) > 1 diff --git a/tests/data/unformatted/134_databricks_type_hints.sql b/tests/data/unformatted/134_databricks_type_hints.sql new file mode 100644 index 0000000..751651b --- /dev/null +++ b/tests/data/unformatted/134_databricks_type_hints.sql @@ -0,0 +1,102 @@ +-- see: https://github.com/tconbeer/sqlfmt/issues/639 +-- source: https://docs.databricks.com/en/sql/language-manual/sql-ref-syntax-qry-select-hints.html +SELECT /*+ COALESCE(3) */ * FROM t; +SELECT /*+ REPARTITION(3) */ * FROM t; +SELECT /*+ REPARTITION(c) */ * FROM t; +SELECT /*+ REPARTITION(3, c) */ * FROM t; +SELECT /*+ REPARTITION_BY_RANGE(c) */ * FROM t; +SELECT /*+ REPARTITION_BY_RANGE(3, c) */ * FROM t; +SELECT /*+ BROADCAST(t1) */ * FROM t1 INNER JOIN t2 ON t1.key = t2.key; +SELECT /*+ BROADCASTJOIN (t1) */ * FROM t1 left JOIN t2 ON t1.key = t2.key; +SELECT /*+ MAPJOIN(t2) */ * FROM t1 right JOIN t2 ON t1.key = t2.key; + +-- Join Hints for shuffle sort merge join +SELECT /*+ SHUFFLE_MERGE(t1) */ * FROM t1 INNER JOIN t2 ON t1.key = t2.key; +SELECT /*+ MERGEJOIN(t2) */ * FROM t1 INNER JOIN t2 ON t1.key = t2.key; +SELECT /*+ MERGE(t1) */ * FROM t1 INNER JOIN t2 ON t1.key = t2.key; + +-- Join Hints for shuffle hash join +SELECT /*+ SHUFFLE_HASH(t1) */ * FROM t1 INNER JOIN t2 ON t1.key = t2.key; + +-- Join Hints for shuffle-and-replicate nested loop join +SELECT /*+ SHUFFLE_REPLICATE_NL(t1) */ * FROM t1 INNER JOIN t2 ON t1.key = t2.key; +SELECT /*+ BROADCAST(t1), MERGE(t1, t2) */ * FROM t1 INNER JOIN t2 ON t1.key = t2.key; +)))))__SQLFMT_OUTPUT__((((( +-- see: https://github.com/tconbeer/sqlfmt/issues/639 +-- source: +-- https://docs.databricks.com/en/sql/language-manual/sql-ref-syntax-qry-select-hints.html +select /*+ COALESCE(3) */ + * +from t +; +select /*+ REPARTITION(3) */ + * +from t +; +select /*+ REPARTITION(c) */ + * +from t +; +select /*+ REPARTITION(3, c) */ + * +from t +; +select /*+ REPARTITION_BY_RANGE(c) */ + * +from t +; +select /*+ REPARTITION_BY_RANGE(3, c) */ + * +from t +; +select /*+ BROADCAST(t1) */ + * +from t1 +inner join t2 on t1.key = t2.key +; +select /*+ BROADCASTJOIN (t1) */ + * +from t1 +left join t2 on t1.key = t2.key +; +select /*+ MAPJOIN(t2) */ + * +from t1 +right join t2 on t1.key = t2.key +; + +-- Join Hints for shuffle sort merge join +select /*+ SHUFFLE_MERGE(t1) */ + * +from t1 +inner join t2 on t1.key = t2.key +; +select /*+ MERGEJOIN(t2) */ + * +from t1 +inner join t2 on t1.key = t2.key +; +select /*+ MERGE(t1) */ + * +from t1 +inner join t2 on t1.key = t2.key +; + +-- Join Hints for shuffle hash join +select /*+ SHUFFLE_HASH(t1) */ + * +from t1 +inner join t2 on t1.key = t2.key +; + +-- Join Hints for shuffle-and-replicate nested loop join +select /*+ SHUFFLE_REPLICATE_NL(t1) */ + * +from t1 +inner join t2 on t1.key = t2.key +; +select /*+ BROADCAST(t1), MERGE(t1, t2) */ + * +from t1 +inner join t2 on t1.key = t2.key +; diff --git a/tests/functional_tests/test_general_formatting.py b/tests/functional_tests/test_general_formatting.py index 85138c2..fa2d0ac 100644 --- a/tests/functional_tests/test_general_formatting.py +++ b/tests/functional_tests/test_general_formatting.py @@ -53,6 +53,7 @@ "unformatted/131_assignment_statement.sql", "unformatted/132_spark_number_literals.sql", "unformatted/133_for_else.sql", + "unformatted/134_databricks_type_hints.sql", "unformatted/200_base_model.sql", "unformatted/201_basic_snapshot.sql", "unformatted/202_unpivot_macro.sql", From c5810c35fc90558ebdf6dee89ff8caa5b816097d Mon Sep 17 00:00:00 2001 From: Ted Conbeer Date: Fri, 22 Nov 2024 14:16:30 -0700 Subject: [PATCH 2/3] chore: update changelog --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index a3dc4b6..dbc7f80 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -8,6 +8,7 @@ All notable changes to this project will be documented in this file. - sqlfmt no longer adds a space between the function name and parens for `filter()` and `isnull()` (but it also permits `filter ()` and `isnull ()` to support dialects where those are operators, not function names) ([#641](https://github.com/tconbeer/sqlfmt/issues/641) - thank you [@williamscs](https://github.com/williamscs) and [@hongtron](https://github.com/hongtron)!). - sqlfmt now supports Spark type-hinted numeric literals like `32y` and `+3.2e6bd` and will not introduce a space between the digits and their type suffix ([#640](https://github.com/tconbeer/sqlfmt/issues/640) - thank you [@ShaneMazur](https://github.com/ShaneMazur)!). +- sqlfmt now supports Databricks query hint comments like `/*+ COALESCE(3) */` ([#639](https://github.com/tconbeer/sqlfmt/issues/639) - thank you [@wr-atlas](https://github.com/wr-atlas)!). ## [0.23.3] - 2024-11-12 From 551f31e8fcd294e99ceb83ff7b340bde92a59349 Mon Sep 17 00:00:00 2001 From: Ted Conbeer Date: Fri, 22 Nov 2024 14:33:13 -0700 Subject: [PATCH 3/3] fix: add unit test for coverage --- src/sqlfmt/comment.py | 4 +-- src/sqlfmt/merger.py | 2 +- .../test_no_merge_databricks_query_hints.sql | 5 +++ tests/unit_tests/test_comment.py | 31 +++++++++++++++++++ tests/unit_tests/test_merger.py | 12 +++++++ 5 files changed, 51 insertions(+), 3 deletions(-) create mode 100644 tests/data/unit_tests/test_merger/test_no_merge_databricks_query_hints.sql diff --git a/src/sqlfmt/comment.py b/src/sqlfmt/comment.py index a995bcd..5afc3d6 100644 --- a/src/sqlfmt/comment.py +++ b/src/sqlfmt/comment.py @@ -27,7 +27,7 @@ def __str__(self) -> str: if ( self.is_multiline or self.formatting_disabled - or self.is_databricks_type_hint + or self.is_databricks_query_hint ): return self.token.token else: @@ -90,7 +90,7 @@ def is_c_style(self) -> bool: return self.token.token.startswith("/*") @property - def is_databricks_type_hint(self) -> bool: + def is_databricks_query_hint(self) -> bool: return self.token.token.startswith("/*+") @property diff --git a/src/sqlfmt/merger.py b/src/sqlfmt/merger.py index 9736ce1..b2916c9 100644 --- a/src/sqlfmt/merger.py +++ b/src/sqlfmt/merger.py @@ -78,7 +78,7 @@ def _extract_components( "Can't merge lines with inline comments and other comments" ) elif any( - [comment.is_databricks_type_hint for comment in line.comments] + [comment.is_databricks_query_hint for comment in line.comments] ): raise CannotMergeException( "Can't merge lines with a databricks type hint comment" diff --git a/tests/data/unit_tests/test_merger/test_no_merge_databricks_query_hints.sql b/tests/data/unit_tests/test_merger/test_no_merge_databricks_query_hints.sql new file mode 100644 index 0000000..144380f --- /dev/null +++ b/tests/data/unit_tests/test_merger/test_no_merge_databricks_query_hints.sql @@ -0,0 +1,5 @@ +select /*+ my query hint */ + 1 +)))))__SQLFMT_OUTPUT__((((( +select + 1 diff --git a/tests/unit_tests/test_comment.py b/tests/unit_tests/test_comment.py index b6d994c..53ba67a 100644 --- a/tests/unit_tests/test_comment.py +++ b/tests/unit_tests/test_comment.py @@ -2,6 +2,7 @@ import pytest from sqlfmt.comment import Comment +from sqlfmt.node import Node from sqlfmt.node_manager import NodeManager from sqlfmt.token import Token, TokenType @@ -60,6 +61,30 @@ def multiline_comment() -> Comment: return comment +@pytest.fixture +def datbricks_query_hint_comment() -> Comment: + n = Node( + Token( + type=TokenType.UNTERM_KEYWORD, + prefix="", + token="select", + spos=0, + epos=6, + ), + previous_node=None, + prefix="", + value="select", + open_brackets=[], + open_jinja_blocks=[], + formatting_disabled=[], + ) + t = Token( + type=TokenType.COMMENT, prefix=" ", token="/*+ hint here */", spos=6, epos=23 + ) + comment = Comment(t, is_standalone=False, previous_node=n) + return comment + + @pytest.fixture def fmt_disabled_comment() -> Comment: t = Token(type=TokenType.FMT_OFF, prefix="", token="--fmt: off", spos=0, epos=10) @@ -81,11 +106,13 @@ def test_get_marker( short_mysql_comment: Comment, nospace_comment: Comment, short_js_comment: Comment, + datbricks_query_hint_comment: Comment, ) -> None: assert short_comment._get_marker() == ("--", 3) assert short_mysql_comment._get_marker() == ("#", 2) assert short_js_comment._get_marker() == ("//", 3) assert nospace_comment._get_marker() == ("--", 2) + assert datbricks_query_hint_comment._get_marker() == ("/*", 2) def test_comment_parts( @@ -210,3 +237,7 @@ def test_no_wrap_long_jinja_comments() -> None: rendered = comment.render_standalone(88, "") assert rendered == comment_str + "\n" + + +def test_no_add_space_databricks_hint(datbricks_query_hint_comment: Comment) -> None: + assert str(datbricks_query_hint_comment) == datbricks_query_hint_comment.token.token diff --git a/tests/unit_tests/test_merger.py b/tests/unit_tests/test_merger.py index c42d2af..fb22708 100644 --- a/tests/unit_tests/test_merger.py +++ b/tests/unit_tests/test_merger.py @@ -560,3 +560,15 @@ def test_do_not_merge_operator_sequences_across_commas(merger: LineMerger) -> No merged_lines = merger.maybe_merge_lines(raw_query.lines) result_string = "".join([str(line) for line in merged_lines]) assert result_string == expected_string + + +def test_do_not_merge_databricks_query_hints(merger: LineMerger) -> None: + source_string, expected_string = read_test_data( + "unit_tests/test_merger/test_no_merge_databricks_query_hints.sql" + ) + raw_query = merger.mode.dialect.initialize_analyzer( + merger.mode.line_length + ).parse_query(source_string) + merged_lines = merger.maybe_merge_lines(raw_query.lines) + result_string = "".join([str(line) for line in merged_lines]) + assert result_string == expected_string