[DOP-11904] - add has_data, raise_if_no_data methods in DBReader #203

Merged — 24 commits merged on Feb 1, 2024

Changes from 19 commits

Commits (24)
71cf999
[DOP-11752] - add support of incremental strategy in Kafka
maxim-lixakov Jan 23, 2024
b37780d
[DOP-11752] - update tests
maxim-lixakov Jan 24, 2024
8da10bc
[DOP-11752] - fix the behavior with changing partitions number in inc…
maxim-lixakov Jan 25, 2024
2b4fdba
[DOP-11752] - update tests
maxim-lixakov Jan 26, 2024
56c83d1
[DOP-11752] - fix incremental_with_new_partition test
maxim-lixakov Jan 26, 2024
dbe169c
[DOP-11904] - add has_data, raise_if_no_data methods in DBReader
maxim-lixakov Jan 29, 2024
b38bb9b
[DOP-11904] - update fixtures for Kafka tests
maxim-lixakov Jan 30, 2024
51dab70
[DOP-11904] - customize Oracle and MSSQL get_sql_query to handle limit
maxim-lixakov Jan 30, 2024
6515214
[DOP-11904] - customize Oracle and MSSQL get_sql_query, add unit tests
maxim-lixakov Jan 30, 2024
69cf0b5
[DOP-11904] - minor bug fix in fetching_hwm type
maxim-lixakov Jan 30, 2024
949891d
[DOP-11904] - add docs for using .has_data inside contextlib
maxim-lixakov Jan 31, 2024
79b64b6
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Jan 31, 2024
e8c3d33
[DOP-11904] - add documentation and tests for using has_data out of c…
maxim-lixakov Jan 31, 2024
3e53e63
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Jan 31, 2024
922187a
[DOP-11904] - update has_data docstrings
maxim-lixakov Jan 31, 2024
ae96f7f
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Jan 31, 2024
ad045a1
[DOP-11904] - update has_data docstrings
maxim-lixakov Jan 31, 2024
b9c677d
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Jan 31, 2024
c9b9495
[DOP-11904] - update docstrings
maxim-lixakov Jan 31, 2024
7b714ed
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Jan 31, 2024
6a59169
Update onetl/db/db_reader/db_reader.py
maxim-lixakov Jan 31, 2024
37c0a47
[DOP-11904] - update docstrings
maxim-lixakov Jan 31, 2024
d1bc4df
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Jan 31, 2024
edd80ce
[DOP-11904] - update docstrings
maxim-lixakov Jan 31, 2024
3 changes: 3 additions & 0 deletions README.rst
@@ -337,6 +337,9 @@ Read data from MSSQL, transform & write to Hive.
options=MSSQL.ReadOptions(fetchsize=10000),
)

# check that there is data in the table, otherwise raise an exception
reader.raise_if_no_data()

# Read data to DataFrame
df = reader.run()
df.printSchema()
1 change: 1 addition & 0 deletions docs/changelog/next_release/203.feature.rst
@@ -0,0 +1 @@
Add ``has_data``, ``raise_if_no_data`` methods to ``DBReader`` class.
4 changes: 3 additions & 1 deletion docs/db/db_reader.rst
@@ -9,6 +9,8 @@ DB Reader

DBReader
DBReader.run
DBReader.has_data
DBReader.raise_if_no_data

.. autoclass:: DBReader
:members: run
:members: run, has_data, raise_if_no_data
23 changes: 23 additions & 0 deletions onetl/connection/db_connection/mssql/dialect.py
@@ -27,6 +27,29 @@ def get_partition_column_hash(self, partition_column: str, num_partitions: int)
def get_partition_column_mod(self, partition_column: str, num_partitions: int) -> str:
return f"{partition_column} % {num_partitions}"

def get_sql_query(
self,
table: str,
columns: list[str] | None = None,
where: str | list[str] | None = None,
hint: str | None = None,
limit: int | None = None,
compact: bool = False,
) -> str:
query = super().get_sql_query(
table=table,
columns=columns,
where=where,
hint=hint,
limit=0 if limit == 0 else None,
compact=compact,
)
# MSSQL-specific handling for the LIMIT clause using TOP
if limit is not None and limit > 0:
query = query.replace("SELECT", f"SELECT TOP {limit}", 1)

return query

def _serialize_datetime(self, value: datetime) -> str:
result = value.isoformat()
return f"CAST('{result}' AS datetime2)"
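For reference, a minimal sketch of the query shape this MSSQL override aims at; the table, columns and predicate below are hypothetical, and the base-dialect output for limit=0 is not reproduced here:

    # Hypothetical example: MSSQL has no LIMIT clause, so a positive limit
    # is injected as TOP right after the first SELECT keyword.
    base_query = "SELECT id, name FROM dbo.users WHERE is_active = 1"
    limit = 1
    if limit is not None and limit > 0:
        base_query = base_query.replace("SELECT", f"SELECT TOP {limit}", 1)
    print(base_query)
    # SELECT TOP 1 id, name FROM dbo.users WHERE is_active = 1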
14 changes: 13 additions & 1 deletion onetl/connection/db_connection/oracle/dialect.py
@@ -33,12 +33,24 @@ def get_sql_query(
new_columns = columns or ["*"]
if len(new_columns) > 1:
new_columns = [table + ".*" if column.strip() == "*" else column for column in new_columns]

where = where or []
if isinstance(where, str):
where = [where]

if limit is not None:
if limit == 0:
where = ["1=0"]
else:
# Oracle does not support LIMIT
where.append(f"ROWNUM <= {limit}")

return super().get_sql_query(
table=table,
columns=new_columns,
where=where,
hint=hint,
limit=limit,
limit=None,
compact=compact,
)

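Likewise, a minimal sketch of the Oracle behavior with hypothetical predicates: a positive limit becomes a ROWNUM condition appended to the WHERE clause, while limit=0 collapses the filter into an always-false "1=0".

    # Hypothetical example mirroring the Oracle override above: LIMIT is not
    # supported, so row limiting is expressed through the WHERE clause.
    def where_with_limit(where, limit):
        if limit is None:
            return where
        if limit == 0:
            return ["1=0"]  # matches no rows
        return [*where, f"ROWNUM <= {limit}"]

    print(where_with_limit(["status = 'ACTIVE'"], 1))
    # ["status = 'ACTIVE'", 'ROWNUM <= 1']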
103 changes: 99 additions & 4 deletions onetl/db/db_reader/db_reader.py
@@ -17,6 +17,7 @@
ContainsGetDFSchemaMethod,
ContainsGetMinMaxValues,
)
from onetl.exception import NoDataError
from onetl.hooks import slot, support_hooks
from onetl.hwm import AutoDetectHWM, Edge, Window
from onetl.impl import FrozenModel, GenericOptions
@@ -501,6 +502,90 @@ def validate_options(cls, options, values):

return None

@slot
def has_data(self) -> bool:
"""Returns ``True`` if there is some data in the source, ``False`` otherwise.

.. warning::

If :etl-entities:`hwm <hwm/index.html>` is used, the method should be called inside the :ref:`strategy` context. And vice versa: if HWM is not used, this method should not be called within a strategy.

Raises
------
RuntimeError

The current strategy is not compatible with the HWM parameter.

Examples
--------

.. code:: python

reader = DBReader(...)

# handle the situation when there is no data in the source
if reader.has_data():
df = reader.run()
else:
# implement your handling logic here
...
"""
self._check_strategy()

if not self._connection_checked:
self._log_parameters()
self.connection.check()

window, limit = self._calculate_window_and_limit()
if limit == 0:
return False

df = self.connection.read_source_as_df(
source=str(self.source),
columns=self.columns,
hint=self.hint,
where=self.where,
df_schema=self.df_schema,
window=window,
limit=1,
**self._get_read_kwargs(),
)

return bool(df.take(1))

@slot
def raise_if_no_data(self) -> None:
"""Raises exception ``NoDataError`` if source does not contain any data.

.. warning::

If :etl-entities:`hwm <hwm/index.html>` is used, the method should be called inside the :ref:`strategy` context. And vice versa: if HWM is not used, this method should not be called within a strategy.

Raises
------
RuntimeError

The current strategy is not compatible with the HWM parameter.

:obj:`onetl.exception.NoDataError`

There is no data in the source.

Examples
--------

.. code:: python

reader = DBReader(...)

# before creating read SparkDF, ensure that there is some data in the source
reader.raise_if_no_data()
df = reader.run()
"""

if not self.has_data():
raise NoDataError(f"No data in the source: {self.source}")

@slot
def run(self) -> DataFrame:
"""
@@ -510,6 +595,10 @@ def run(self) -> DataFrame:

This method can return different results depending on :ref:`strategy`

.. warning::

If :etl-entities:`hwm <hwm/index.html>` is used, the method should be called inside the :ref:`strategy` context. And vice versa: if HWM is not used, this method should not be called within a strategy.

Returns
-------
df : pyspark.sql.dataframe.DataFrame
@@ -541,6 +630,12 @@ def run(self) -> DataFrame:
self._connection_checked = True

window, limit = self._calculate_window_and_limit()

# update the HWM with the stop value
if self.hwm and window:
strategy: HWMStrategy = StrategyManager.get_current() # type: ignore[assignment]
strategy.update_hwm(window.stop_at.value)

df = self.connection.read_source_as_df(
source=str(self.source),
columns=self.columns,
@@ -562,7 +657,9 @@ def _check_strategy(self):

if self.hwm:
if not isinstance(strategy, HWMStrategy):
raise RuntimeError(f"{class_name}(hwm=...) cannot be used with {strategy_name}")
raise RuntimeError(
f"{class_name}(hwm=...) cannot be used with {strategy_name}. Check documentation DBReader.has_data(): https://onetl.readthedocs.io/en/stable/db/db_reader.html#onetl.db.db_reader.db_reader.DBReader.has_data.",
)
self._prepare_hwm(strategy, self.hwm)

elif isinstance(strategy, HWMStrategy):
@@ -578,7 +675,7 @@ def _prepare_hwm(self, strategy: HWMStrategy, hwm: ColumnHWM):
strategy.fetch_hwm()
return

if not isinstance(strategy.hwm, ColumnHWM) or strategy.hwm.name != hwm.name:
if not isinstance(strategy.hwm, (ColumnHWM, KeyValueHWM)) or strategy.hwm.name != hwm.name:
# raised when more than one process runs on the same table within one strategy but with different HWM columns
# example: test_postgres_strategy_incremental_hwm_set_twice
error_message = textwrap.dedent(
@@ -673,7 +770,6 @@ def _calculate_window_and_limit(self) -> tuple[Window | None, int | None]: # no
if start_value is not None and stop_value is not None:
# we already have start and stop values, nothing to do
window = Window(self.hwm.expression, start_from=strategy.current, stop_at=strategy.next)
strategy.update_hwm(window.stop_at.value)
return window, None

if not isinstance(self.connection, ContainsGetMinMaxValues):
@@ -737,7 +833,6 @@ def _calculate_window_and_limit(self) -> tuple[Window | None, int | None]: # no
stop_at=Edge(value=max_value),
)

strategy.update_hwm(window.stop_at.value)
return window, None

def _log_parameters(self) -> None:
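Tying the new methods to the strategy requirement spelled out in the docstrings above, a sketch of the intended call pattern; the connection object, source table and HWM name/expression are placeholders rather than part of this PR:

    # Sketch only: connection, source and HWM details are placeholders.
    from onetl.db import DBReader
    from onetl.strategy import IncrementalStrategy

    reader = DBReader(
        connection=postgres,  # any supported connection created earlier
        source="schema.table",
        hwm=DBReader.AutoDetectHWM(name="unique_hwm_name", expression="updated_at"),
    )

    # hwm is set, so both checks must run inside the strategy context
    with IncrementalStrategy():
        if reader.has_data():
            df = reader.run()
        else:
            ...  # nothing new to read; reader.raise_if_no_data() would raise NoDataError here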
32 changes: 32 additions & 0 deletions tests/fixtures/processing/fixtures.py
@@ -76,3 +76,35 @@ def load_table_data(prepare_schema_table, processing):
)

return prepare_schema_table


@pytest.fixture
def kafka_topic(processing, request):
topic = secrets.token_hex(6)
processing.create_topic(topic, num_partitions=1)

def delete_topic():
processing.delete_topic([topic])

request.addfinalizer(delete_topic)
return topic


@pytest.fixture
def kafka_dataframe_schema():
from pyspark.sql.types import (
FloatType,
LongType,
StringType,
StructField,
StructType,
)

return StructType(
[
StructField("id_int", LongType(), nullable=True),
StructField("text_string", StringType(), nullable=True),
StructField("hwm_int", LongType(), nullable=True),
StructField("float_value", FloatType(), nullable=True),
],
)
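A short sketch of how a test might consume these new fixtures; it relies only on what the fixture bodies above guarantee (a fresh single-partition topic with a random 12-character name, and a fixed four-field schema):

    # Sketch only: exercises the kafka_topic and kafka_dataframe_schema fixtures.
    def test_kafka_fixtures_shape(kafka_topic, kafka_dataframe_schema):
        # secrets.token_hex(6) yields 12 hex characters for the topic name
        assert len(kafka_topic) == 12

        # the schema describes the expected payload of Kafka test messages
        assert [field.name for field in kafka_dataframe_schema.fields] == [
            "id_int",
            "text_string",
            "hwm_int",
            "float_value",
        ]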
@@ -338,6 +338,11 @@ def test_clickhouse_reader_snapshot_nothing_to_read(spark, processing, prepare_s
first_span = processing.create_pandas_df(min_id=first_span_begin, max_id=first_span_end)
second_span = processing.create_pandas_df(min_id=second_span_begin, max_id=second_span_end)

with pytest.raises(Exception, match="No data in the source:"):
reader.raise_if_no_data()

assert not reader.has_data()

# no data yet, nothing to read
df = reader.run()
assert not df.count()
@@ -352,8 +357,12 @@ def test_clickhouse_reader_snapshot_nothing_to_read(spark, processing, prepare_s
# .run() is not called, but dataframes are lazy, so it now contains all data from the source
processing.assert_equal_df(df=df, other_frame=first_span, order_by="id_int")

# check that the source now has data
assert reader.has_data()

# read data explicitly
df = reader.run()

processing.assert_equal_df(df=df, other_frame=first_span, order_by="id_int")

# insert second span
@@ -280,6 +280,11 @@ def test_greenplum_reader_snapshot_nothing_to_read(spark, processing, prepare_sc
first_span = processing.create_pandas_df(min_id=first_span_begin, max_id=first_span_end)
second_span = processing.create_pandas_df(min_id=second_span_begin, max_id=second_span_end)

with pytest.raises(Exception, match="No data in the source:"):
reader.raise_if_no_data()

assert not reader.has_data()

# no data yet, nothing to read
df = reader.run()
assert not df.count()
@@ -294,8 +299,12 @@ def test_greenplum_reader_snapshot_nothing_to_read(spark, processing, prepare_sc
# .run() is not called, but dataframes are lazy, so it now contains all data from the source
processing.assert_equal_df(df=df, other_frame=first_span, order_by="id_int")

# check that the source now has data
assert reader.has_data()

# read data explicitly
df = reader.run()

processing.assert_equal_df(df=df, other_frame=first_span, order_by="id_int")

# insert second span
@@ -220,6 +220,11 @@ def test_hive_reader_snapshot_nothing_to_read(spark, processing, prepare_schema_
first_span = processing.create_pandas_df(min_id=first_span_begin, max_id=first_span_end)
second_span = processing.create_pandas_df(min_id=second_span_begin, max_id=second_span_end)

with pytest.raises(Exception, match="No data in the source:"):
reader.raise_if_no_data()

assert not reader.has_data()

# no data yet, nothing to read
df = reader.run()
assert not df.count()
@@ -234,8 +239,12 @@ def test_hive_reader_snapshot_nothing_to_read(spark, processing, prepare_schema_
# .run() is not called, but dataframes are lazy, so it now contains all data from the source
processing.assert_equal_df(df=df, other_frame=first_span, order_by="id_int")

# check that the source now has data
assert reader.has_data()

# read data explicitly
df = reader.run()

processing.assert_equal_df(df=df, other_frame=first_span, order_by="id_int")

# insert second span