diff --git a/docs/userguide.md b/docs/userguide.md index 648ca4e86d..cb548cdc57 100644 --- a/docs/userguide.md +++ b/docs/userguide.md @@ -307,3 +307,7 @@ A: You can use the existing Spark cluster without a separate deployment, but if Q: Can I mix Spark with TiKV? A: If TiDB and TiKV are overloaded and run critical online tasks, consider deploying TiSpark separately. You also need to consider using different NICs to ensure that OLTP's network resources are not compromised and affect online business. If the online business requirements are not high or the loading is not large enough, you can consider mixing TiSpark with TiKV deployment. + +Q: How to use PySpark with TiSpark? + +A: Please follow [TiSpark on PySpark](../python/README.md). \ No newline at end of file diff --git a/docs/userguide_spark2.1.md b/docs/userguide_spark2.1.md index 271524dfd3..0423396313 100644 --- a/docs/userguide_spark2.1.md +++ b/docs/userguide_spark2.1.md @@ -338,3 +338,7 @@ A: You can use the existing Spark cluster without a separate deployment, but if Q: Can I mix Spark with TiKV? A: If TiDB and TiKV are overloaded and run critical online tasks, consider deploying TiSpark separately. You also need to consider using different NICs to ensure that OLTP's network resources are not compromised and affect online business. If the online business requirements are not high or the loading is not large enough, you can consider mixing TiSpark with TiKV deployment. + +Q: How to use PySpark with TiSpark? + +A: Please follow [TiSpark on PySpark](../python/README_spark2.1.md). \ No newline at end of file diff --git a/python/README.md b/python/README.md index 10b8b0f901..b211d16fe6 100644 --- a/python/README.md +++ b/python/README.md @@ -1,16 +1,25 @@ ## TiSpark (version >= 2.0) on PySpark: **Note: If you are using TiSpark version less than 2.0, please read [this document](./README_spark2.1.md) instead** -pytispark will not be necessary since TiSpark version >= 2.0. 
### Usage There are currently two ways to use TiSpark on Python: + #### Directly via pyspark This is the simplest way, just a decent Spark environment should be enough. 1. Make sure you have the latest version of [TiSpark](https://github.com/pingcap/tispark) and a `jar` with all TiSpark's dependencies. 2. Remember to add needed configurations listed in [README](../README.md) into your `$SPARK_HOME/conf/spark-defaults.conf` -3. Copy `./resources/session.py` to `$SPARK_HOME/python/pyspark/sql/session.py` +3. For Spark-2.3.x, please copy `./resources/spark-2.3/session.py` to `$SPARK_HOME/python/pyspark/sql/session.py`. For other Spark versions, please edit the file `$SPARK_HOME/python/pyspark/sql/session.py` and change it from +```python +jsparkSession = self._jvm.SparkSession(self._jsc.sc()) +``` + +to + +```python +jsparkSession = self._jvm.SparkSession.builder().getOrCreate() +``` 4. Run this command in your `$SPARK_HOME` directory: ``` @@ -36,7 +45,7 @@ spark.sql("select count(*) from customer").show() #### Via spark-submit This way is useful when you want to execute your own Python scripts. -Because of an open issue **[SPARK-25003]** in Spark 2.3, using spark-submit for python files will only support following api +Because of an open issue **[SPARK-25003]** in Spark-2.3.x and Spark-2.4.x, using spark-submit for Python files only supports the following API 1. Use ```pip install pytispark``` in your console to install `pytispark` @@ -46,7 +55,7 @@ Note that you may need reinstall `pytispark` if you meet `No plan for reation` e ```python import pytispark.pytispark as pti from pyspark.sql import SparkSession -spark = SparkSession.getOrCreate() +spark = SparkSession.builder.getOrCreate() ti = pti.TiContext(spark) ti.tidbMapDatabase("tpch_test") diff --git a/python/pytispark/__init__.py b/python/pytispark/__init__.py deleted file mode 100644 index 9e05cfe020..0000000000 --- a/python/pytispark/__init__.py +++ /dev/null @@ -1,18 +0,0 @@ -# -# Copyright 2017 PingCAP, Inc. 
-# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# See the License for the specific language governing permissions and -# limitations under the License. -# - -def main(): - """Entry point for the application script""" - print("Call your main application code here") diff --git a/python/pytispark/pytispark.py b/python/pytispark/pytispark.py deleted file mode 100644 index e978594efc..0000000000 --- a/python/pytispark/pytispark.py +++ /dev/null @@ -1,46 +0,0 @@ -# -# Copyright 2017 PingCAP, Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# See the License for the specific language governing permissions and -# limitations under the License. 
-# - -from py4j.java_gateway import java_import -from pyspark.context import SparkContext - - -# TiContext -# Used for TiSpark -class TiContext: - """ - Create a new TiContext - :param sparkSession The spark session used for creating TiContext - """ - def __init__(self, sparkSession): - SparkContext._ensure_initialized() - gw = SparkContext._gateway - java_import(gw.jvm, "org.apache.spark.sql.TiExtensions") - self.ti = gw.jvm.TiExtensions.getInstance(sparkSession._jsparkSession).getOrCreateTiContext(sparkSession._jsparkSession) - - """ - Get the TiContext java representation - """ - def getContext(self): - return self.ti - - """ - Change TiContext designated database - :param dbName Database to map(switch to) - :param isPrefix Whether to use dbName As Prefix - :param loadStatistics Whether to use statistics information from TiDB - """ - def tidbMapDatabase(self, dbName, isPrefix=False, loadStatistics=True): - self.ti.tidbMapDatabase(dbName, isPrefix, loadStatistics) \ No newline at end of file diff --git a/python/resources/session.py b/python/resources/spark-2.3/session.py similarity index 100% rename from python/resources/session.py rename to python/resources/spark-2.3/session.py diff --git a/python/setup.cfg b/python/setup.cfg deleted file mode 100644 index 224a77957f..0000000000 --- a/python/setup.cfg +++ /dev/null @@ -1,2 +0,0 @@ -[metadata] -description-file = README.md \ No newline at end of file diff --git a/python/setup.py b/python/setup.py deleted file mode 100644 index 5c1eba7b50..0000000000 --- a/python/setup.py +++ /dev/null @@ -1,27 +0,0 @@ -from setuptools import setup -setup( - name='pytispark', - packages=['pytispark'], - version='2.0', - description='TiSpark support for python', - author='PingCAP', - author_email='novemser@gmail.com', - url='https://github.com/pingcap/tispark', - keywords=['tispark', 'spark', 'tidb', 'olap'], - license='Apache 2.0', - classifiers=[ - # How mature is this project? 
Common values are - # 3 - Alpha - # 4 - Beta - # 5 - Production/Stable - 'Development Status :: 3 - Alpha', - 'Intended Audience :: Developers', - 'Programming Language :: Python :: 2', - 'Programming Language :: Python :: 2.7', - 'Programming Language :: Python :: 3', - 'Programming Language :: Python :: 3.3', - 'Programming Language :: Python :: 3.4', - 'Programming Language :: Python :: 3.5', - ], - install_requires=['pyspark==2.3.3', 'py4j==0.10.7'] -)