Python tests & notes to address issue #41 #114

Merged: 6 commits, Jul 6, 2020
59 changes: 59 additions & 0 deletions experiments/stv/gdaf-openstack/config/requirements.txt
@@ -0,0 +1,59 @@
asn1crypto==0.24.0
attrs==18.2.0
Babel==2.6.0
boto==2.45.0
cffi==1.11.5
chardet==3.0.4
cloud-init==17.1
conda==4.6.14
configobj==5.0.6
cryptography==2.6.1
Cython==0.29.16
cytoolz==0.9.0.1
dbus-python==1.2.8
distro==1.4.0
frozendict==1.2
gpg==1.12.0
hdbscan==0.8.26
heat-cfntools==1.4.2
idna==2.7
Jinja2==2.10
joblib==0.14.1
jsonpatch==1.21
jsonpointer==1.10
jsonschema==3.0.1
MarkupSafe==1.1.1
numpy==1.18.2
oauthlib==2.1.0
pandas==1.0.3
pbr==5.1.2
ply==3.11
prettytable==0.7.2
psutil==5.4.3
py-cpuinfo==4.0.0
pyarrow==0.17.0
pyasn1==0.4.4
pycosat==0.6.3
pycparser==2.14
pycrypto==2.6.1
PyJWT==1.7.1
pyrsistent==0.14.11
pyserial==3.4
PySocks==1.6.8
python-dateutil==2.8.0
pytz==2018.5
PyYAML==5.1
requests==2.21.0
rpm==4.14.2.1
rsa==3.4.2
ruamel.yaml==0.16.5
ruamel.yaml.clib==0.1.2
scikit-learn==0.22.2.post1
scipy==1.4.1
sepolicy==1.1
setools==4.1.1
six==1.12.0
toolz==0.9.0
tqdm==4.37.0
urllib3==1.24.1

7 changes: 7 additions & 0 deletions experiments/stv/gdaf-openstack/tests/src/spark-tests/HOWTO.md
@@ -0,0 +1,7 @@

### Running a PySpark test

From the test directory, assuming you are on a node with PySpark configured, run:

`spark-submit test_pi.py`
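
As an aside (not part of this PR), a minimal sketch of a runner that discovers and executes all `test_*.py` modules with plain `unittest`; it assumes `SPARK_HOME` is exported so that `testbase.py` can put PySpark on `sys.path`:

```python
# run_all.py -- hypothetical helper, not included in this PR.
# Discovers every test_*.py module in the current directory and runs it
# with the standard unittest runner. Assumes SPARK_HOME is exported so
# that testbase.py can locate the PySpark libraries.
import unittest

if __name__ == "__main__":
    suite = unittest.defaultTestLoader.discover(".", pattern="test_*.py")
    unittest.TextTestRunner(verbosity=2).run(suite)
```

It can be launched either directly with `python` or via `spark-submit`, since `testbase.py` only needs `SPARK_HOME` to locate the PySpark libraries and then starts a local SparkContext itself.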

42 changes: 42 additions & 0 deletions experiments/stv/gdaf-openstack/tests/src/spark-tests/test_pi.py
@@ -0,0 +1,42 @@
import testbase
import unittest
from datetime import datetime
import random

def inside(p):
    """
    Helper method for piCalculation: returns True if a random point in the
    unit square falls inside the unit quarter-circle.
    """
    x, y = random.random(), random.random()
    return x*x + y*y < 1


def piCalculation(sc, sample):
    """
    Estimate the value of pi by Monte Carlo sampling on the cluster.
    """
    # range() works on both Python 2 and 3 (the original xrange is Python 2 only)
    count = sc.parallelize(range(0, sample)) \
              .filter(inside).count()
    return 4.0 * count / sample


class TestPiCalculation(testbase.PySparkTestBase):

    def test_pi_calculation(self):
        """
        Run the pi estimation job and assert that the time taken
        doesn't exceed a configured number of seconds.
        """
        # TODO: make these configurable
        _CHANGE_ME_max_seconds = 5
        _CHANGE_ME_sample = 100000

        tick = datetime.now()
        piCalculation(self.sc, _CHANGE_ME_sample)
        tock = datetime.now()
        diff = tock - tick
        self.assertTrue(diff.seconds <= _CHANGE_ME_max_seconds)


if __name__ == '__main__':
    unittest.main()
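
A side note, not part of the PR: the Monte Carlo estimator above has a standard error of roughly sqrt(pi*(4 - pi)/n), about 1.64/sqrt(n), so with the 100000 samples used in the test the estimate typically lands within about 0.01 of pi. A quick pure-Python (non-Spark) sketch of the same estimator, for checking that locally:

```python
# Local sanity check (no Spark): the same Monte Carlo estimator in pure Python.
# With n = 100000 the estimate should usually land within ~0.01 of math.pi.
import math
import random

def estimate_pi(n):
    hits = sum(1 for _ in range(n)
               if random.random()**2 + random.random()**2 < 1)
    return 4.0 * hits / n

if __name__ == "__main__":
    n = 100000
    est = estimate_pi(n)
    print("estimate: %.5f  error: %.5f" % (est, abs(est - math.pi)))
```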
@@ -0,0 +1,40 @@
import testbase
import unittest
from random import choice
from string import digits, ascii_lowercase
from datetime import datetime


def randomWordCount(sc, word_size, sample):
    """
    Generate [sample] random strings of length [word_size], count the
    occurrences of each string on the cluster and return 5 of the
    (word, count) pairs as a dict.
    """
    chars = digits + ascii_lowercase
    seq = ["".join([choice(chars) for i in range(word_size)]) for j in range(sample)]
    data = sc.parallelize(seq)
    # Note: top(5) orders by the (word, count) tuple, i.e. lexicographically by
    # word; use top(5, key=lambda kv: kv[1]) to rank by count instead.
    counts = data.map(lambda word: (word, 1)).reduceByKey(lambda a, b: a + b).top(5)
    return dict(counts)


class TestRandomWordCount(testbase.PySparkTestBase):

    def test_word_count(self):
        """
        Run the random word count job and assert that the time taken
        doesn't exceed a configured number of seconds.
        """
        # TODO: make this configurable
        _CHANGE_ME_max_seconds = 5
        _CHANGE_ME_sample = 10000

        tick = datetime.now()
        randomWordCount(self.sc, 3, _CHANGE_ME_sample)
        tock = datetime.now()
        diff = tock - tick
        self.assertTrue(diff.seconds <= _CHANGE_ME_max_seconds)


if __name__ == '__main__':
    unittest.main()
@@ -0,0 +1,29 @@
import testbase
import unittest


def wordCount(rdd):
    """
    Counts the words in an RDD and returns an RDD of (word, count) pairs.
    """
    wcntRdd = rdd.flatMap(lambda line: line.split()).\
        map(lambda word: (word, 1)).\
        reduceByKey(lambda fa, fb: fa + fb)
    return wcntRdd


class TestWordCount(testbase.PySparkTestBase):

    def test_word_count(self):
        """
        Test the word count method.
        Assert that the word counts are correct.
        """
        rdd = self.sc.parallelize(["a b c d", "a c d e", "a d e f"])
        res = wordCount(rdd)
        res = res.collectAsMap()
        expected = {"a": 3, "b": 1, "c": 2, "d": 3, "e": 2, "f": 1}
        self.assertEqual(res, expected)


if __name__ == '__main__':
    unittest.main()
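
For reference (not part of the PR), the same count done with plain Python's `collections.Counter`, which is exactly what the `expected` dict in the test encodes:

```python
# Non-Spark equivalent of wordCount(), useful for checking expected values.
from collections import Counter

lines = ["a b c d", "a c d e", "a d e f"]
counts = Counter(word for line in lines for word in line.split())
print(dict(counts))  # {'a': 3, 'b': 1, 'c': 2, 'd': 3, 'e': 2, 'f': 1}
```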
41 changes: 41 additions & 0 deletions experiments/stv/gdaf-openstack/tests/src/spark-tests/testbase.py
@@ -0,0 +1,41 @@
import os
import sys
import unittest


# Locate the PySpark libraries shipped with the Spark installation.
# The py4j zip name must match the version bundled with $SPARK_HOME.
SPARK_HOME = os.environ["SPARK_HOME"]
sys.path.insert(1, os.path.join(SPARK_HOME, 'python'))
sys.path.insert(1, os.path.join(SPARK_HOME, 'python', 'pyspark'))
sys.path.insert(1, os.path.join(SPARK_HOME, 'python', 'build'))
sys.path.insert(1, os.path.join(SPARK_HOME, 'python', 'lib/py4j-0.8.2.1-src.zip'))
pyspark_python = sys.executable
os.environ['PYSPARK_PYTHON'] = pyspark_python

from pyspark.conf import SparkConf
from pyspark.context import SparkContext


sc_values = {}


class PySparkTestBase(unittest.TestCase):
    """
    Reusable PySpark test case class.
    All tests in a subclass share a single SparkContext.
    """

    @classmethod
    def setUpClass(cls):
        conf = SparkConf().setMaster("local[2]") \
            .setAppName(cls.__name__) \
            .set("spark.authenticate.secret", "test")
        cls.sc = SparkContext(conf=conf)
        sc_values[cls.__name__] = cls.sc

    @classmethod
    def tearDownClass(cls):
        sc_values.clear()
        cls.sc.stop()

75 changes: 75 additions & 0 deletions notes/stv/20200429-py-libs.txt
@@ -0,0 +1,75 @@
#
# <meta:header>
# <meta:licence>
# Copyright (c) 2020, ROE (http://www.roe.ac.uk/)
#
# This information is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This information is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
# </meta:licence>
# </meta:header>
#
#


# Python libraries and Packages required for our Spark Cluster
# ------------------------------------------------------------

Name:
Scikit-learn
Version:
0.22.2.post1
Install:
sudo yum install python-scikit-learn
Link:
https://scikit-learn.org/stable/



Name:
PyDevel
Version:
Latest
Install:
sudo yum install -y python3-devel
Link:
https://pkgs.org/download/python3-devel


Name:
Pandas
Version:
1.0.3
Install:
sudo /usr/bin/python2.7 -m pip install pandas
Link:
https://pandas.pydata.org/


Name:
PyArrow
Version:
0.17.0
Install:
sudo /usr/bin/python2.7 -m pip install pyarrow==0.16.*
Link:
https://pypi.org/project/pyarrow/


Name:
Hdbscan
Version:
0.8.26
Install:
sudo /usr/bin/python2.7 -m pip install hdbscan
Link:
https://hdbscan.readthedocs.io/en/latest/
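

# A small helper sketch (my own addition, not part of these notes) to check
# that the versions installed on a node match the pins listed above. It
# assumes setuptools' pkg_resources is available; the file name
# check_versions.py is hypothetical.

```python
# check_versions.py -- hypothetical helper, not part of these notes.
# Compares installed package versions against the pinned versions above.
import pkg_resources

PINNED = {
    "scikit-learn": "0.22.2.post1",
    "pandas": "1.0.3",
    "pyarrow": "0.17.0",
    "hdbscan": "0.8.26",
}

for name, wanted in PINNED.items():
    try:
        installed = pkg_resources.get_distribution(name).version
    except pkg_resources.DistributionNotFound:
        print("%-12s MISSING (want %s)" % (name, wanted))
        continue
    status = "OK" if installed == wanted else "MISMATCH"
    print("%-12s installed %-12s want %-12s %s" % (name, installed, wanted, status))
```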