added saveAsTextFiles and saveAsPickledFiles

giwa committed Aug 18, 2014
1 parent 94f2b65 commit e9fab72

Showing 5 changed files with 78 additions and 14 deletions.
17 changes: 8 additions & 9 deletions python/pyspark/streaming/context.py
@@ -114,7 +114,7 @@ def textFileStream(self, directory):
    Create an input stream that monitors a Hadoop-compatible file system
    for new files and reads them as text files. Files must be written to the
    monitored directory by "moving" them from another location within the same
    file system. FIle names starting with . are ignored.
    file system. File names starting with . are ignored.
    """
    return DStream(self._jssc.textFileStream(directory), self, UTF8Deserializer())

@@ -132,8 +132,9 @@ def stop(self, stopSparkContext=True, stopGraceFully=False):

def _testInputStream(self, test_inputs, numSlices=None):
    """
    This is inpired by QueStream implementation. Give list of RDD and generate DStream
    which contain the RDD.
    This function is only for testing.
    This implementation is inspired by the QueueStream implementation.
    Given a list of RDDs, it generates a DStream that contains those RDDs.
    """
    test_rdds = list()
    test_rdd_deserializers = list()
@@ -142,12 +142,10 @@ def _testInputStream(self, test_inputs, numSlices=None):
        test_rdds.append(test_rdd._jrdd)
        test_rdd_deserializers.append(test_rdd._jrdd_deserializer)

    # if len(set(test_rdd_deserializers)) > 1:
    #     raise IOError("Deserializer should be one type to run test case. "
    #                   "See the SparkContext.parallelize to understand how to decide deserializer")
    jtest_rdds = ListConverter().convert(test_rdds, SparkContext._gateway._gateway_client)
    jinput_stream = self._jvm.PythonTestInputStream(self._jssc, jtest_rdds).asJavaDStream()

    dstream = DStream(jinput_stream, self, test_rdd_deserializers[0])
    return dstream

def _testInputStream3(self):
    jinput_stream = self._jvm.PythonTestInputStream3(self._jssc).asJavaDStream()
    return DStream(jinput_stream, self, UTF8Deserializer())
    return DStream(jinput_stream, self, test_rdd_deserializers[0])
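For context, a minimal sketch of how a test might drive this helper; the `ssc` handle and the input lists are illustrative assumptions, not part of this commit:

    # Hypothetical usage of the test-only helper above.
    # Assumes `ssc` is an initialized pyspark StreamingContext.
    test_inputs = [range(4), range(4, 8)]        # one input list per batch
    dstream = ssc._testInputStream(test_inputs)  # each list becomes one RDD/batch
    # the resulting DStream reuses the deserializer of the first parallelized RDD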
35 changes: 31 additions & 4 deletions python/pyspark/streaming/dstream.py
@@ -24,6 +24,8 @@
from pyspark.rdd import _JavaStackTrace
from pyspark.storagelevel import StorageLevel
from pyspark.resultiterable import ResultIterable
from pyspark.streaming.utils import rddToFileName


from py4j.java_collections import ListConverter, MapConverter

@@ -343,21 +345,46 @@ def mergeCombiners(a, b):
    return self.combineByKey(createCombiner, mergeValue, mergeCombiners,
                             numPartitions).mapValues(lambda x: ResultIterable(x))

def countByValue(self):
    """
    Return a new DStream in which each RDD contains the counts of each
    distinct value in each RDD of this DStream.
    """
    def countPartition(iterator):
        # count occurrences of each value within a single partition
        counts = defaultdict(int)
        for obj in iterator:
            counts[obj] += 1
        yield counts

    def mergeMaps(m1, m2):
        # merge the per-partition count maps into one
        for (k, v) in m2.iteritems():
            m1[k] += v
        return m1

    return self.mapPartitions(countPartition).reduce(mergeMaps).flatMap(lambda x: x.items())
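A quick illustration of the count-then-merge pattern above (a sketch; `dstream` and the batch contents are assumed):

    # For a batch containing ["a", "a", "b"], countByValue emits an RDD of
    # (value, count) pairs: ("a", 2), ("b", 1).
    counts = dstream.countByValue()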

def saveAsTextFiles(self, prefix, suffix=None):
    """Save each RDD in this DStream as a text file."""
    def saveAsTextFile(rdd, time):
        path = rddToFileName(prefix, suffix, time)
        rdd.saveAsTextFile(path)

    return self.foreachRDD(saveAsTextFile)

def saveAsPickledFiles(self, prefix, suffix=None):
    """Save each RDD in this DStream as a SequenceFile of serialized objects."""
    def saveAsPickleFile(rdd, time):
        path = rddToFileName(prefix, suffix, time)
        rdd.saveAsPickleFile(path)

    return self.foreachRDD(saveAsPickleFile)
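A hedged usage sketch for the two new output operations; `lines` is an assumed DStream and the timestamp is illustrative:

    # Each batch is written under a path <prefix>-<batch time>[.<suffix>],
    # built by rddToFileName in utils.py, e.g. logs-1408320000000.txt
    lines.saveAsTextFiles("logs", "txt")
    lines.saveAsPickledFiles("objs")  # pickled output: objs-1408320000000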

# TODO: implement groupByKey

# The following operations depend on transform:
# TODO: implement union
# TODO: implement repartition
# TODO: implement cogroup
# TODO: implement join
# TODO: implement leftOuterJoin
# TODO: implement rightOuterJoin



class PipelinedDStream(DStream):
    def __init__(self, prev, func, preservesPartitioning=False):
        if not isinstance(prev, PipelinedDStream) or not prev._is_pipelinable():
6 changes: 6 additions & 0 deletions python/pyspark/streaming/utils.py
@@ -53,3 +53,9 @@ def msDurationToString(ms):
return "%.1f m" % (float(ms) / minute)
else:
return "%.2f h" % (float(ms) / hour)

def rddToFileName(prefix, suffix, time):
    """Build the batch output path <prefix>-<time>[.<suffix>]."""
    if suffix is not None:
        return prefix + "-" + str(time) + "." + suffix
    else:
        return prefix + "-" + str(time)
32 changes: 32 additions & 0 deletions python/pyspark/streaming_tests.py
@@ -301,6 +301,38 @@ def f(iterator):
    output = self._run_stream(test_input, test_func, expected_output, numSlices)
    self.assertEqual(expected_output, output)

def test_countByValue_batch(self):
    """Basic operation test for DStream.countByValue with batch deserializer"""
    test_input = [range(1, 5) + range(1, 5), range(5, 7) + range(5, 9), ["a"] * 2 + ["b"] + [""]]

    def test_func(dstream):
        return dstream.countByValue()
    expected_output = [[(1, 2), (2, 2), (3, 2), (4, 2)],
                       [(5, 2), (6, 2), (7, 1), (8, 1)],
                       [("a", 2), ("b", 1), ("", 1)]]
    output = self._run_stream(test_input, test_func, expected_output)
    for result in (output, expected_output):
        self._sort_result_based_on_key(result)
    self.assertEqual(expected_output, output)

def test_countByValue_unbatch(self):
    """Basic operation test for DStream.countByValue with unbatched deserializer"""
    test_input = [range(1, 4), [1, 1, ""], ["a", "a", "b"]]

    def test_func(dstream):
        return dstream.countByValue()
    expected_output = [[(1, 1), (2, 1), (3, 1)],
                       [(1, 2), ("", 1)],
                       [("a", 2), ("b", 1)]]
    output = self._run_stream(test_input, test_func, expected_output)
    for result in (output, expected_output):
        self._sort_result_based_on_key(result)
    self.assertEqual(expected_output, output)

def _sort_result_based_on_key(self, outputs):
    """Sort each batch's output by key so comparison is order-insensitive."""
    for output in outputs:
        output.sort(key=lambda x: x[0])

def _run_stream(self, test_input, test_func, expected_output, numSlices=None):
    """Start stream and return the output"""
    # Generate input stream with user-defined input
@@ -138,7 +138,7 @@ class PythonTransformedDStream(
* This is an input stream just for unit tests. This is equivalent to a checkpointable,
* replayable, reliable message queue like Kafka. It requires a sequence as input, and
* returns the i_th element at the i_th batch under manual clock.
* This implementation is close to QueStream
* This implementation is inspired by QueueStream
*/

class PythonTestInputStream(ssc_ : JavaStreamingContext, inputRDDs: JArrayList[JavaRDD[Array[Byte]]])
