Skip to content

Commit

Permalink
added correct doctest for histogram
Browse files Browse the repository at this point in the history
  • Loading branch information
dwmclary committed Mar 11, 2014
1 parent 4916016 commit eaf89d9
Showing 1 changed file with 8 additions and 8 deletions.
16 changes: 8 additions & 8 deletions python/pyspark/rdd.py
Original file line number Diff line number Diff line change
Expand Up @@ -613,14 +613,6 @@ def sampleVariance(self):
return self.stats().sampleVariance()

def getBuckets(self, bucketCount):
"""
Compute a histogram of the data using bucketCount number of buckets
evenly spaced between the min and max of the RDD.
>>> sc.parallelize([1,49, 23, 100, 75, 50]).histogram()
{(0,49):3, (50, 100):3}
"""

#use the statscounter as a quick way of getting max and min
mm_stats = self.stats()
min = mm_stats.min()
Expand All @@ -634,6 +626,14 @@ def getBuckets(self, bucketCount):
return buckets

def histogram(self, bucketCount, buckets=None):
"""
Compute a histogram of the data using bucketCount number of buckets
evenly spaced between the min and max of the RDD.
>>> sc.parallelize([1,49, 23, 100, 12, 13, 20, 22, 75, 50]).histogram(3)
defaultdict(<type 'int'>, {(67, inf): 2, (1, 33): 6, (34, 66): 2})
"""

evenBuckets = False
if not buckets:
buckets = self.getBuckets(bucketCount)
Expand Down

0 comments on commit eaf89d9

Please sign in to comment.