From c8a40e321c8ca1ded2274b51b18c2d5e05291148 Mon Sep 17 00:00:00 2001 From: Ayman Lafaz Date: Wed, 6 Oct 2021 13:02:27 +0100 Subject: [PATCH] added approximate counting implementation in python (#866) * added approximate counting implementation in python * Update approximate_counting.py * updating approximate_counting.py * removed redundancies --- .../approximate_counting.md | 12 +++-- .../code/python/approximate_counting.py | 49 +++++++++++++++++++ 2 files changed, 56 insertions(+), 5 deletions(-) create mode 100644 contents/approximate_counting/code/python/approximate_counting.py diff --git a/contents/approximate_counting/approximate_counting.md b/contents/approximate_counting/approximate_counting.md index 917e79922..f63d7db43 100644 --- a/contents/approximate_counting/approximate_counting.md +++ b/contents/approximate_counting/approximate_counting.md @@ -127,7 +127,7 @@ Here is a table for the true count, approximate count, and percent error for 10, | 500,000 | 499,813.2 | 0.037 | | 1,000,000 | 999,466.0 | 0.053 | -Here, it seems that the percent error is 10 times higher when we count 10,000 items; however, +Here, it seems that the percent error is 10 times higher when we count 10,000 items; however, with these numbers, I could imagine some people reading this are thinking that we are splitting hairs. A 0.42% error is still really good, right? Right. @@ -200,7 +200,7 @@ To be clear, here is a table of several values that could be stored in a bitstri | $$00000100 = 4$$ | $$15$$ | | $$00010000 = 16$$ | $$65535$$ | | $$01000000 = 64$$ | $$1.85 \times 10^{19}$$ | -| $$10000000 = 128$$ | $$3.40 \times 10^{38}$$ | +| $$10000000 = 128$$ | $$3.40 \times 10^{38}$$ | | $$11111111 = 255$$ | $$5.79 \times 10^{76}$$ | This means that we can hold from $$0$$ to $$2^{255} - 1 \approx 5.79 \times 10^{76}$$ with 8 bits using this new method. 
@@ -250,7 +250,7 @@ In the next section, we will consider how to generalize this logarithmic method ## A slightly more general logarithm Let's start by considering the differences between base $$2$$ and base $$e$$. -For base $$e$$, +For base $$e$$, $$ \begin{align} @@ -283,14 +283,14 @@ Going one step further, we need to chose a specific base to a logarithm that wil $$ \begin{align} - v &= \frac{\log(1+n/a)}{\log(1+1/a)}. \\ + v &= \frac{\log(1+n/a)}{\log(1+1/a)}. \\ n_v &= a\left(\left(1+\frac{1}{a}\right)^v-1\right). \end{align} $$ Here, $$a$$ is an effective tuning parameter and sets the maximum count allowed by the bitstring and the expected error. The expression $$1+1/a$$ acts as a base for the logarithm and exponents and ensures that the first count of $$n=1$$ will also set the value $$v=1$$. -As an example, if the bitstring can be a maximum of 255 (for 8 bits) and we arbitrarily set +As an example, if the bitstring can be a maximum of 255 (for 8 bits) and we arbitrarily set $$a=30$$, then the highest possible count with this approach will be $$\approx 130,000$$, which was the number reported in Morris's paper. 
If we perform a few counting experiments, we find that this formula more closely tracks smaller numbers than before (when we were not using the logarithm): @@ -362,6 +362,8 @@ As we do not have any objects to count, we will instead simulate the counting wi [import, lang:"julia"](code/julia/approximate_counting.jl) {% sample lang="cpp" %} [import, lang:"cpp"](code/c++/approximate_counting.cpp) +{% sample lang="python" %} +[import, lang:"python"](code/python/approximate_counting.py) {% endmethod %} ### Bibliography diff --git a/contents/approximate_counting/code/python/approximate_counting.py b/contents/approximate_counting/code/python/approximate_counting.py new file mode 100644 index 000000000..eb31b2b24 --- /dev/null +++ b/contents/approximate_counting/code/python/approximate_counting.py @@ -0,0 +1,49 @@ +from random import random + +# This function takes +# - v: value in register +# - a: a scaling value for the logarithm based on Morris's paper +# It returns n(v,a), the approximate_count +def n(v, a): + return a*((1 + 1/a)**v - 1) + +# This function takes +# - v: value in register +# - a: a scaling value for the logarithm based on Morris's paper +# It returns a new value for v +def increment(v, a): + delta = 1/(n(v + 1, a) - n(v, a)) + if random() <= delta: + return v + 1 + else: + return v + +#This simulates counting and takes +# - n_items: number of items to count and loop over +# - a: a scaling value for the logarithm based on Morris's paper +# It returns n(v,a), the approximate count +def approximate_count(n_items, a): + v = 0 + for i in range(1, n_items + 1): + v = increment(v, a) + return n(v, a) + +# This function takes +# - n_trials: the number of counting trials +# - n_items: the number of items to count to +# - a: a scaling value for the logarithm based on Morris's paper +# - threshold: the maximum percent error allowed +# It returns a true / false test value +def test_approximate_count(n_trials, n_items, a, threshold): + samples = 
[approximate_count(n_items, a) for i in range(1, n_trials + 1)]
+    avg = sum(samples)/n_trials
+
+    if abs((avg - n_items)/n_items) < threshold:
+        print("passed")
+
+print("testing 1,000, a = 30, 10% error")
+test_approximate_count(100, 1000, 30, 0.1)
+print("testing 12,345, a = 10, 10% error")
+test_approximate_count(100, 12345, 10, 0.1)
+print("testing 222,222, a = 0.5, 20% error")
+test_approximate_count(100, 222222, 0.5, 0.2)