From c8a40e321c8ca1ded2274b51b18c2d5e05291148 Mon Sep 17 00:00:00 2001 From: Ayman Lafaz Date: Wed, 6 Oct 2021 13:02:27 +0100 Subject: [PATCH] added approximate counting implementation in python (#866) * added approximate counting implementation in python * Update approximate_counting.py * updating approximate_counting.py * removed redundancies --- .../approximate_counting.md | 12 +++-- .../code/python/approximate_counting.py | 49 +++++++++++++++++++ 2 files changed, 56 insertions(+), 5 deletions(-) create mode 100644 contents/approximate_counting/code/python/approximate_counting.py diff --git a/contents/approximate_counting/approximate_counting.md b/contents/approximate_counting/approximate_counting.md index 917e79922..f63d7db43 100644 --- a/contents/approximate_counting/approximate_counting.md +++ b/contents/approximate_counting/approximate_counting.md @@ -127,7 +127,7 @@ Here is a table for the true count, approximate count, and percent error for 10, | 500,000 | 499,813.2 | 0.037 | | 1,000,000 | 999,466.0 | 0.053 | -Here, it seems that the percent error is 10 times higher when we count 10,000 items; however, +Here, it seems that the percent error is 10 times higher when we count 10,000 items; however, with these numbers, I could imagine some people reading this are thinking that we are splitting hairs. A 0.42% error is still really good, right? Right. @@ -200,7 +200,7 @@ To be clear, here is a table of several values that could be stored in a bitstri | $$00000100 = 4$$ | $$15$$ | | $$00010000 = 16$$ | $$65535$$ | | $$01000000 = 64$$ | $$1.85 \times 10^{19}$$ | -| $$10000000 = 128$$ | $$3.40 \times 10^{38}$$ | +| $$10000000 = 128$$ | $$3.40 \times 10^{38}$$ | | $$11111111 = 255$$ | $$5.79 \times 10^{76}$$ | This means that we can hold from $$0$$ to $$2^{255} - 1 \approx 5.79 \times 10^{76}$$ with 8 bits using this new method. 
@@ -250,7 +250,7 @@ In the next section, we will consider how to generalize this logarithmic method ## A slightly more general logarithm Let's start by considering the differences between base $$2$$ and base $$e$$. -For base $$e$$, +For base $$e$$, $$ \begin{align} @@ -283,14 +283,14 @@ Going one step further, we need to chose a specific base to a logarithm that wil $$ \begin{align} - v &= \frac{\log(1+n/a)}{\log(1+1/a)}. \\ + v &= \frac{\log(1+n/a)}{\log(1+1/a)}. \\ n_v &= a\left(\left(1+\frac{1}{a}\right)^v-1\right). \end{align} $$ Here, $$a$$ is an effective tuning parameter and sets the maximum count allowed by the bitstring and the expected error. The expression $$1+1/a$$ acts as a base for the logarithm and exponents and ensures that the first count of $$n=1$$ will also set the value $$v=1$$. -As an example, if the bitstring can be a maximum of 255 (for 8 bits) and we arbitrarily set +As an example, if the bitstring can be a maximum of 255 (for 8 bits) and we arbitrarily set $$a=30$$, then the highest possible count with this approach will be $$\approx 130,000$$, which was the number reported in Morris's paper. 
If we perform a few counting experiments, we find that this formula more closely tracks smaller numbers than before (when we were not using the logarithm): @@ -362,6 +362,8 @@ As we do not have any objects to count, we will instead simulate the counting wi [import, lang:"julia"](code/julia/approximate_counting.jl) {% sample lang="cpp" %} [import, lang:"cpp"](code/c++/approximate_counting.cpp) +{% sample lang="python" %} +[import, lang:"python"](code/python/approximate_counting.py) {% endmethod %} ### Bibliography diff --git a/contents/approximate_counting/code/python/approximate_counting.py b/contents/approximate_counting/code/python/approximate_counting.py new file mode 100644 index 000000000..eb31b2b24 --- /dev/null +++ b/contents/approximate_counting/code/python/approximate_counting.py @@ -0,0 +1,49 @@ +from random import random + +# This function takes +# - v: value in register +# - a: a scaling value for the logarithm based on Morris's paper +# It returns n(v,a), the approximate_count +def n(v, a): + return a*((1 + 1/a)**v - 1) + +# This function takes +# - v: value in register +# - a: a scaling value for the logarithm based on Morris's paper +# It returns a new value for v +def increment(v, a): + delta = 1/(n(v + 1, a) - n(v, a)) + if random() <= delta: + return v + 1 + else: + return v + +#This simulates counting and takes +# - n_items: number of items to count and loop over +# - a: a scaling value for the logarithm based on Morris's paper +# It returns n(v,a), the approximate count +def approximate_count(n_items, a): + v = 0 + for i in range(1, n_items + 1): + v = increment(v, a) + return n(v, a) + +# This function takes +# - n_trials: the number of counting trials +# - n_items: the number of items to count to +# - a: a scaling value for the logarithm based on Morris's paper +# - threshold: the maximum percent error allowed +# It returns a true / false test value +def test_approximate_count(n_trials, n_items, a, threshold): + samples = 
[approximate_count(n_items, a) for i in range(1, n_trials + 1)]
+    avg = sum(samples)/n_trials
+
+    if abs((avg - n_items)/n_items) < threshold:
+        print("passed")
+
+print("testing 1,000, a = 30, 10% error")
+test_approximate_count(100, 1000, 30, 0.1)
+print("testing 12,345, a = 10, 10% error")
+test_approximate_count(100, 12345, 10, 0.1)
+print("testing 222,222, a = 0.5, 20% error")
+test_approximate_count(100, 222222, 0.5, 0.2)