Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Added Rabin–Karp algorithm #413

Merged
merged 8 commits into from
Oct 12, 2021
Merged
Show file tree
Hide file tree
Changes from 7 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
42 changes: 42 additions & 0 deletions pydatastructs/strings/algorithms.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@
'find'
]

# Parameters of the polynomial hash used by the Rabin-Karp helpers:
# PRIME_NUMBER is the default base and MOD is the default modulus.
PRIME_NUMBER = 257
MOD = 1000000007

def find(text, query, algorithm):
"""
Finds occurrence of a query string within the text string.
Expand All @@ -22,6 +24,7 @@ def find(text, query, algorithm):
Currently the following algorithms are
supported,
'kmp' -> Knuth-Morris-Pratt as given in [1].
'rabin_karp' -> Rabin–Karp algorithm as given in [2].

Returns
=======
Expand Down Expand Up @@ -52,6 +55,7 @@ def find(text, query, algorithm):
==========

.. [1] https://en.wikipedia.org/wiki/Knuth–Morris–Pratt_algorithm
.. [2] https://en.wikipedia.org/wiki/Rabin%E2%80%93Karp_algorithm
"""
import pydatastructs.strings.algorithms as algorithms
func = "_" + algorithm
Expand All @@ -64,6 +68,8 @@ def find(text, query, algorithm):


def _knuth_morris_pratt(text, query):
    """
    Matches ``query`` against ``text`` with the Knuth-Morris-Pratt
    algorithm.

    When either argument has zero length an empty
    ``DynamicOneDimensionalArray`` of ``int`` is returned right away.
    Otherwise the table produced by ``_build_kmp_table(query)`` is handed
    to ``_do_match`` and its result is returned unchanged.
    """
    # Guard clause: nothing can match if the text or the query is empty.
    if not len(text) or not len(query):
        return DynamicOneDimensionalArray(int, 0)
    return _do_match(text, query, _build_kmp_table(query))

Expand Down Expand Up @@ -107,3 +113,39 @@ def _do_match(string, query, kmp_table):
k = k + 1

return positions

def _p_pow(length, p=PRIME_NUMBER, m=MOD):
    """
    Builds a table of successive powers of ``p`` reduced modulo ``m``.

    Parameters
    ==========

    length: int
        Number of entries in the table. Index 0 is written
        unconditionally, so callers are expected to pass a
        positive value.
    p: int
        Base whose powers are tabulated.
    m: int
        Modulus applied after every multiplication.

    Returns
    =======

    powers: OneDimensionalArray
        Entry 0 is 1 and every later entry ``k`` equals
        ``(powers[k - 1] * p) % m``.
    """
    powers = OneDimensionalArray(int, length)
    powers[0] = 1
    for exponent in range(1, length):
        # Each power is derived from the one stored just before it.
        previous = powers[exponent - 1]
        powers[exponent] = (previous * p) % m
    return powers

def _hash_str(string, p=PRIME_NUMBER, m=MOD):
    """
    Computes the polynomial hash of a string.

    The value is the sum of ``ord(string[i]) * p**i`` over all
    indices ``i``, reduced modulo ``m``.

    Parameters
    ==========

    string: str
        The string to be hashed.
    p: int
        Base of the polynomial.
    m: int
        Modulus applied to the running sum and to the powers of ``p``.

    Returns
    =======

    hash_value: int
        The hash of ``string``; ``0`` when ``string`` is empty.
    """
    hash_value = 0
    # Carry the current power of p along instead of building a whole table
    # of powers up front: each power is needed exactly once, in order, and
    # this also makes the empty string a valid input.
    power = 1
    for char in string:
        hash_value = (hash_value + ord(char) * power) % m
        power = (power * p) % m
    return hash_value

def _rabin_karp(text, query):
    """
    Finds every position at which ``query`` occurs in ``text`` using
    the Rabin-Karp algorithm.

    Prefix hashes of ``text`` are computed once, so the hash of any
    window is obtained in constant time. A window is reported only if
    its hash agrees with the hash of ``query`` and a direct comparison
    of the characters confirms the match.

    Parameters
    ==========

    text: str
        The string in which the query is searched.
    query: str
        The string to be searched for.

    Returns
    =======

    positions: DynamicOneDimensionalArray
        Starting indices of the occurrences of ``query`` in ``text``,
        in increasing order. Empty if either string is empty or
        ``query`` is longer than ``text``.
    """
    t = len(text)
    q = len(query)
    positions = DynamicOneDimensionalArray(int, 0)
    # With q > t there is no window to inspect, so skip the hashing work.
    if q == 0 or t == 0 or q > t:
        return positions

    query_hash = _hash_str(query)
    # text_hash[i] is the hash of text[:i]; text_hash[0] stays 0.
    text_hash = OneDimensionalArray(int, t + 1)
    text_hash.fill(0)
    p_pow = _p_pow(t)

    for i, char in enumerate(text):
        text_hash[i + 1] = (text_hash[i] + ord(char) * p_pow[i]) % MOD

    for i in range(t - q + 1):
        # The difference of two prefix hashes is the hash of text[i:i + q]
        # scaled by p**i, so the query hash is scaled the same way
        # before the two are compared.
        curr_hash = (text_hash[i + q] + MOD - text_hash[i]) % MOD
        if curr_hash != (query_hash * p_pow[i]) % MOD:
            continue
        # Equal hashes do not imply equal strings: distinct strings can
        # collide (e.g. "\x00\x01" and "\u0101\x00" share a hash because
        # the base is smaller than some code points). Confirm the match
        # on the actual characters before reporting it.
        if text[i:i + q] == query:
            positions.append(i)

    return positions

czgdp1807 marked this conversation as resolved.
Show resolved Hide resolved
10 changes: 7 additions & 3 deletions pydatastructs/strings/tests/test_algorithms.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@
def test_kmp():
    """
    Runs the shared string matching checks with the 'kmp' algorithm.
    """
    _test_common_string_matching(algorithm='kmp')

def test_rka():
    """
    Runs the shared string matching checks with the 'rabin_karp'
    algorithm.
    """
    _test_common_string_matching(algorithm='rabin_karp')

def _test_common_string_matching(algorithm):
true_text_pattern_dictionary = {
Expand All @@ -26,7 +28,9 @@ def _test_common_string_matching(algorithm):
"Knuth-Morris-Pratt": "-Pratt-",
"abcabcabcabdabcabdabcabca": "qwertyuiopzxcvbnm",
"aefcdfaecdaefaefcdaefeaefcdcdeae": "cdaefaefe",
"fullstringmatch": "fullstrinmatch"
"fullstringmatch": "fullstrinmatch",
"abc": "",
"": "abc"
}

for test_case_key in false_text_pattern_dictionary:
Expand All @@ -52,13 +56,13 @@ def gen_random_string(length):
if rand_str != query:
freq += 1
text += query + rand_str + query
positions = find(text, query, algorithm="kmp")
positions = find(text, query, algorithm)
assert positions._num == num_times * 2
for i in range(positions._last_pos_filled):
p = positions[i]
assert text[p:p + len(query)] == query

text = gen_random_string(len(query))
if text != query:
positions = find(text, query, algorithm="kmp")
positions = find(text, query, algorithm)
assert positions.size == 0