Merge pull request #18 from target/dev

Merge dev to master
target · Oct 12, 2020 · 04f48cb · 04f48cb
2 parents f85312d + b375a21
commit 04f48cb
Show file tree

Hide file tree

Showing 7 changed files with 181 additions and 5 deletions.
diff --git a/README.md b/README.md
@@ -12,10 +12,12 @@ The `huntlib` module provides three major object classes as well as a few conven
 * **data.read_json()**: Read one or more JSON files and return a single Pandas DataFrame
 * **data.read_csv()**: Read one or more CSV files and return a single Pandas DataFrame
 * **data.flatten()**: Recursively flatten dicts/lists into a single level dict. Useful for data normalization and creating DataFrames.
+* **data.chunk()**: Break up a large list into smaller chunks for processing.
 * **util.entropy()** / **util.entropy_per_byte()**: Calculate Shannon entropy
 * **util.promptCreds()**: Prompt for login credentials in the terminal or from within a Jupyter notebook.
-* **util.edit_distance()**: Calculate how "different" two strings are from each other
-* **util.benfords()**: Determine whether a given collection of numbers obeys Benford's Law
+* **util.edit_distance()**: Calculate how "different" two strings are from each other.
+* **util.benfords()**: Determine whether a given collection of numbers obeys Benford's Law.
+* **util.punctuation_pattern()**: Return only the non-alphanumeric characters from a given string or collection of strings.
 
 ## Library-Wide Configuration
 Beginning with `v0.5.0`, `huntlib` now provides a library-wide configuration file, `~/.huntlibrc` allowing you to set certain runtime defaults.  Consult the file `huntlibrc-sample` in this repo for more information.
@@ -384,6 +386,19 @@ A more complex example:
     >>> flatten([{'a': 'a', 'b': 'b'}, {'a': 'a1', 'c': 'c'}])
     {'0.a': 'a', '0.b': 'b', '1.a': 'a1', '1.c': 'c'}
 
+### Breaking a long list-like object into smaller chunks
+Given a list-like object, divide it into chunks of `size` and return those as a generator. If the length of the sequence is not evenly divisible by the size, the final chunk will contain however many items remain.
+
+    >>> l = list(range(26))
+    >>> for i in chunk(l, size=5):
+    ...   print(i)
+    [0, 1, 2, 3, 4]
+    [5, 6, 7, 8, 9] 
+    [10, 11, 12, 13, 14]
+    [15, 16, 17, 18, 19]
+    [20, 21, 22, 23, 24]
+    [25]
+
 ## Util Module 
 
 The `huntlib.util` module contains miscellaneous functions that don't fit anywhere else, but are nevertheless still useful.
@@ -499,3 +514,19 @@ Here the input is a set of random numbers, which do not conform to Benford's Law
 9    0.121
 Name: digits, dtype: float64)
 ```
+
+### Generate punctuation patterns from strings
+
+For certain types of log analysis, the contents of the individual log messages are not as important as the format of the message itself. This often results in the need to examine the pattern of punctuation (non-alphanumeric characters). To facilitate this, the `punctuation_pattern()` function accepts a single string or a list-like collection of strings and returns *just* the non-alphanumeric characters.
+
+```python
+>>> s = '192.168.1.1 - - [10/Oct/2020:12:32:27 +0000] "GET /some/web/app?param=test&param2=another_test" 200 9987'
+>>> punctuation_pattern(s)
+'..._-_-_[//:::_+]_"_///?=&=_"__'
+
+>>> l = [s, "Another example. This time, of a list of strings!"]
+>>> punctuation_pattern(l)
+0    ..._-_-_[//:::_+]_"_///?=&=_"__
+1                        _.__,_____!
+Name: punct, dtype: object
+```
diff --git a/huntlib/data.py b/huntlib/data.py
@@ -113,3 +113,18 @@ def _flatten(obj, keypath='', sep='.'):
             yield keypath, obj
 
     return dict(_flatten(obj=obj, sep=sep))
+
+def chunk(sequence, size=10):
+    '''
+    Given a sequence-like object, divide into chunks of `size`
+    and return those as a generator. If the length of the sequence
+    is not evenly divisible by the size, the final chunk will 
+    contain however many items remain.
+
+    :param sequence: The sequence to chunk up
+    :param size: The chunk size 
+    '''
+
+    for chunk in [sequence[i:i+size] for i in range(0, len(sequence), size)]:
+        yield chunk
+
diff --git a/huntlib/util.py b/huntlib/util.py
@@ -10,6 +10,7 @@
 import sys
 import platform
 import multiprocessing
+import re
 
 import pandas as pd
 import numpy as np
@@ -192,3 +193,53 @@ def _first_digit(i: float):
 system_type = platform.system()
 if system_type == "Darwin":
     multiprocessing.set_start_method('fork')
+
+def punctuation_pattern(strings, escape_quotes=False):
+    '''
+    Return only the non-alphanumeric characters in the input string(s).  
+    White spaces in the input will be translated into underscore characters
+    in the output. 
+
+    :param strings: The input string(s) to process.
+    :type strings: A single string or a list-like object of strings (e.g. a pandas Series)
+
+    :Return Value:
+    If the input is a single string, the output will also be a single string.  Otherwise
+    the output will be a pandas Series of results in the same order as the input strings.
+    '''
+
+    def _get_punct_pattern(s: str) -> str:
+
+        res = re.sub(
+            '\s',
+            '_',
+            re.sub(
+                '[a-zA-Z0-9]',
+                '',
+                s
+            )
+        )
+
+        if escape_quotes:
+            res = re.sub(
+                '([\'\"])',
+                r'\\\1',
+                res
+            )
+
+        return res 
+
+    if isinstance(strings, str):
+        strings = [strings]
+    elif not is_list_like(strings):
+        raise TypeError(f'The argument must be a string or list-like of strings, not type {type(strings)}.')
+
+    strings = pd.DataFrame(strings, columns=['strings'])
+    strings['punct'] = strings['strings'].apply(_get_punct_pattern)
+
+    res = strings['punct']
+
+    # If there's only one result, it was because we passed a single string,
+    # just return a single result. Otherwise return all results.
+    return res[0] if res.shape[0] == 1 else res 
+
diff --git a/setup.py b/setup.py
@@ -6,7 +6,7 @@
     long_description = fh.read()
 
 setup(name='huntlib',
-      version='0.5.0',
+      version='0.5.1',
       description='A Python library to help with some common threat hunting data analysis operations',
       long_description=long_description,
       long_description_content_type="text/markdown",

diff --git a/tests/test_chunk.py b/tests/test_chunk.py
@@ -0,0 +1,25 @@
+#!/usr/bin/env python
+
+from huntlib.data import chunk
+import numpy as np
+
+from unittest import TestCase
+
+class TestChunk(TestCase):
+    def test_chunk(self):
+        l = list(np.random.randint(10, size=100))
+
+        res = list(chunk(l))
+
+        # There should be 10 equal chunks with the default chunk size
+        self.assertEqual(len(res), 10)
+
+        # All chunks should be size 10
+        self.assertEqual(len(res[0]), 10)
+
+        # Rechunk with an odd size that won't result in equal chunks
+        res = list(chunk(l, size=9))
+
+        self.assertEqual(len(res), 12)
+        self.assertEqual(len(res[0]), 9)
+        self.assertEqual(len(res[-1]), 1)
diff --git a/tests/test_domaintools.py b/tests/test_domaintools.py
@@ -176,9 +176,9 @@ def test_enrich(self):
 
         enriched_df = self._handle.enrich(df, column='domain')
 
-        self.assertEqual(
+        self.assertGreater(
             enriched_df.shape[1],
-            131,
+            10,
             "Enriched DataFrame does not have the correct number of columns."
         )
 

diff --git a/tests/test_punctuation_pattern.py b/tests/test_punctuation_pattern.py
@@ -0,0 +1,54 @@
+#!/usr/bin/env python
+
+from huntlib.util import punctuation_pattern
+import pandas as pd
+import numpy as np 
+
+from unittest import TestCase
+
+class TestPunctuationPattern(TestCase):
+    _event_message_1 = '192.168.1.1 - - [10/Oct/2020:12:32:27 +0000] "GET /some/web/app?param=test&param2=another_test" 200 9987'
+    _event_message_2 = 'ERROR: Can\'t resolve "nosuchhost": No such host.'
+    _punct_pattern_1 = '..._-_-_[//:::_+]_\"_///?=&=_\"__'
+    _punct_pattern_2 = ':_\'__\"\":___.'
+
+    _expected_pattern_result = pd.Series([_punct_pattern_1, _punct_pattern_2])
+
+    def test_punctuation_pattern_single_string(self):
+        res = punctuation_pattern(self._event_message_1)
+
+        self.assertEqual(res, self._punct_pattern_1) 
+
+    def test_punctuation_pattern_list(self):
+        res = punctuation_pattern(
+            [
+                self._event_message_1,
+                self._event_message_2
+            ]
+        )
+
+        self.assertListEqual(list(res), list(self._expected_pattern_result))
+
+    def test_punctuation_pattern_series(self):
+        res = punctuation_pattern(
+            pd.Series(
+                [
+                    self._event_message_1,
+                    self._event_message_2
+                ]
+            )
+        )
+
+        self.assertListEqual(list(res), list(self._expected_pattern_result))
+
+    def test_punctuation_pattern_escape_quotes(self):
+        res = punctuation_pattern("\'\"")
+        self.assertEqual(res, '\'\"')
+
+        res = punctuation_pattern("\'\"", escape_quotes=False)
+        self.assertEqual(res, '\'\"')
+
+        res = punctuation_pattern("\'\"", escape_quotes=True)
+        self.assertEqual(res, '\\\'\\\"')
+
+