Skip to content
This repository has been archived by the owner on Jul 16, 2024. It is now read-only.

Commit

Permalink
Merge pull request #18 from target/dev
Browse files Browse the repository at this point in the history
Merge dev to master
  • Loading branch information
DavidJBianco authored Oct 12, 2020
2 parents f85312d + b375a21 commit 04f48cb
Show file tree
Hide file tree
Showing 7 changed files with 181 additions and 5 deletions.
35 changes: 33 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -12,10 +12,12 @@ The `huntlib` module provides three major object classes as well as a few conven
* **data.read_json()**: Read one or more JSON files and return a single Pandas DataFrame
* **data.read_csv()**: Read one or more CSV files and return a single Pandas DataFrame
* **data.flatten()**: Recursively flatten dicts/lists into a single level dict. Useful for data normalization and creating DataFrames.
* **data.chunk()**: Break up a large list into smaller chunks for processing.
* **util.entropy()** / **util.entropy_per_byte()**: Calculate Shannon entropy
* **util.promptCreds()**: Prompt for login credentials in the terminal or from within a Jupyter notebook.
* **util.edit_distance()**: Calculate how "different" two strings are from each other
* **util.benfords()**: Determine whether a given collection of numbers obeys Benford's Law
* **util.edit_distance()**: Calculate how "different" two strings are from each other.
* **util.benfords()**: Determine whether a given collection of numbers obeys Benford's Law.
* **util.punctuation_pattern()**: Return only the non-alphanumeric characters from a given string or collection of strings.

## Library-Wide Configuration
Beginning with `v0.5.0`, `huntlib` now provides a library-wide configuration file, `~/.huntlibrc` allowing you to set certain runtime defaults. Consult the file `huntlibrc-sample` in this repo for more information.
Expand Down Expand Up @@ -384,6 +386,19 @@ A more complex example:
>>> flatten([{'a': 'a', 'b': 'b'}, {'a': 'a1', 'c': 'c'}])
{'0.a': 'a', '0.b': 'b', '1.a': 'a1', '1.c': 'c'}

### Breaking a long list-like object into smaller chunks
Given a list-like object, divide into chunks of `size` and return those as a generator. If the length of the sequence is not evenly divisible by the size, the final chunk will contain however many items remain.

>>> l = list(range(26))
>>> for i in chunk(l, size=5):
... print(i)
[0, 1, 2, 3, 4]
[5, 6, 7, 8, 9]
[10, 11, 12, 13, 14]
[15, 16, 17, 18, 19]
[20, 21, 22, 23, 24]
[25]

## Util Module

The `huntlib.util` modules contains miscellaneous functions that don't fit anywhere else, but are nevertheless still useful.
Expand Down Expand Up @@ -499,3 +514,19 @@ Here the input is a set of random numbers, which do not conform to Benford's Law
9 0.121
Name: digits, dtype: float64)
```

### Generate punctuation patterns from strings

For certain types of log analysis, the contents of the individual log messages is not as important as the format of the message itself. This often results in the need to examine the pattern of punctuation (non-alphanumeric characters). To facilitate this, the `punctuation_pattern()` function accepts a single string or a list-like collection of strings and returns *just* the non-alphanumeric characters.

```python
>>> >>> s = '192.168.1.1 - - [10/Oct/2020:12:32:27 +0000] "GET /some/web/app?param=test&param2=another_test" 200 9987'
>>> punctuation_pattern(s)
'..._-_-_[//:::_+]_"_///?=&=_"__'

>>> l = [s, "Another example. This time, of a list of strings!"]
>>> punctuation_pattern(l)
0 ..._-_-_[//:::_+]_"_///?=&=_"__
1 _.__,_____!
Name: punct, dtype: object
```
15 changes: 15 additions & 0 deletions huntlib/data.py
Original file line number Diff line number Diff line change
Expand Up @@ -113,3 +113,18 @@ def _flatten(obj, keypath='', sep='.'):
yield keypath, obj

return dict(_flatten(obj=obj, sep=sep))

def chunk(sequence, size=10):
'''
Given a sequence-like object, divide into chunks of `size`
and return those as a generator. If the length of the sequence
is not evenly divisible by the size, the final chunk will
contain however many items remain.
:param sequence: The sequence to chunk up
:param size: The chunk size
'''

for chunk in [sequence[i:i+size] for i in range(0, len(sequence), size)]:
yield chunk

51 changes: 51 additions & 0 deletions huntlib/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
import sys
import platform
import multiprocessing
import re

import pandas as pd
import numpy as np
Expand Down Expand Up @@ -192,3 +193,53 @@ def _first_digit(i: float):
system_type = platform.system()
if system_type == "Darwin":
multiprocessing.set_start_method('fork')

def punctuation_pattern(strings, escape_quotes=False):
'''
Return only the non-alphanumeric characters in the input string(s).
White spaces in the input will be translated into underscore characters
in the output.
:param strings: The input string(s) to process.
:type strings: A single string or a list-like object of strings (e.g. a pandas Series)
:Return Value:
If the input is a single string, the output will also be a single string. Otherwise
the output will be a pandas Series of results in the same order as the input strings.
'''

def _get_punct_pattern(s: str) -> str:

res = re.sub(
'\s',
'_',
re.sub(
'[a-zA-Z0-9]',
'',
s
)
)

if escape_quotes:
res = re.sub(
'([\'\"])',
r'\\\1',
res
)

return res

if isinstance(strings, str):
strings = [strings]
elif not is_list_like(strings):
raise TypeError(f'The argument must be a string or list-like of strings, not type {type(strings)}.')

strings = pd.DataFrame(strings, columns=['strings'])
strings['punct'] = strings['strings'].apply(_get_punct_pattern)

res = strings['punct']

# If there's only one result, it was because we passed a single string,
# just return a single result. Otherwise return all results.
return res[0] if res.shape[0] == 1 else res

2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
long_description = fh.read()

setup(name='huntlib',
version='0.5.0',
version='0.5.1',
description='A Python library to help with some common threat hunting data analysis operations',
long_description=long_description,
long_description_content_type="text/markdown",
Expand Down
25 changes: 25 additions & 0 deletions tests/test_chunk.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
#!/usr/bin/env python

from huntlib.data import chunk
import numpy as np

from unittest import TestCase

class TestChunk(TestCase):
def test_chunk(self):
l = list(np.random.randint(10, size=100))

res = list(chunk(l))

# There should be 10 equal chunks with the default chunk size
self.assertEqual(len(res), 10)

# All chunks should be size 10
self.assertEqual(len(res[0]), 10)

# Rechunk with an odd size that won't result in equal chunks
res = list(chunk(l, size=9))

self.assertEqual(len(res), 12)
self.assertEqual(len(res[0]), 9)
self.assertEqual(len(res[-1]), 1)
4 changes: 2 additions & 2 deletions tests/test_domaintools.py
Original file line number Diff line number Diff line change
Expand Up @@ -176,9 +176,9 @@ def test_enrich(self):

enriched_df = self._handle.enrich(df, column='domain')

self.assertEqual(
self.assertGreater(
enriched_df.shape[1],
131,
10,
"Enriched DataFrame does not have the correct number of columns."
)

Expand Down
54 changes: 54 additions & 0 deletions tests/test_punctuation_pattern.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
#!/usr/bin/env python

from huntlib.util import punctuation_pattern
import pandas as pd
import numpy as np

from unittest import TestCase

class TestPunctuationPattern(TestCase):
_event_message_1 = '192.168.1.1 - - [10/Oct/2020:12:32:27 +0000] "GET /some/web/app?param=test&param2=another_test" 200 9987'
_event_message_2 = 'ERROR: Can\'t resolve "nosuchhost": No such host.'
_punct_pattern_1 = '..._-_-_[//:::_+]_\"_///?=&=_\"__'
_punct_pattern_2 = ':_\'__\"\":___.'

_expected_pattern_result = pd.Series([_punct_pattern_1, _punct_pattern_2])

def test_punctuation_pattern_single_string(self):
res = punctuation_pattern(self._event_message_1)

self.assertEqual(res, self._punct_pattern_1)

def test_punctuation_pattern_list(self):
res = punctuation_pattern(
[
self._event_message_1,
self._event_message_2
]
)

self.assertListEqual(list(res), list(self._expected_pattern_result))

def test_punctuation_pattern_series(self):
res = punctuation_pattern(
pd.Series(
[
self._event_message_1,
self._event_message_2
]
)
)

self.assertListEqual(list(res), list(self._expected_pattern_result))

def test_punctuation_pattern_escape_quotes(self):
res = punctuation_pattern("\'\"")
self.assertEqual(res, '\'\"')

res = punctuation_pattern("\'\"", escape_quotes=False)
self.assertEqual(res, '\'\"')

res = punctuation_pattern("\'\"", escape_quotes=True)
self.assertEqual(res, '\\\'\\\"')


0 comments on commit 04f48cb

Please sign in to comment.