generated from databricks-industry-solutions/industry-solutions-blueprints
-
Notifications
You must be signed in to change notification settings - Fork 6
/
Copy pathanalytics_02_investigation.py
281 lines (246 loc) · 9.25 KB
/
analytics_02_investigation.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
# Databricks notebook source
# MAGIC %md
# MAGIC
# MAGIC # Context Graphs for Investigation Workflows
# MAGIC
# MAGIC ## Use cases
# MAGIC
# MAGIC * An enterprise might have multiple IAM/SSO, AD domains, authn systems including local unix authn. An analyst investigating an incident will need to manually resolve these identities. Context graphs provide `same_as` edges that capture the results from any entity resolution package and these edges can be traversed like any other relationships.
# MAGIC * Impact analysis (blast radius)
# MAGIC * Given a compromised user account, what apps, ip addresses etc. has this account touched across the enterprise?
# MAGIC * Given a compromised app or ip address, who are the users being affected?
# MAGIC * Investigate lateral movement as the attacker steals credentials, escalate privileges and move from one system to another (https://www.crowdstrike.com/cybersecurity-101/lateral-movement/)
# MAGIC * Investigate insider threat by exploring relationships with HR data sources
# COMMAND ----------
dbutils.widgets.removeAll()
#dbutils.widgets.text("travel_node", "", "travel along node (id or name): ")
#travel_node = dbutils.widgets.get("travel_node")
# COMMAND ----------
# MAGIC %pip install bokeh
# COMMAND ----------
import networkx as nx
from bokeh.io import output_notebook, show, save
from bokeh.models import Range1d, Circle, ColumnDataSource, MultiLine, HoverTool
from bokeh.plotting import figure
from bokeh.plotting import from_networkx
from bokeh.embed import file_html
from bokeh.resources import CDN
def get_node_color(node_type):
if node_type[:4] == "user":
return "dimgray"
if node_type == "suricata-alert":
return "red"
if node_type == "ipAddress":
return "#F39C12"
if node_type == "hash":
return "#F9E79F"
if node_type == "fqdn":
return "#F1C40F"
if node_type == "app":
return "dodgerblue"
return "lightblue"
def stringify(col):
if col is None:
return ""
return str(col)
def add_edges_from_df(G, edges_df):
n=0
for (src, time_bkt, sub_type, sub_id, sub_name, pred, pred_status, obj_type, obj_id, obj_name, first_seen, last_seen, cnt) in edges_df.collect():
nodes = [(sub_id, {"type": sub_type, "name": sub_name, "color": get_node_color(sub_type)}),
(obj_id, {"type": obj_type, "name": obj_name, "color": get_node_color(obj_type)})]
G.add_nodes_from(nodes)
edges = [(sub_id, obj_id, {"name": pred, "status": stringify(pred_status), "cnt": stringify(cnt), "firstseen": stringify(first_seen), "lastseen": stringify(last_seen)})]
G.add_edges_from(edges)
n+=1
return n
# Renders the graph using Bokeh library
def displayGraph(G, title):
#Create a plot — set dimensions, toolbar, and title
plot = figure(tools="pan,wheel_zoom,save,reset", active_scroll='wheel_zoom',
x_range=Range1d(-20.1, 20.1), y_range=Range1d(-20.1, 20.1), title=title)
#Create a network graph object with spring layout
# https://networkx.github.io/documentation/networkx-1.9/reference/generated/networkx.drawing.layout.spring_layout.html
network_graph = from_networkx(G, nx.spring_layout, scale=10, center=(0, 0))
#Set node size and color
network_graph.node_renderer.glyph = Circle(size=20, fill_color='color')
#Set edge opacity and width
network_graph.edge_renderer.glyph = MultiLine(line_alpha=0.5, line_width=1)
hover_nodes = HoverTool(
tooltips=[("id", "@index"), ("name", "@name"), ("type", "@type")],
renderers=[network_graph.node_renderer]
)
hover_edges = HoverTool(
tooltips=[("name","@name"),("status","@status"), ("cnt", "@cnt"), ("firstseen", "@firstseen"), ("lastseen", "@lastseen")],
renderers=[network_graph.edge_renderer], line_policy="interp"
)
#Add network graph to the plot
plot.renderers.append(network_graph)
plot.add_tools(hover_edges, hover_nodes)
html = file_html(plot, CDN)
displayHTML(html)
# Generate the sql query for the graph search/navigation using a template
# Incorporates the same_as edges in the search
def generate_sql_query(node_filter, time_frame, same_as=True):
if same_as:
return f"""
WITH sub_matches AS (
select
*
from
{{tgt_db_name}}.v_edges_day
where
time_bkt = '{time_frame}'
AND ( sub_id = '{node_filter}'
or sub_name = '{node_filter}')
),
sub_same_as AS (
SELECT
s1.src,
s1.time_bkt,
s1.sub_type,
s1.sub_id,
s1.sub_name,
s1.pred,
s1.pred_status,
s1.obj_type,
s1.obj_id,
s1.obj_name,
NULL as first_seen,
NULL as last_seen,
NULL as cnt
FROM
sub_matches AS e1
JOIN {{tgt_db_name}}.same_as AS s1 ON e1.sub_id = s1.sub_id
),
obj_matches AS (
select
*
from
{{tgt_db_name}}.v_edges_day
where
time_bkt = '{time_frame}'
AND ( obj_id = '{node_filter}'
or obj_name = '{node_filter}' )
),
obj_same_as AS (
SELECT
s3.src,
s3.time_bkt,
s3.sub_type,
s3.sub_id,
s3.sub_name,
s3.pred,
s3.pred_status,
s3.obj_type,
s3.obj_id,
s3.obj_name,
NULL as first_seen,
NULL as last_seen,
NULL as cnt
FROM
obj_matches AS e3
JOIN {{tgt_db_name}}.same_as AS s3 ON e3.obj_id = s3.sub_id
)
SELECT
*
FROM
sub_matches
UNION
SELECT
e2.*
FROM
sub_same_as AS s2
JOIN {{tgt_db_name}}.v_edges_day AS e2 ON s2.obj_id = e2.sub_id AND e2.time_bkt = '{time_frame}'
UNION
SELECT
*
FROM
obj_matches
UNION
SELECT
e4.*
FROM
obj_same_as AS s4
JOIN {{tgt_db_name}}.v_edges_day AS e4 ON s4.obj_id = e4.obj_id AND e4.time_bkt = '{time_frame}'
UNION
SELECT
*
FROM
sub_same_as
UNION
SELECT
*
FROM
obj_same_as
"""
return """
select
*
from {{tgt_db_name}}.v_edges
where time_bkt = '{time_frame}'
AND ( sub_id = '{node_filter}'
OR sub_name = '{node_filter}'
OR obj_id = '{node_filter}'
OR obj_name = '{node_filter}'
OR sub_name ilike '%{node_filter}%'
OR obj_name ilike '%{node_filter}%'
)
"""
# COMMAND ----------
# DBTITLE 1,Reset/Initializes the graph visualization
import networkx as nx
G = nx.Graph()
g_df = None
# COMMAND ----------
# MAGIC %md
# MAGIC
# MAGIC ## Demo Setup
# MAGIC
# MAGIC You are an incident responder or an analyst investigating an advanced persistent threat. The following command simulates an investigation process that builds an investigation graph iteratively through a series of edge queries. You will build the investigation graph by changing the search parameter and executing the following command. To reset the investigation graph, you will need to execute the command prior to this cell.
# MAGIC
# MAGIC The output of the following command consists of three components:
# MAGIC
# MAGIC 1. The table of edges from the most recent search query
# MAGIC 2. The graph visualization
# MAGIC 3. The cumulative table of edges from the search history since the last reset.
# MAGIC
# MAGIC ## Demo Story
# MAGIC
# MAGIC * Suppose `maria.cole@chang-fisher.com` has been compromised. Let's pull back all the edges incident on that person.
# MAGIC * You can hover over the edges or nodes in the graph visualization to see details.
# MAGIC * It turns out that maria has an old account from a company Summers.info that the Chang-Fisher company has acquired.
# MAGIC * Moreover, maria got married and changed her lastname.
# MAGIC * Note all the IP addresses, apps, and resources that Maria has signed in from/to.
# MAGIC * This is an example of a `same_as` edge produced by an entity resolution package.
# MAGIC * Suppose we know from EDR alerts that `megan.chang@chang-fisher.com` is also compromised. Let's pull back her edges.
# MAGIC * Observe that there are common IP addresses, apps and resources shared by the two compromised accounts. Any of these could be the pivot point the actor used to harvest these accounts and credentials.
# MAGIC * It is likely now that all the IP addresses Megan is associated with has been compromised. Let's expand on the IP address `165.225.221.4`.
# MAGIC * Observe that Kyle Schultz also signs in from that IP address, so his account might be compromised too.
# MAGIC * Let's pull back all edges that Kyle is associated with.
# MAGIC * Now let your imagination flow a bit and imagine how we can use graph algorithms like Breadth First Search and Strongly connected components to perform automated impact analysis.
# COMMAND ----------
# DBTITLE 1,Navigate and build the investigation graph as part of the investigation process
# this query needs to be highly selective, so that it only brings back a small subset of edges to render
# it is foolish to try and render a 1M node graph
# note that dbutils.widget parametrization is an option that was tried, but the following is easier and more stable
time_frame = "2022-07-20T00:00:00.000+0000"
travel_node = "maria.cole@chang-fisher.com"
#travel_node = "megan.chang@chang-fisher.com"
#travel_node = "165.225.221.4"
#travel_node = "199.106.8.190"
#travel_node = "kyle.schultz@chang-fisher.com"
use_same_as = True
sql_str = generate_sql_query( travel_node, time_frame, use_same_as )
edges_df = spark.sql(sql_str)
if g_df is None:
g_df = edges_df
else:
g_df = g_df.union(edges_df)
# display edges from most recent query
display(edges_df)
add_edges_from_df(G, edges_df)
# display the investigation graph so far
displayGraph(G, "Identity Context")
# display the edges of the entire search history since last reset
display(g_df)
# COMMAND ----------