-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathprofiling_summary.py
153 lines (126 loc) · 5.39 KB
/
profiling_summary.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
import typing
import streamlit as st
import testgen.ui.services.database_service as db
import testgen.ui.services.form_service as fm
import testgen.ui.services.query_service as dq
import testgen.ui.services.toolbar_service as tb
from testgen.common import date_service
from testgen.ui.navigation.menu import MenuItem
from testgen.ui.navigation.page import Page
from testgen.ui.session import session
# Width value passed to fm.render_html_list when show_record_detail renders the
# selected run's detail list (unit is presumably pixels -- TODO confirm in form_service).
FORM_DATA_WIDTH = 400
class DataProfilingPage(Page):
    """Page registered under the "profiling" path that lists profiling runs.

    The page shows a grid of profiling runs for the project stored in the
    Streamlit session, optionally filtered by table group, plus drill-down
    buttons and a detail panel for the selected run.
    """

    path = "profiling"
    # Each guard returns the session's authentication status when it is truthy,
    # otherwise the string "login" (presumably the page to redirect to -- verify
    # against the Page base class).
    can_activate: typing.ClassVar = [
        lambda: session.authentication_status or "login",
    ]
    menu_item = MenuItem(icon="problem", label="Data Profiling", order=1)

    def render(self) -> None:
        """Draw the page header, the run grid and the selection-dependent widgets."""
        fm.render_page_header(
            "Profiling Runs",
            "https://docs.datakitchen.io/article/dataops-testgen-help/investigate-profiling",
            lst_breadcrumbs=[
                {"label": "Overview", "path": "overview"},
                {"label": "Data Profiling", "path": None},
            ],
            boo_show_refresh=True,
        )

        # Nothing can be listed until a project has been chosen elsewhere.
        if "project" not in st.session_state:
            st.write("Select a Project from the Overview page.")
            return

        project_code = st.session_state["project"]

        # Setup Toolbar
        toolbar = tb.ToolBar(3, 2, 0, None)

        with toolbar.long_slots[0]:
            # Table Groups selection -- optional criterion
            table_group_id = fm.render_select(
                "Table Group",
                get_db_table_group_choices(project_code),
                "table_groups_name",
                "id",
                boo_required=False,
                str_default=None,
            )

        runs, grid_columns = get_db_profiling_runs(project_code, table_group_id)

        # The return value is ignored, so the helper presumably adjusts the
        # "start_time" column in place -- verify against date_service.
        date_service.accommodate_dataframe_to_timezone(runs, st.session_state, ["start_time"])

        selected_rows = fm.render_grid_select(runs, grid_columns)

        # The buttons are always rendered; they are disabled without a selection.
        open_drill_downs(selected_rows, toolbar.short_slots)

        if not selected_rows:
            st.markdown(":orange[Select a run to see more information.]")
            return

        show_record_detail(selected_rows[0])
        st.markdown(":orange[Click a button to view profiling outcomes for the selected run.]")
@st.cache_data(show_spinner=False)
def get_db_table_group_choices(str_project_code):
    """Return the table-group lookup result for a project.

    The database schema is read from ``st.session_state["dbschema"]`` and the
    query itself is delegated to ``dq.run_table_groups_lookup_query``. Results
    are cached by ``st.cache_data`` without showing a spinner.

    Args:
        str_project_code: Project code forwarded to the lookup query.

    Returns:
        Whatever ``dq.run_table_groups_lookup_query`` returns (the caller feeds
        it to a select widget keyed on "table_groups_name" / "id").
    """
    return dq.run_table_groups_lookup_query(st.session_state["dbschema"], str_project_code)
@st.cache_data(show_spinner="Retrieving Data")
def get_db_profiling_runs(str_project_code, str_tg=None):
    """Fetch the profiling runs of a project, most recent first.

    The schema is read from ``st.session_state["dbschema"]`` and the rows come
    from ``{schema}.v_profiling_runs``. Results are cached by ``st.cache_data``.

    Args:
        str_project_code: Project code matched against ``project_code``.
        str_tg: Optional table group id. When truthy, rows are additionally
            restricted to that ``table_groups_id``.

    Returns:
        A ``(data, show_columns)`` tuple: the result of ``db.retrieve_data``
        for the query, and the list of column names meant for the grid.
    """
    str_schema = st.session_state["dbschema"]

    # Both filter values end up inside single-quoted SQL literals. Doubling any
    # embedded single quote keeps an apostrophe from terminating the literal
    # early (which would break the statement or alter its meaning).
    # NOTE(review): prefer bound parameters if db.retrieve_data supports them.
    str_project_literal = _escape_sql_literal(str_project_code)
    str_tg_condition = f" AND table_groups_id = '{_escape_sql_literal(str_tg)}' " if str_tg else ""

    # Runs still marked 'Running' whose start_time is before CURRENT_DATE - 1
    # are reported with status 'Error'.
    str_sql = f"""
        SELECT project_code, connection_name,
               connection_id::VARCHAR,
               table_groups_id::VARCHAR,
               profiling_run_id::VARCHAR,
               table_groups_name, schema_name, start_time, duration,
               CASE
                 WHEN status = 'Running' AND start_time < CURRENT_DATE - 1 THEN 'Error'
                 ELSE status
               END as status,
               COALESCE(log_message, '(No Errors)') as log_message,
               table_ct, column_ct,
               anomaly_ct, anomaly_table_ct, anomaly_column_ct
          FROM {str_schema}.v_profiling_runs
         WHERE project_code = '{str_project_literal}' {str_tg_condition}
        ORDER BY start_time DESC;
    """

    # Subset of the selected columns shown in the grid; the remaining columns
    # stay available on the selected row for drill-downs and the detail panel.
    show_columns = [
        "connection_name",
        "table_groups_name",
        "schema_name",
        "start_time",
        "duration",
        "status",
        "table_ct",
        "column_ct",
    ]

    return db.retrieve_data(str_sql), show_columns


def _escape_sql_literal(value):
    """Return *value* as text with single quotes doubled for use inside a quoted SQL literal."""
    return str(value).replace("'", "''")
def open_drill_downs(dct_selected_rows, button_slots):
    """Render the "Profiling" and "Anomalies" drill-down buttons.

    Both buttons are always drawn. Without a selection they are gray and
    disabled; with one they are green and, when clicked, store the selected
    run in ``st.session_state`` and switch to the matching results page.

    Args:
        dct_selected_rows: Rows currently selected in the grid; only the first
            one is used. A falsy value means "nothing selected".
        button_slots: Indexable collection of containers exposing ``button``.
            Slot 0 receives the Profiling button, slot 1 the Anomalies button.
    """
    has_selection = bool(dct_selected_rows)
    dct_selected_row = dct_selected_rows[0] if has_selection else None
    str_color = "green" if has_selection else "gray"

    if button_slots[0].button(
        f":{str_color}[Profiling →]",
        help="Review profiling characteristics for each data column",
        use_container_width=True,
        disabled=not has_selection,
    ):
        st.session_state["drill_profile_run"] = dct_selected_row["profiling_run_id"]
        _switch_page_and_rerun("profiling/results")

    if button_slots[1].button(
        f":{str_color}[Anomalies →]",
        help="Review potential data problems identified in profiling",
        use_container_width=True,
        disabled=not has_selection,
    ):
        st.session_state["drill_profile_run"] = dct_selected_row["profiling_run_id"]
        st.session_state["drill_profile_tg"] = dct_selected_row["table_groups_id"]
        _switch_page_and_rerun("profiling/anomalies")


def _switch_page_and_rerun(str_page):
    """Point the session at *str_page* with empty page arguments, then rerun the app."""
    session.current_page = str_page
    session.current_page_args = {}
    # st.experimental_rerun was superseded by st.rerun; use the newer API when
    # the installed Streamlit provides it and fall back to the old name otherwise.
    rerun = getattr(st, "rerun", None) or st.experimental_rerun
    rerun()
def show_record_detail(dct_selected_row):
    """Show the detail list for one profiling run in the left half of the page.

    Args:
        dct_selected_row: Mapping for the selected run; the fields listed below
            are handed to ``fm.render_html_list`` for display.
    """
    detail_fields = [
        "connection_name",
        "table_groups_name",
        "schema_name",
        "log_message",
        "table_ct",
        "column_ct",
        "anomaly_ct",
        "anomaly_table_ct",
        "anomaly_column_ct",
    ]

    # Two equal-width columns; only the first is used, the second stays empty.
    left_half, _ = st.columns([0.5, 0.5])
    with left_half:
        fm.render_html_list(dct_selected_row, detail_fields, "Profiling Run Information", FORM_DATA_WIDTH)