-
Notifications
You must be signed in to change notification settings - Fork 5
/
Copy pathconfig.yml
193 lines (180 loc) · 5.53 KB
/
config.yml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
# AWS and SageMaker settings
aws:
  region: us-east-1
  # Execution role: replace the role name below with the one you are using.
  sagemaker_execution_role_name: <your-sagemaker-execution-role>
  # The execution role ARN is determined automatically by the code.
  # Quoted so the `::` and `{...}` template markers are always read as text.
  sagemaker_execution_role_arn: 'arn:aws:iam::{account_id}:role/service-role/{role}'
  # {region} and {account_id} are replaced automatically.
  s3_bucket: 'sagemaker-{region}-{account_id}'
  s3_prefix: mlops-pipeline-model
  # Network configuration. More information:
  # https://sagemaker.readthedocs.io/en/stable/api/utility/network.html
  # NOTE(review): nesting of network_config under `aws` was reconstructed
  # from a flattened copy of this file - confirm against the code that reads it.
  network_config:
    # Left unset (null).
    enable_network_isolation: null
    # Lists of security groups and subnets. If you have these values
    # configured, fill them in below; otherwise leave them null.
    security_group_ids: null
    subnets: null
    # Boolean that determines whether to encrypt inter-container traffic.
    # Default value is None.
    encrypt_inter_container_traffic: null
# Presto connection settings. Replace every <...> placeholder with your own
# value before running the pipeline.
presto:
  host: <your-presto-server-ip>
  # NOTE(review): the placeholder indicates this key carries the Presto port
  # number despite being named `parameter` - confirm with the code that reads it.
  parameter: "<your-presto-port-number>"
  presto_credentials: <your-presto-server-credentials>
  catalog: <catalog-for-presto-server>
  schema: <schema-for-presto-server>
# User needs to configure the following.
# Names for the training and batch-transform pipelines, the shared base job
# name, and the tags attached to them.
pipeline:
  training_pipeline_name: mlops-pipeline-presto
  transform_pipeline_name: mlops-batch-inference
  base_job_name: mlops-prestodb
  # Each tag is one Key/Value pair.
  tags:
    - Key: team
      Value: my-team
# Training step settings: target/feature columns, model hyperparameters,
# compute resources, data split and the query that produces the training data.
training_step:
  # Target name (the ML model is trained to predict this column). It is the
  # `high_value_order` alias produced by the CASE expression in `query` below.
  training_target: high_value_order
  # Feature columns; each one is an alias selected in `query` below.
  # Add more features here based on your training job.
  training_features:
    - total_extended_price
    - avg_discount
    - total_quantity
  # Quoted so the version is always read as a string.
  sklearn_framework_version: '0.23-1'
  n_estimators: 75
  max_depth: 10
  min_samples_split: 2
  max_features: sqrt
  instance_type: ml.m5.xlarge
  instance_count: 1
  base_job_name: rf-sklearn
  # NOTE(review): 0.7 + 0.9 exceeds 1, so these look like cumulative cut
  # points rather than independent fractions - confirm with the preprocessing
  # script.
  train_split: 0.7
  test_split: 0.9
  tags:
    - Key: team
      Value: my-team
  # Aggregates line items per order, labels orders whose priority is '2-HIGH'
  # as high_value_order = 1 (else 0), and returns 5000 rows in random order.
  # The text below the `|` is passed through verbatim, so do not put YAML
  # comments inside it.
  query: |
    SELECT
      o.orderkey,
      COUNT(l.linenumber) AS lineitem_count,
      SUM(l.quantity) AS total_quantity,
      AVG(l.discount) AS avg_discount,
      SUM(l.extendedprice) AS total_extended_price,
      SUM(l.tax) AS total_payable_tax,
      o.orderdate,
      o.orderpriority,
      CASE
        WHEN (o.orderpriority = '2-HIGH') THEN 1
        ELSE 0
      END AS high_value_order
    FROM
      orders o
    JOIN
      lineitem l ON o.orderkey = l.orderkey
    GROUP BY
      o.orderkey,
      o.orderdate,
      o.orderpriority
    ORDER BY
      RANDOM()
    LIMIT 5000
# Hyperparameter tuning step settings.
tuning_step:
  step_name: Train-And-Tune-Model
  maximum_parallel_training_jobs: 1
  maximum_training_jobs: 2
  # One entry per tunable hyperparameter.
  # NOTE(review): the numeric pairs are presumably [min, max] bounds and
  # max_features a list of categorical choices - confirm with the tuning code.
  hyperparam_ranges:
    n_estimators:
      - 10
      - 150
    max_depth:
      - 3
      - 20
    min_samples_split:
      - 2
      - 10
    max_features:
      - sqrt
      - log2
  # Regex captures the non-whitespace token that follows "auc ".
  metric_definitions:
    - Name: 'validation:auc'
      Regex: 'auc (\S+)'
  # Same name as the metric defined in metric_definitions above.
  objective_metric_name: "validation:auc"
# Model evaluation step settings: step name, accuracy threshold, compute
# resources and the name of the evaluation output file.
evaluation_step:
  step_name: Evaluate-Model
  # NOTE(review): presumably the minimum accuracy checked by the condition
  # step - confirm with the pipeline code.
  accuracy_condition_threshold: 0.60
  instance_type: ml.m5.xlarge
  instance_count: 1
  evaluation_filename: evaluation.json
# Batch transform step settings: compute resources, output location, tags and
# the query that fetches the data to run inference on.
transform_step:
  step_name: mlops-RandomForestTransform
  instance_type: ml.m5.xlarge
  instance_count: 1
  # NOTE(review): the query below has no time filter, so how this value is
  # applied is not visible in this file - confirm with the batch inference
  # preprocessing script.
  num_hours_to_go_back: 1
  output_prefix: batch_transform_output
  tags:
    - Key: team
      Value: my-team
  # Same statement as the training query: aggregates line items per order,
  # derives high_value_order from the order priority, and returns 5000 rows in
  # random order. The text below the `|` is passed through verbatim, so do not
  # put YAML comments inside it.
  query: |
    SELECT
      o.orderkey,
      COUNT(l.linenumber) AS lineitem_count,
      SUM(l.quantity) AS total_quantity,
      AVG(l.discount) AS avg_discount,
      SUM(l.extendedprice) AS total_extended_price,
      SUM(l.tax) AS total_payable_tax,
      o.orderdate,
      o.orderpriority,
      CASE
        WHEN (o.orderpriority = '2-HIGH') THEN 1
        ELSE 0
      END AS high_value_order
    FROM
      orders o
    JOIN
      lineitem l ON o.orderkey = l.orderkey
    GROUP BY
      o.orderkey,
      o.orderdate,
      o.orderpriority
    ORDER BY
      RANDOM()
    LIMIT 5000
# Data preprocessing step settings: step name, compute resources and tags.
data_processing_step:
  step_name: "Preprocess-Data"
  processing_instance_type: ml.c5.xlarge
  instance_count: 1
  tags:
    - Key: team
      Value: my-team
# Model registration step settings: model group/name, initial approval
# status, and the instance types listed for inference and batch transform.
register_model_step:
  step_name: Register-Model
  model_group: mlops-presto
  model_name: mlops-presto
  approval_status: PendingManualApproval
  inference_instance_types:
    - ml.t2.medium
    - ml.m5.xlarge
    - ml.m5.large
  transform_instance_types:
    - ml.m5.xlarge
  tags:
    - Key: team
      Value: my-team
# Name of the pipeline step used when the accuracy threshold is not met.
fail_step:
  step_name: AccuracyThresholdFailed
# Name of the pipeline step that checks the accuracy condition.
condition_step:
  step_name: Accuracy-Condition
# Real-time endpoint settings: endpoint and endpoint-config names, instance
# type, and the minimum/maximum instance counts.
realtime_endpoint:
  endpoint_config_name: random-forest-classifier
  endpoint_name: mlops-realtime-ep
  instance_type: ml.m5.xlarge
  min_instance_count: 1
  max_instance_count: 3
# Section that enables the container to run notebooks and Python scripts
# automatically.
# NOTE(review): some entries below are bare file names while others carry a
# `code/` prefix - confirm which ones the pipeline resolves relative to
# source_dir before changing any path.
scripts:
  # Source directory containing all of the data preprocessing scripts.
  source_dir: code
  query: query.py
  # Preprocessing script for training.
  preprocess_data: presto_preprocess_for_training.py
  # Evaluation script for the evaluate step.
  evaluation: code/evaluate.py
  # Data preparation for batch transform.
  batch_transform_get_data: presto_preprocess_for_batch_inference.py
  # Training script with inference logic for batch transform.
  batch_inference: code/inference.py
  # Training script.
  training_script: code/training.py