# config.yml

# AWS and SageMaker settings
aws:
  region: us-east-1
  # Execution role name: replace the placeholder below with the role you
  # are using.
  sagemaker_execution_role_name: <your-sagemaker-execution-role>
  # The execution role ARN is determined automatically by the code; the
  # {account_id} and {role} placeholders are left as-is here.
  sagemaker_execution_role_arn: 'arn:aws:iam::{account_id}:role/service-role/{role}'
  # {region} and {account_id} are automatically replaced.
  s3_bucket: 'sagemaker-{region}-{account_id}'
  s3_prefix: mlops-pipeline-model
  # View more information on the network configuration here:
  # https://sagemaker.readthedocs.io/en/stable/api/utility/network.html
  network_config:
    # Explicit null: no value is configured.
    enable_network_isolation: null
    # Lists of security groups and subnets. If you have these values
    # configured, replace null with the values.
    security_group_ids: null
    subnets: null
    # Boolean that determines whether to encrypt inter-container traffic.
    # Default value is None (null).
    encrypt_inter_container_traffic: null
 
# Presto connection settings; replace every <...> placeholder with your own
# values.
presto:
  host: <your-presto-server-ip>
  # NOTE(review): despite the generic key name, the placeholder indicates this
  # entry holds the Presto server port number.
  parameter: "<your-presto-port-number>"
  # NOTE(review): avoid committing real credentials to version control;
  # presumably this should reference a secret rather than hold one - confirm
  # against the code that reads it.
  presto_credentials: <your-presto-server-credentials>
  catalog: <catalog-for-presto-server>
  schema: <schema-for-presto-server>
  
# Pipeline-level settings. User needs to configure the following.
pipeline:
  # Names of the training pipeline and of the batch-inference (transform)
  # pipeline, as suggested by the key names - confirm against the pipeline code.
  training_pipeline_name: mlops-pipeline-presto
  transform_pipeline_name: mlops-batch-inference
  base_job_name: mlops-prestodb
  # Key/Value tag pairs; presumably attached to the resources the pipeline
  # creates - TODO confirm.
  tags:
  - Key: team
    Value: my-team
  
# Training settings: target and feature columns, estimator hyperparameters,
# compute resources, data split and the query that produces the training data.
training_step:
  # Target name: the ML model is trained to predict this column.
  training_target: high_value_order
  # Feature columns; add more entries to this list based on your training job.
  training_features:
  - total_extended_price
  - avg_discount
  - total_quantity
  # Quoted so it is always read as a string, never as a number.
  sklearn_framework_version: '0.23-1'
  # Estimator hyperparameters (presumably for a random forest, judging by
  # base_job_name below - confirm against the training script).
  n_estimators: 75
  max_depth: 10
  min_samples_split: 2
  max_features: sqrt
  instance_type: ml.m5.xlarge
  instance_count: 1
  base_job_name: rf-sklearn
  # NOTE(review): 0.7 + 0.9 > 1, so these look like cumulative cut points
  # rather than independent fractions - confirm against the preprocessing
  # script before changing them.
  train_split: 0.7
  test_split: 0.9
  tags:
  - Key: team
    Value: my-team
  # Query whose result set is the training data. It must return the
  # training_target column and every column listed in training_features.
  query: |
    SELECT
        o.orderkey,
        COUNT(l.linenumber) AS lineitem_count,
        SUM(l.quantity) AS total_quantity,
        AVG(l.discount) AS avg_discount,
        SUM(l.extendedprice) AS total_extended_price,
        SUM(l.tax) AS total_payable_tax,
        o.orderdate,
        o.orderpriority,
        CASE
            WHEN (o.orderpriority = '2-HIGH') THEN 1
            ELSE 0
        END AS high_value_order
    FROM
        orders o
    JOIN
        lineitem l ON o.orderkey = l.orderkey
    GROUP BY
        o.orderkey,
        o.orderdate,
        o.orderpriority
    ORDER BY 
        RANDOM() 
    LIMIT 5000

# Hyperparameter tuning settings.
tuning_step:
  step_name: Train-And-Tune-Model
  maximum_parallel_training_jobs: 1
  maximum_training_jobs: 2
  # Two-element list per hyperparameter. The numeric pairs look like
  # [min, max] bounds and max_features like a list of choices - TODO confirm
  # against the tuning code.
  hyperparam_ranges:
    n_estimators: [10, 150]
    max_depth: [3, 20]
    min_samples_split: [2, 10]
    max_features: [sqrt, log2]
  # The Regex capture group extracts the metric value reported as "auc <value>".
  metric_definitions:
  - Name: 'validation:auc'
    Regex: 'auc (\S+)'
  # Same metric name as defined in metric_definitions above.
  objective_metric_name: 'validation:auc'

# Settings for the model evaluation step.
evaluation_step:
  step_name: Evaluate-Model
  # NOTE(review): presumably the minimum accuracy the model must reach for the
  # accuracy condition to pass - confirm against the pipeline code.
  accuracy_condition_threshold: 0.60
  instance_type: ml.m5.xlarge
  instance_count: 1
  # File name of the evaluation output (JSON, judging by the extension).
  evaluation_filename: evaluation.json
  
# Settings for the batch transform (batch inference) step.
transform_step:
  step_name: mlops-RandomForestTransform
  instance_type: ml.m5.xlarge
  instance_count: 1
  # NOTE(review): not referenced by the query below; presumably consumed by
  # the batch-inference data preparation script - confirm.
  num_hours_to_go_back: 1
  output_prefix: batch_transform_output
  tags:
  - Key: team
    Value: my-team
  # NOTE(review): this query is identical to training_step.query, including
  # the random 5000-row sample and the high_value_order label column - confirm
  # that this is what batch inference should read.
  query: |
    SELECT
        o.orderkey,
        COUNT(l.linenumber) AS lineitem_count,
        SUM(l.quantity) AS total_quantity,
        AVG(l.discount) AS avg_discount,
        SUM(l.extendedprice) AS total_extended_price,
        SUM(l.tax) AS total_payable_tax,
        o.orderdate,
        o.orderpriority,
        CASE
            WHEN (o.orderpriority = '2-HIGH') THEN 1
            ELSE 0
        END AS high_value_order
    FROM
        orders o
    JOIN
        lineitem l ON o.orderkey = l.orderkey
    GROUP BY
        o.orderkey,
        o.orderdate,
        o.orderpriority
    ORDER BY 
        RANDOM() 
    LIMIT 5000


# Settings for the data preprocessing step.
data_processing_step:
  step_name: Preprocess-Data
  # Note the key name: this step uses processing_instance_type, whereas the
  # other steps in this file use instance_type.
  processing_instance_type: ml.c5.xlarge
  instance_count: 1
  tags:
  - Key: team
    Value: my-team
  
# Settings for the model registration step.
register_model_step:
  step_name: Register-Model
  model_group: mlops-presto
  model_name: mlops-presto
  # Approval status given to the registered model; PendingManualApproval
  # presumably means a manual approval is required before use - TODO confirm.
  approval_status: PendingManualApproval
  # Instance types listed for real-time inference and for batch transform.
  inference_instance_types:
  - ml.t2.medium
  - ml.m5.xlarge
  - ml.m5.large
  transform_instance_types:
  - ml.m5.xlarge
  tags:
  - Key: team
    Value: my-team

# Name of the failure step; judging by the name, presumably reached when the
# accuracy threshold is not met - confirm against the pipeline code.
fail_step:
  step_name: AccuracyThresholdFailed

# Name of the condition step; presumably the check against
# evaluation_step.accuracy_condition_threshold - confirm against the pipeline
# code.
condition_step:
  step_name: Accuracy-Condition

# Settings for the real-time inference endpoint.
realtime_endpoint:
  endpoint_config_name: random-forest-classifier
  endpoint_name: mlops-realtime-ep
  instance_type: ml.m5.xlarge
  # Lower and upper instance counts; presumably autoscaling bounds - TODO
  # confirm against the deployment code.
  min_instance_count: 1
  max_instance_count: 3

# Section that enables the container to run notebooks and python scripts
# automatically.
# NOTE(review): some entries below carry a code/ prefix while others are bare
# file names; presumably the bare names are resolved relative to source_dir -
# confirm against the code that reads them.
scripts:
  # Source directory containing all of the data preprocessing scripts.
  source_dir: code
  query: query.py
  # Pre-processing script.
  preprocess_data: presto_preprocess_for_training.py
  # Evaluation script for the evaluate step.
  evaluation: code/evaluate.py
  # Data prep for batch transform.
  batch_transform_get_data: presto_preprocess_for_batch_inference.py
  # Training script with inference logic for batch transform.
  batch_inference: code/inference.py
  # Training script.
  training_script: code/training.py