diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS new file mode 100644 index 00000000..8ec05f20 --- /dev/null +++ b/.github/CODEOWNERS @@ -0,0 +1,6 @@ +# Owners of repository +/.github/CODEOWNERS @calcof +/README.md @calcof + +# Owners of docs +/docs/ @calcof @hjscherer @sebader @msimecek @heoelri @nielsams \ No newline at end of file diff --git a/.gitignore b/.gitignore index dfcfd56f..4120ce53 100644 --- a/.gitignore +++ b/.gitignore @@ -1,350 +1,2 @@ -## Ignore Visual Studio temporary files, build results, and -## files generated by popular Visual Studio add-ons. -## -## Get latest from https://github.com/github/gitignore/blob/master/VisualStudio.gitignore - -# User-specific files -*.rsuser -*.suo -*.user -*.userosscache -*.sln.docstates - -# User-specific files (MonoDevelop/Xamarin Studio) -*.userprefs - -# Mono auto generated files -mono_crash.* - -# Build results -[Dd]ebug/ -[Dd]ebugPublic/ -[Rr]elease/ -[Rr]eleases/ -x64/ -x86/ -[Aa][Rr][Mm]/ -[Aa][Rr][Mm]64/ -bld/ -[Bb]in/ -[Oo]bj/ -[Ll]og/ -[Ll]ogs/ - -# Visual Studio 2015/2017 cache/options directory -.vs/ -# Uncomment if you have tasks that create the project's static files in wwwroot -#wwwroot/ - -# Visual Studio 2017 auto generated files -Generated\ Files/ - -# MSTest test Results -[Tt]est[Rr]esult*/ -[Bb]uild[Ll]og.* - -# NUnit -*.VisualState.xml -TestResult.xml -nunit-*.xml - -# Build Results of an ATL Project -[Dd]ebugPS/ -[Rr]eleasePS/ -dlldata.c - -# Benchmark Results -BenchmarkDotNet.Artifacts/ - -# .NET Core -project.lock.json -project.fragment.lock.json -artifacts/ - -# StyleCop -StyleCopReport.xml - -# Files built by Visual Studio -*_i.c -*_p.c -*_h.h -*.ilk -*.meta -*.obj -*.iobj -*.pch -*.pdb -*.ipdb -*.pgc -*.pgd -*.rsp -*.sbr -*.tlb -*.tli -*.tlh -*.tmp -*.tmp_proj -*_wpftmp.csproj -*.log -*.vspscc -*.vssscc -.builds -*.pidb -*.svclog -*.scc - -# Chutzpah Test files -_Chutzpah* - -# Visual C++ cache files -ipch/ -*.aps -*.ncb -*.opendb -*.opensdf -*.sdf -*.cachefile -*.VC.db -*.VC.VC.opendb - -# Visual Studio profiler -*.psess -*.vsp -*.vspx -*.sap - -# Visual Studio Trace Files -*.e2e - -# TFS 2012 Local Workspace -$tf/ - -# Guidance Automation Toolkit -*.gpState - -# ReSharper is a .NET coding add-in -_ReSharper*/ -*.[Rr]e[Ss]harper -*.DotSettings.user - -# TeamCity is a build add-in -_TeamCity* - -# DotCover is a Code Coverage Tool -*.dotCover - -# AxoCover is a Code Coverage Tool -.axoCover/* -!.axoCover/settings.json - -# Visual Studio code coverage results -*.coverage -*.coveragexml - -# NCrunch -_NCrunch_* -.*crunch*.local.xml -nCrunchTemp_* - -# MightyMoose -*.mm.* -AutoTest.Net/ - -# Web workbench (sass) -.sass-cache/ - -# Installshield output folder -[Ee]xpress/ - -# DocProject is a documentation generator add-in -DocProject/buildhelp/ -DocProject/Help/*.HxT -DocProject/Help/*.HxC -DocProject/Help/*.hhc -DocProject/Help/*.hhk -DocProject/Help/*.hhp -DocProject/Help/Html2 -DocProject/Help/html - -# Click-Once directory -publish/ - -# Publish Web Output -*.[Pp]ublish.xml -*.azurePubxml -# Note: Comment the next line if you want to checkin your web deploy settings, -# but database connection strings (with potential passwords) will be unencrypted -*.pubxml -*.publishproj - -# Microsoft Azure Web App publish settings. 
Comment the next line if you want to -# checkin your Azure Web App publish settings, but sensitive information contained -# in these scripts will be unencrypted -PublishScripts/ - -# NuGet Packages -*.nupkg -# NuGet Symbol Packages -*.snupkg -# The packages folder can be ignored because of Package Restore -**/[Pp]ackages/* -# except build/, which is used as an MSBuild target. -!**/[Pp]ackages/build/ -# Uncomment if necessary however generally it will be regenerated when needed -#!**/[Pp]ackages/repositories.config -# NuGet v3's project.json files produces more ignorable files -*.nuget.props -*.nuget.targets - -# Microsoft Azure Build Output -csx/ -*.build.csdef - -# Microsoft Azure Emulator -ecf/ -rcf/ - -# Windows Store app package directories and files -AppPackages/ -BundleArtifacts/ -Package.StoreAssociation.xml -_pkginfo.txt -*.appx -*.appxbundle -*.appxupload - -# Visual Studio cache files -# files ending in .cache can be ignored -*.[Cc]ache -# but keep track of directories ending in .cache -!?*.[Cc]ache/ - -# Others -ClientBin/ -~$* -*~ -*.dbmdl -*.dbproj.schemaview -*.jfm -*.pfx -*.publishsettings -orleans.codegen.cs - -# Including strong name files can present a security risk -# (https://github.com/github/gitignore/pull/2483#issue-259490424) -#*.snk - -# Since there are multiple workflows, uncomment next line to ignore bower_components -# (https://github.com/github/gitignore/pull/1529#issuecomment-104372622) -#bower_components/ - -# RIA/Silverlight projects -Generated_Code/ - -# Backup & report files from converting an old project file -# to a newer Visual Studio version. Backup files are not needed, -# because we have git ;-) -_UpgradeReport_Files/ -Backup*/ -UpgradeLog*.XML -UpgradeLog*.htm -ServiceFabricBackup/ -*.rptproj.bak - -# SQL Server files -*.mdf -*.ldf -*.ndf - -# Business Intelligence projects -*.rdl.data -*.bim.layout -*.bim_*.settings -*.rptproj.rsuser -*- [Bb]ackup.rdl -*- [Bb]ackup ([0-9]).rdl -*- [Bb]ackup ([0-9][0-9]).rdl - -# Microsoft Fakes -FakesAssemblies/ - -# GhostDoc plugin setting file -*.GhostDoc.xml - -# Node.js Tools for Visual Studio -.ntvs_analysis.dat -node_modules/ - -# Visual Studio 6 build log -*.plg - -# Visual Studio 6 workspace options file -*.opt - -# Visual Studio 6 auto-generated workspace file (contains which files were open etc.) 
-*.vbw - -# Visual Studio LightSwitch build output -**/*.HTMLClient/GeneratedArtifacts -**/*.DesktopClient/GeneratedArtifacts -**/*.DesktopClient/ModelManifest.xml -**/*.Server/GeneratedArtifacts -**/*.Server/ModelManifest.xml -_Pvt_Extensions - -# Paket dependency manager -.paket/paket.exe -paket-files/ - -# FAKE - F# Make -.fake/ - -# CodeRush personal settings -.cr/personal - -# Python Tools for Visual Studio (PTVS) -__pycache__/ -*.pyc - -# Cake - Uncomment if you are using it -# tools/** -# !tools/packages.config - -# Tabs Studio -*.tss - -# Telerik's JustMock configuration file -*.jmconfig - -# BizTalk build output -*.btp.cs -*.btm.cs -*.odx.cs -*.xsd.cs - -# OpenCover UI analysis results -OpenCover/ - -# Azure Stream Analytics local run output -ASALocalRun/ - -# MSBuild Binary and Structured Log -*.binlog - -# NVidia Nsight GPU debugger configuration file -*.nvuser - -# MFractors (Xamarin productivity tool) working folder -.mfractor/ - -# Local History for Visual Studio -.localhistory/ - -# BeatPulse healthcheck temp database -healthchecksdb - -# Backup folder for Package Reference Convert tool in Visual Studio 2017 -MigrationBackup/ - -# Ionide (cross platform F# VS Code tools) working folder -.ionide/ +## Folders +.vscode \ No newline at end of file diff --git a/CONTRIBUTE.md b/CONTRIBUTE.md new file mode 100644 index 00000000..58b0b48a --- /dev/null +++ b/CONTRIBUTE.md @@ -0,0 +1,32 @@ +# How to Contribute to AlwaysOn + +## Content Changes and Pull Requests + +To add or edit content within the AlwaysOn repositories, please take a fork of a repository to iterate on changes before subsequently opening a Pull Request (PR) to get your forked branch merged into the main branch for that AlwaysOn repository. Your PR will be reviewed by core engineers working on the AlwaysOn project, and once approved, your content will be accessible to everybody. + +> **Important!** Please make sure that your PR is focused on a specific area of AlwaysOn to facilitate a targeted review, as this will speed up the process to get your changes merged into our repository. + +## Content Structure + +[![AlwaysOn Repo Structure](/docs/media/alwayson-repo-structure.png "AlwaysOn Repo Structure")](./CONTRIBUTE.md) + +The AlwaysOn project is separated into **3** different repositories: + +- [AlwaysOn](/docs/README.md): contains the AlwaysOn design methodology, covering the design patterns and approach to guide readers in defining a target AlwaysOn architecture. + - Overarching topics are documented as separate markdown documents within the `/docs/` directory. + +- [AlwaysOn-Foundational-Online](http://github.com/azure/alwayson-foundational-online): contains the AlwaysOn foundational reference implementation intended for online scenarios that are public-facing and do not require private network connectivity to a surrounding organizational technical estate. + - [`/docs/`](https://github.com/Azure/alwayson-foundational-online/tree/main/docs) contains the majority of documentation, covering the design approach and detailed documentation to accompany the reference implementation. + - [`/src/`](https://github.com/Azure/alwayson-foundational-online/tree/main/src) contains all source code and technical artifacts for the reference implementation along with low-level implementation documentation. + - [`/.ado/pipelines`](https://github.com/Azure/alwayson-foundational-online/tree/main/.ado/pipelines) contains the Azure DevOps pipelines to build and deploy the reference implementation.
+ +- [AlwaysOn-Foundational-Connected](http://github.com/azure/alwayson-foundational-connected): contains the AlwaysOn foundational reference implementation intended for private scenarios that require integration with an organizational technical estate for either public-facing or internal-facing workloads. + - [`/docs/`](http://github.com/azure/alwayson-foundational-connected/tree/main/docs) contains the majority of documentation, covering the design approach and detailed documentation to accompany the reference implementation. + - [`/src/`](http://github.com/azure/alwayson-foundational-connected/tree/main/src) contains all source code and technical artifacts for the reference implementation along with low-level implementation documentation. + - [`/.ado/pipelines`](http://github.com/azure/alwayson-foundational-connected/tree/main/.ado/pipelines) contains the Azure DevOps pipelines to build and deploy the reference implementation. + +## Documentation Conventions + +Each source code component within the reference implementation repositories has its own `README.md` file which explains how that particular component works, how it is supposed to be used, and how it may interact with other aspects of the AlwaysOn solution. + +Within the `main` branch, each `README.md` file must accurately represent the state of the associated component, which will serve as a core aspect of PR reviews. Any modifications to source components must therefore be reflected in the documentation as well. diff --git a/FAQ.md b/FAQ.md new file mode 100644 index 00000000..61cf770c --- /dev/null +++ b/FAQ.md @@ -0,0 +1,38 @@ +# Frequently Asked Questions (FAQ) + +## Design Methodology + +> I need to design a highly reliable and mission-critical application on Azure. Where can I learn more about the AlwaysOn design methodology? + +The AlwaysOn design principles and design areas are published within this repository and you can learn more about how to use the design methodology [here](/docs/design-methodology/README.md). +The AlwaysOn design methodology is also available within the [Microsoft Azure Architecture Center](https://docs.microsoft.com/azure/architecture/mission-critical/alwayson-overview) for general consumption. + +## Reference Implementations + +> Can the reference implementations be used in any Azure environment without any restrictions? + +The reference implementations are published under the [MIT open source license](/LICENSE) and can be used *as is*. +The AlwaysOn engineering team is constantly improving the code and actively encourages community [contributions](/CONTRIBUTE.md). All reference implementations have been rigorously tested on *Azure Public* cloud infrastructure. + +### Deployment + +> How is the infrastructure deployed? + +The application infrastructure is deployed using Terraform. Other approaches such as ARM Templates and Bicep can also be used, but are not yet implemented within the repositories. + +### Patching & Updates + +> How is the infrastructure updated? + +Most infrastructure components used for AlwaysOn are PaaS services and are maintained by Microsoft. +Some services, such as Azure Kubernetes Service (AKS), require dedicated maintenance activities, and for AKS this is achieved via [automatic node image upgrades](https://docs.microsoft.com/azure/aks/upgrade-cluster#set-auto-upgrade-channel) in combination with [planned maintenance windows](https://docs.microsoft.com/azure/aks/planned-maintenance) to automatically update the nodes to the most recent AKS node OS image.
Larger changes, such as an upgrade of the K8s version are performed as-code by changing the K8s version within the reference implementation file `.ado/pipeline/config/configuration.yaml` and re-running the infrastructure pipeline. + +### Security + +> What is used to store secrets? + +Wherever possible, Azure Managed Identities are used to avoid exposing any sensitive values like Service Principal client secrets (password). +All secrets are stored in Azure Key Vault at deployment time via Terraform. These secrets are then loaded into Azure Kubernetes Service as Kubernetes secrets (and where required as environment variables in the pods) or handed over at deployment time as parameters for helm charts etc. Some temporary secrets, such as SSL/TLS certificates managed by *cert-manager*, are stored within the Kubernetes cluster only. + +--- +[AlwaysOn | Documentation Inventory](/docs/README.md) diff --git a/README.md b/README.md index 5cd7cecf..e52fe9ca 100644 --- a/README.md +++ b/README.md @@ -1,33 +1,48 @@ -# Project +[![Azure AlwaysOn](./icon.png "Azure AlwaysOn")](./README.md) -> This repo has been populated by an initial template to help get you started. Please -> make sure to update the content to build a great experience for community-building. +## Welcome to Azure AlwaysOn -As the maintainer of this project, please make a few updates: +AlwaysOn is an open source project that provides a **prescriptive architectural approach to building highly-reliable cloud-native applications on Microsoft Azure for mission-critical workloads**. More specifically, this repository contains everything required to understand and implement an 'always on' application on Microsoft Azure, and is comprised of the following: -- Improving this README.MD file to provide a great experience -- Updating SUPPORT.MD with content about this project's support experience -- Understanding the security reporting process in SECURITY.MD -- Remove this section from the README +1. **Architectural Guidelines**: cloud-native design methodology to guide readers through the architectural process of building a mature mission-critical application on Microsoft Azure, articulating key design considerations and requisite design decisions along with associated trade-offs. + +2. **Fully Functional Reference Implementations**: end-to-end reference implementations intended to provide a solution orientated basis to showcase mission-critical application development on Microsoft Azure, leveraging Azure-native platform capabilities to maximize reliability and operational effectiveness. + - Design and implementation guidance to help readers understand and use the AlwaysOn design methodology in the context of a particular scenario. + - Production-ready technical artifacts including Infrastructure-as-Code (IaC) resources and Continuous-Integration/Continuous-Deployment (CI/CD) pipelines (GitHub and Azure DevOps) to deploy an AlwaysOn application with mature end-to-end operational wrappers. + +## AlwaysOn | Navigation + +- [Introduction | What is AlwaysOn?](./docs/introduction/README.md) - Detailed introduction into AlwaysOn, the problem it is intended to solve and the value it can provide. + +- [Design Methodology | AlwaysOn Architectural Approach](./docs/design-methodology/README.md) - Prescriptive guidance aligned to 8 critical design areas guides users to design and build an AlwaysOn application, outlining a recommended decision process. 
+ +- [Foundational Reference Implementation | Online](https://github.com/azure/alwayson-foundational-online) - Everything required to understand and build a copy of the foundational reference implementation intended for online scenarios that are public-facing and do not require private network connectivity to a surrounding organizational technical estate. + +- [Foundational Reference Implementation | Connected](https://github.com/azure/alwayson-foundational-connected) - Everything required to understand and build a copy of the foundational reference implementation intended for private scenarios that require integration with an organizational technical estate for either public-facing or internal-facing workloads. + +## Helpful Information + +The foundational reference implementations are separated within dedicated repositories containing all relevant documentation and technical artifacts, along with a *getting started guide*: + - [Foundational-Online Reference Implementation](https://github.com/Azure/AlwaysOn-foundational-online/blob/main/docs/reference-implementation/Getting-Started.md) + - [Foundational-Connected Reference Implementation](https://github.com/Azure/AlwaysOn-foundational-connected/blob/main/docs/reference-implementation/Getting-Started.md) + +[![AlwaysOn Repo Structure](/docs/media/alwayson-repo-structure.png "AlwaysOn Repo Structure")](./CONTRIBUTE.md) + +> A list of [Frequently Asked Questions](./FAQ.md) is provided to capture common issues and challenges associated with using the AlwaysOn project. ## Contributing -This project welcomes contributions and suggestions. Most contributions require you to agree to a -Contributor License Agreement (CLA) declaring that you have the right to, and actually do, grant us -the rights to use your contribution. For details, visit https://cla.opensource.microsoft.com. +AlwaysOn is a community-driven open source project that welcomes contributions as well as suggestions. Most contributions require you to agree to a +Contributor License Agreement (CLA) declaring that you have the right to, and actually do, grant us the rights to use your contribution. For details, visit the [CLA portal](https://cla.opensource.microsoft.com). -When you submit a pull request, a CLA bot will automatically determine whether you need to provide -a CLA and decorate the PR appropriately (e.g., status check, comment). Simply follow the instructions -provided by the bot. You will only need to do this once across all repos using our CLA. +When you submit a pull request, a CLA bot will automatically determine whether you need to provide a CLA and decorate the PR appropriately (e.g. status check, comment). Simply follow the instructions provided by the bot. You will only need to do this once across all repos using our CLA. This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/). For more information see the [Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/) or contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with any additional questions or comments. -## Trademarks +For more details, please read [how to contribute](./CONTRIBUTE.md). + +## Microsoft Sponsorship -This project may contain trademarks or logos for projects, products, or services. Authorized use of Microsoft -trademarks or logos is subject to and must follow -[Microsoft's Trademark & Brand Guidelines](https://www.microsoft.com/en-us/legal/intellectualproperty/trademarks/usage/general).
-Use of Microsoft trademarks or logos in modified versions of this project must not cause confusion or imply Microsoft sponsorship. -Any use of third-party trademarks or logos are subject to those third-party's policies. +The AlwaysOn project was created by the **Microsoft Customer Architecture Team (CAT)** who continue to actively sponsor the sustained evolution of the AlwaysOn project through the creation of additional reference implementations for common industry scenarios. diff --git a/SECURITY.md b/SECURITY.md index f7b89984..8748b27b 100644 --- a/SECURITY.md +++ b/SECURITY.md @@ -1,6 +1,6 @@ -## Security +# Security Microsoft takes the security of our software products and services seriously, which includes all source code repositories managed through our GitHub organizations, which include [Microsoft](https://github.com/Microsoft), [Azure](https://github.com/Azure), [DotNet](https://github.com/dotnet), [AspNet](https://github.com/aspnet), [Xamarin](https://github.com/xamarin), and [our GitHub organizations](https://opensource.microsoft.com/). @@ -14,17 +14,17 @@ Instead, please report them to the Microsoft Security Response Center (MSRC) at If you prefer to submit without logging in, send email to [secure@microsoft.com](mailto:secure@microsoft.com). If possible, encrypt your message with our PGP key; please download it from the [Microsoft Security Response Center PGP Key page](https://www.microsoft.com/en-us/msrc/pgp-key-msrc). -You should receive a response within 24 hours. If for some reason you do not, please follow up via email to ensure we received your original message. Additional information can be found at [microsoft.com/msrc](https://www.microsoft.com/msrc). +You should receive a response within 24 hours. If for some reason you do not, please follow up via email to ensure we received your original message. Additional information can be found at [microsoft.com/msrc](https://www.microsoft.com/msrc). Please include the requested information listed below (as much as you can provide) to help us better understand the nature and scope of the possible issue: - * Type of issue (e.g. buffer overflow, SQL injection, cross-site scripting, etc.) - * Full paths of source file(s) related to the manifestation of the issue - * The location of the affected source code (tag/branch/commit or direct URL) - * Any special configuration required to reproduce the issue - * Step-by-step instructions to reproduce the issue - * Proof-of-concept or exploit code (if possible) - * Impact of the issue, including how an attacker might exploit the issue +* Type of issue (e.g. buffer overflow, SQL injection, cross-site scripting, etc.) +* Full paths of source file(s) related to the manifestation of the issue +* The location of the affected source code (tag/branch/commit or direct URL) +* Any special configuration required to reproduce the issue +* Step-by-step instructions to reproduce the issue +* Proof-of-concept or exploit code (if possible) +* Impact of the issue, including how an attacker might exploit the issue This information will help us triage your report more quickly. diff --git a/SUPPORT.md b/SUPPORT.md index dc72f0e5..c53a0fc3 100644 --- a/SUPPORT.md +++ b/SUPPORT.md @@ -1,25 +1,18 @@ -# TODO: The maintainer of this repo has not yet edited this file -**REPO OWNER**: Do you want Customer Service & Support (CSS) support for this product/project? +# Support -- **No CSS support:** Fill out this template with information about how to file issues and get help. 
-- **Yes CSS support:** Fill out an intake form at [aka.ms/spot](https://aka.ms/spot). CSS will work with/help you to determine next steps. More details also available at [aka.ms/onboardsupport](https://aka.ms/onboardsupport). -- **Not sure?** Fill out a SPOT intake as though the answer were "Yes". CSS will help you decide. +## Microsoft Support Policy -*Then remove this first heading from this SUPPORT.MD file before publishing your repo.* +Issues encountered when deploying the AlwaysOn reference implementations that are related to deployment failures or the operation of a resource can be raised with Microsoft support via the usual channels. -# Support +For any issues outside of this context and/or issues requiring a bugfix to technical artefacts located within this repository, Microsoft support will redirect users to file the issue on GitHub. -## How to file issues and get help +## Community Support Policy -This project uses GitHub Issues to track bugs and feature requests. Please search the existing -issues before filing new issues to avoid duplicates. For new issues, file your bug or -feature request as a new Issue. +The project maintainers and community aim to resolve issues swiftly, and are committed to providing a meaningful response to any new issues within 3 business days. -For help and questions about using this project, please **REPO MAINTAINER: INSERT INSTRUCTIONS HERE -FOR HOW TO ENGAGE REPO OWNERS OR COMMUNITY FOR HELP. COULD BE A STACK OVERFLOW TAG OR OTHER -CHANNEL. WHERE WILL YOU HELP PEOPLE?**. +## How to file issues and get help -## Microsoft Support Policy +This project uses GitHub Issues to track bugs and feature requests. Please search the existing issues before filing new issues to avoid duplicates. For new issues, file your bug or feature request as a new Issue. -Support for this **PROJECT or PRODUCT** is limited to the resources listed above. +For help and questions about using the AlwaysOn project and reference implementations, please submit a GitHub issue within the corresponding repository.
diff --git a/docs/README.md b/docs/README.md new file mode 100644 index 00000000..d795137e --- /dev/null +++ b/docs/README.md @@ -0,0 +1,34 @@ +# AlwaysOn | Documentation Inventory + +- [Landing Page](/README.md) + +## Introduction + +- [What is AlwaysOn?](./introduction/README.md) + +## Design Methodology + +- [How to use the AlwaysOn Design Methodology](./design-methodology/README.md) +- [Design Principles](./design-methodology/Principles.md) +- [Design Areas](./design-methodology/Design-Areas.md) + - [Reference Architecture](./design-methodology/Design-Areas.md#reference-architecture) + - [Cross Cutting Concerns](./design-methodology/Design-Areas.md#cross-cutting-concerns) + - [Application Design](./design-methodology/App-Design.md) + - [Application Platform](./design-methodology/App-Platform.md) + - [Data Platform](./design-methodology/Data-Platform.md) + - [Health Modeling and Observability](./design-methodology/Health-Modeling.md) + - [Deployment and Testing](./design-methodology/Deployment-Testing.md) + - [Networking and Connectivity](./design-methodology/Networking.md) + - [Security](./design-methodology/Security.md) + - [Operational Procedures](./design-methodology/Operational-Procedures.md) + +## Reference Implementations + +- [AlwaysOn-Foundational-Online](http://github.com/azure/alwayson-foundational-online) +- [AlwaysOn-Foundational-Connected](http://github.com/azure/alwayson-foundational-connected) + +--- + +## Documentation Conventions + +- Overarching topics concerning the AlwaysOn architecture, design principles, design decisions, and cross-component integration are documented as separate markdown documents within the `/docs/` folder. diff --git a/docs/design-methodology/App-Design.md b/docs/design-methodology/App-Design.md new file mode 100644 index 00000000..519e52d6 --- /dev/null +++ b/docs/design-methodology/App-Design.md @@ -0,0 +1,319 @@ +# Application Design + +Both functional application requirements and non-functional requirements, such as those surrounding high-availability and performance, are critical to inform key design decisions for an AlwaysOn application design. However, these requirements should be examined alongside key cloud application design patterns to ensure AlwaysOn aspirations are fully achieved. This design area will therefore explore requisite application design patterns for building a 'best of breed' reliable application on Azure. + +There are ultimately a myriad of design patterns which can be applied to build reliable applications on Azure, from reliable virtual actors to circuit breaker for fault handling. This design area is not intended to cover all relevant patterns or substitute public [design pattern documentation](https://docs.microsoft.com/azure/architecture/patterns/), but rather strives to cover key requisite themes to create an AlwaysOn application design, establishing breadcrumbs for the reader to follow as they embark on a critical design path to define a target 'north star' architecture. 
+ + - [Scale-Unit Architecture](#scale-unit-architecture) + - [Global Distribution](#global-distribution) + - [Loose Coupled Event-Driven Architecture](#loose-coupled-event-driven-architecture) + - [Application-Level Resiliency Patterns and Error Handling](#application-level-resiliency-patterns-and-error-handling) + +> The [foundational-online](https://github.com/Azure/AlwaysOn-Foundational-Online) and [foundational-connected](https://github.com/Azure/AlwaysOn-Foundational-Connected) reference implementations provide solution-oriented showcases for how these foundational design concepts can be leveraged alongside Azure-native capabilities to maximize reliability. + +## Scale-Unit Architecture + +A scale-unit is a logical unit or function which can be scaled independently as required; it is a vital concept for achieving an AlwaysOn application design since all functional aspects of the solution must be capable of scaling to meet changes in demand. Architecturally, it is critical to optimize end-to-end scalability through the logical compartmentalization of operational functions into scale-units at all levels of the application stack, from code components to application hosting platforms and deployment stamps encompassing related components. + +> Please refer to the [deployment stamps pattern](https://docs.microsoft.com/azure/architecture/patterns/deployment-stamp) for further details. + +For example, the foundational reference implementation considers a user flow for processing game results which encompasses APIs for retrieving and posting game outcomes as well as supporting components such as an OAuth endpoint, datastore, and message queues. These stateless API endpoints for retrieving and posting results represent granular functional units that must be able to adapt to changes in demand; however, for these to be truly scalable, the underlying application platform must also be able to scale in-kind. Similarly, to avoid performance bottlenecks in the end-to-end user flow and achieve sustainable scale, the downstream components and dependencies must also be able to scale to an appropriate degree, either independently as a separate scale-unit or together as part of a single logical unit. + +[![AlwaysOn Scale Units](/docs/media/alwayson-scale-units.png "AlwaysOn Scale Units")](./App-Design.md) + +The image above depicts the multiple scale-unit scopes considered by this reference implementation user flow, from microservice pods to cluster nodes and regional deployment stamps. + +Ultimately, a scale-unit architecture should be applied to optimize the end-to-end scalability of an AlwaysOn application design, so that all levels of the solution can appropriately scale. The relationship between related scale-units as well as components inside a single scale-unit should be defined according to a capacity model, taking into consideration non-functional requirements around performance. + +### Design Considerations + +- [Azure Subscription scale limits and quotas](https://docs.microsoft.com/azure/azure-resource-manager/management/azure-subscription-service-limits) might have a bearing on application design, technology choices, and the definition of scale-units. + +- The scale-unit architecture pattern goes to great lengths to address the scale limits of individual resources and the application as a whole.
+ +- A scale-unit architecture helps with complex deployment and update scenarios since an entire regional stamp can be deployed as one unit, allowing for testing and validation of specific versions of components together prior to directing traffic to it. + +- The scale-unit architectural pattern can also be applied to support multi-tenant requirements for customer segregation. + +- The expected peak request rate (requests per second) and daily/weekly/seasonal traffic patterns are critical to inform core scale requirements. + - *How many requests is the solution required to support for each user flow? and are usage patterns predictable?* + +- The expected growth patterns for both traffic and data volume inform the design with regards to sustainable scale. + - *Is traffic expected to grow? and at what rate?* + +- The required performance of the solution under load is a critical decision factor when modelling required capacity. + - *Is a degraded service with high response times acceptable under load?* + +### Design Recommendations + +- Define a scale-unit when the scale-limits of a single deployment are likely to be exceeded. + +- Ensure all application components are able to scale either as independent scale-units or as part of a logical scale-unit encompassing multiple related components. + +- Define the relationship between scale-units according to a capacity model and non-functional requirements. + +- **Define a regional deployment stamp** to unify the provisioning, management, and operation of regional application resources into a heterogeneous but inter-dependent scale-unit. + - As load increases, additional stamps can be deployed within the same or different Azure regions to horizontally scale the solution. + +> When deploying an AlwaysOn solution within an Azure Landing Zone, ensure the Landing Zone subscription is dedicated to the application to provide a clear management boundary and avoid potential 'noisy neighbor' capacity risks. + +- For high-scale application scenarios with significant volumes of traffic, design the solution to scale across multiple Azure Subscriptions to ensure the inherent scale-limits within a single subscription do not constrain scalability. + - Define a subscription scoped deployment as a scale-unit to avoid a 'spill-and-fill' subscription model. + - Deploy each regional deployment stamp within a dedicated subscription to ensure subscription limits only apply within the context of a single deployment stamp and not across the application as a whole. + - Where appropriate, multiple deployment stamps can be considered within a single region, but should be deployed across independent subscriptions. + - Separate 'global' shared resources within a dedicated subscription to allow for consistent regional subscription deployment; avoid having a specialized deployment for a 'primary' region. +> The use of multiple subscriptions necessitates additional CI/CD complexity which must be appropriately managed. It is therefore only recommended in extreme scale scenarios where the limits of a single subscription are likely to become a hindrance. + +- Where multiple production subscriptions are needed to ensure requisite scale, consider using a dedicated application Management Group to simplify policy assignment through a policy aggregation boundary.
+ +- Deploy any considered environments, such as production, development, or test environments, into separate subscriptions to ensure lower environments do not contribute towards scale limits, and to reduce the risk of lower environment updates polluting production by providing a clear management and identity boundary. + +- Define and analyze non-functional requirements, such as the availability SLO, within the context of key end-to-end user-flows since technical and business scenarios will likely have distinct considerations for resilience, availability, latency, capacity, and observability. + - This will allow for relative flexibility in the AlwaysOn design approach, tailoring design decisions and technology choices at a user-flow level since one size may not fit all. + +- Model required capacity around identified traffic patterns to ensure sufficient capacity is provisioned at peak times to prevent service degradation. + - Leverage traffic patterns to optimize capacity and resource utilization during periods of reduced traffic. + +- Measure the time it takes to perform scale-out and scale-in operations to ensure natural variations in traffic do not create an unacceptable level of service degradation. + - Track scale operation durations as an operational metric to drive continuous improvement. + +### Reference Subscription Scale-Unit Approach + +The image below demonstrates how the single subscription reference deployment model can be expanded across multiple subscriptions in extreme scale scenarios to navigate subscription scale-limits. + +[![AlwaysOn Subscription Scale Units](/docs/media/alwayson-subscription-scale.gif "AlwaysOn Subscription Scale Units")](./App-Design.md) + +## Global Distribution + +Unfortunately, failure is impossible to avoid in a highly distributed environment like Azure. Only by planning for failure can a solution be truly AlwaysOn. + +[Availability Zones](https://docs.microsoft.com/azure/availability-zones/az-overview#availability-zones) (AZ) allow highly-available regional deployments across different data centers within a region. Nearly all Azure services are available in either a zonal configuration (where the service is pinned to a specific zone) or zone-redundant configuration (where the platform automatically ensures the service spans across zones and can withstand a zone outage). These configurations allow for fault-tolerance up to a datacenter level. + +While Availability Zones can be used to mitigate many fault scenarios, to maximize reliability multiple Azure regions should be used to ensure regional fault tolerance, so that application availability is maintained even in the event of a disaster scenario, such as Godzilla stepping on an Azure region. When defining a multi-region AlwaysOn application design, consideration should be given to different deployment strategies, such as active-active and active-passive, alongside application requirements, since there are significant trade-offs between each approach. + +An active-active deployment strategy represents the gold standard for an AlwaysOn solution, since it maximizes availability and allows for higher composite SLAs. While active-active is the recommended approach, it can introduce challenges around data synchronization and consistency for many application scenarios, and these challenges must be fully addressed at a data platform level, alongside additional trade-offs such as increased cost exposure and increased engineering effort.
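+ +> As a purely illustrative example of the composite SLA impact (the figures are hypothetical and assume regional failures are statistically independent): if each regional deployment stamp offers 99.95% availability, an active-active deployment across two regions gives a theoretical composite availability of 1 - (1 - 0.9995)² ≈ 99.999975%, compared with 99.95% for a single region; in practice the attainable figure is also bounded by shared global resources and the global entry point.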
+ +Not every workload supports or requires multiple regions running simultaneously, and hence the precise application requirements should be weighed against these trade-offs to inform an optimal design decision. For certain application scenarios with lower reliability targets, different deployment models, such as active-passive or sharding, can be suitable alternatives. + +It's important to note that some Azure services are deployable or configurable as global resources, which are not constrained to a particular Azure region. Consequently, when accommodating both 'Scale-Unit Architecture' and 'Global Distribution', careful consideration should be given to how resources are optimally distributed across Azure regions. For example, the foundational reference implementation for AlwaysOn consists of both global and regional resources, with regional resources deployed across multiple regions to provide geo-availability, in the case of regional outages and to bring services closer to end-users. These regional deployments also serve as scale-unit "stamps" to provide additional capacity and availability when required. + +[![AlwaysOn Foundational-Online Architecture](/docs/media/alwayson-high-level-architecture.png "AlwaysOn Foundational-Online Architecture")](./App-Design.md) + +The previous image depicts the high-level active-active design for the [foundational-online reference implementation](https://github.com/azure/alwayson-foundational-online), where a user accesses the application via a central global entry point that then redirects requests to a suitable regional deployment stamp. + +### Design Considerations + +- Not all services or capabilities are available in every Azure region, and consequently there can be service availability implications depending on the selected deployment regions. + - For example, [Availability Zones](https://docs.microsoft.com/azure/availability-zones/az-region) are not available in every region. + +- Azure regions are grouped into [regional pairs](https://docs.microsoft.com/azure/best-practices-availability-paired-regions) consisting of two regions within the same geography. + - Some Azure services leverage paired regions to ensure business continuity and to protect against data loss. For example, Azure Geo-redundant Storage (GRS) replicates data to a secondary paired region automatically, ensuring that data is durable in the event that the primary region is not recoverable. + +- If an outage affects multiple Azure regions, at least one region in each pair will be prioritized for recovery. + +- The [Azure Safe Deploy Practice (SDP)](https://azure.microsoft.com/blog/advancing-safe-deployment-practices) ensures all code and configuration changes (planned maintenance) to the Azure platform undergo a phased roll-out, with health analyzed in case any degradation is detected during the release. + - Once the 'Canary' and 'Pilot' phases have been successfully completed, platform updates are serialized across regional pairs, ensuring that only one region in each pair is updated at a time. + +- Like any cloud provider, Azure ultimately has a finite amount of resources and as a result there are situations which can lead to the unavailability of capacity in individual regions. + - In the event of a regional outage there will be a significant increase in demand for resources within the paired region as impacted customer workloads seek to recover within the paired region. 
In certain scenarios this may create a capacity challenge where supply temporarily does not satisfy demand. + +- Compliance requirements around geographical data residency, data protection, and data retention can have a significant bearing on appropriate geographical distribution. + - *Are there specific regions where data must reside or where resources have to be deployed?* + +- The geographic proximity and density of users or dependent systems should inform design decisions around the global distribution of an AlwaysOn application. + - *Where are the requests physically originating from?* + +- The connectivity method by which users or systems access the application, whether over the public Internet or private networks leveraging either VPN or ExpressRoute connectivity, should also be considered. + - *Are users going to connect from home and/or organizational networks?* + - *Can all users be expected to have fast internet connections?* + +- Different Azure regions have slightly different cost profiles for some services. There may be further cost implications depending on the precise deployment regions chosen. + +- Availability Zones have a latency perimeter of less than two milliseconds between availability zones. + - For workloads which are particularly 'chatty' across zones, this latency can accumulate to form a non-trivial performance penalty, as well as incurring bandwidth charges for inter-zone data transfer. + + - An active-active deployment across Microsoft Azure and other cloud providers can be considered to further mitigate reliance on global dependencies within a single cloud provider. + - A multi-cloud active-active deployment strategy introduces a significant amount of complexity where CI/CD is concerned, particularly given the significant difference in resource specifications and capabilities between cloud providers, which necessitates specialised deployment stamps for each cloud. + +### Design Recommendations + +- Deploy the solution within a minimum of 2 Azure regions to protect against regional outages. + - Prioritize the use of paired regions to benefit from SDP risk mitigations and platform recovery capabilities. + > For scenarios targeting a >= 99.99% SLO, a minimum of 3 deployment regions should be used to maximize the composite SLA and overall reliability. + +- Use an active-active deployment strategy where possible to maximize reliability. + - Where data/state consistency challenges exist, explore the use of a) a globally distributed data store, b) stamped regional architecture, or c) a partially active-active deployment, where some components are active across all regions while others are located centrally within a primary region. + +- Calculate the [composite SLA](https://docs.microsoft.com/azure/architecture/framework/resiliency/business-metrics#composite-slas) for all user flows. + - Ensure the composite SLA is in-line with business targets. + +- Deploy additional regional deployment stamps to achieve a greater composite SLA. + - The use of global resources will constrain the increase in composite SLA from adding further regions. + +- Define and validate Recovery Point Objectives (RPO) and Recovery Time Objectives (RTO). + +- Geographically co-locate Azure resources with users to minimize network latency and maximize end-to-end performance. + - Technical solutions such as a Content Delivery Network (CDN) or edge caching can also be used to drive optimal network latency for distributed user bases.
+ +- For high-scale application scenarios with significant volumes of traffic, design the solution to scale across multiple regions to navigate potential capacity constraints within a single region. + +- Select deployment regions which offer requisite capabilities and characteristics to achieve performance and availability targets, while fulfilling data residency and retention requirements. + - Within a single geography, prioritize the use of regional pairs to benefit from SDP serialized rollouts for planned maintenance, and regional prioritization in the event of unplanned maintenance. + +- It is not uncommon that data compliance requirements will constrain the number of available regions and potentially force AlwaysOn design compromises. In such cases, additional investment in operational wrappers is highly recommended to predict, detect, and respond to failures. + - If only a single Azure region is suitable, multiple deployment stamps ('regional scale-units') should be deployed within the selected region to mitigate some risk, leveraging Availability Zones to provide datacenter-level fault tolerance. However, such a significant compromise in geographical distribution will drastically constrain the attainable composite SLA and overall reliability. + - If suitable Azure regions do not all offer requisite capabilities, be prepared to compromise on the consistency of regional deployment stamps to prioritize geographical distribution and maximize reliability. + - For example, when constrained to a geography with two regions where only one region supports Availability Zones (3 + 1 datacenter model), create a secondary deployment pattern using fault domain isolation to allow for both regions to be deployed in an active configuration, ensuring the primary region houses multiple deployment stamps. + +- Align current service availability with product roadmaps when selecting deployment regions; not all services may be available in every region on day 1. + +- Leverage Availability Zones where possible to maximize availability within a single Azure region. + +### Reference Global Distribution Approach + +The image below demonstrates how the AlwaysOn reference application can be designed to scale across multiple Azure regions, with consideration given to scenarios where constraints on available regions necessitate multiple deployment stamps within a single region. + +[![AlwaysOn Global Distribution](/docs/media/alwayson-global-distribution.gif "AlwaysOn Global Distribution")](./App-Design.md) + +## Loose Coupled Event-Driven Architecture + +Loose coupling provides the cornerstone of a microservice architecture by allowing services to be designed so that each service has little or no knowledge of surrounding services. More specifically, it allows a service to operate independently ("loose") while still communicating with other services through well-defined interfaces ("coupling"), and in the context of AlwaysOn it further facilitates high-availability by preventing downstream failures from cascading to frontends or different deployment stamps. The following list captures the key characteristics of loose coupling, which should be evaluated when defining an AlwaysOn application design. + +- Services are not constrained to use the same compute platform, programming language, runtime, or operating system. +- Services can scale independently, optimizing the use of infrastructure and platform resources. +- Failures can be handled separately and do not affect client transactions.
+- Transactional integrity is harder to maintain because data creation and persistence happens within separate services. +- End-to-end tracing requires more complex orchestration. + +When implementing loose coupling, **Event-driven architecture** and **asynchronous processing** are key design patterns which should be applied for interactions which do not require an immediate response. Events represent a change in state within entities and are generated by event *producers* (emitters). Producers do not know anything about how events should be processed or handled, since that is the responsibility of event *consumers*. When using asynchronous event-driven communication, a producer publishes an event when something happens within its domain which another component needs to be aware of, such as a price change in a product catalogue, which consumers will subscribe to receive so they can process the events asynchronously. + +[![Asynchronous Event-Driven Communication](/docs/media/alwayson-asynchronous-communication.png)](./App-Design.md) +*Image source: [Asynchronous Message-Based Communication](https://docs.microsoft.com/dotnet/architecture/microservices/architect-microservice-container-applications/asynchronous-message-based-communication)* + +In reality, applications can combine loose and tight-coupling, depending on business objectives. For example, the AlwaysOn reference implementation applies write operations asynchronously with a message bus and worker, while read operations are synchronous with the result directly returned to the caller. + +### Design Considerations + +- Not all content that a solution makes available over the Internet is generated dynamically. Applications serve both static assets (images, JavaScript, CSS, localization files, etc.) and dynamic content. + - Workloads with frequently accessed static content benefit greatly from caching since it reduces the load on backend services and reduces content access latency for end users. + +- Caching can be implemented natively within Azure using either Azure Front Door or Azure Content Delivery Network (CDN). + - [Azure Front Door](https://docs.microsoft.com/azure/frontdoor/front-door-caching) provides Azure-native edge caching capabilities as well as routing features to divide static and dynamic content. + - By creating the appropriate routing rules in Azure Front Door, `/static/*` traffic can be transparently redirected to static content. + - More complex caching scenarios can be implemented using the [Azure CDN](https://azure.microsoft.com/services/cdn) service to establish a full-fledged content delivery network for significant static content volumes. + - The Azure CDN service will likely be more cost effective, but does not provide the same advanced routing and Web Application Firewall (WAF) capabilities which are recommended for other areas of an AlwaysOn application design. It does, however, offer further flexibility to integrate with similar services from third-party solutions, such as Akamai and Verizon. + - When comparing the Azure Front Door and Azure CDN services, the following decision factors should be explored: + - Can required caching rules be accomplished using the rules engine. + - Size of the stored content and the associated cost. + - Price per month for the execution of the rules engine (charged per request on Azure Front Door). + - Outbound traffic requirements (price differs by destination). 
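+
+The following is a minimal sketch of the asynchronous, event-driven interaction described above, assuming Azure Service Bus as the message broker and the `Azure.Messaging.ServiceBus` .NET SDK; the queue name and event shape are hypothetical rather than taken from the reference implementation.
+
+```csharp
+using System;
+using System.Text.Json;
+using System.Threading.Tasks;
+using Azure.Messaging.ServiceBus;
+
+// Hypothetical event emitted when a player posts a game result.
+public record GameResultPosted(string PlayerId, int Score, DateTimeOffset PostedAt);
+
+public class GameResultProducer
+{
+    private readonly ServiceBusSender _sender;
+
+    public GameResultProducer(ServiceBusClient client) =>
+        _sender = client.CreateSender("game-results"); // hypothetical queue name
+
+    // Publish the event and return immediately; a separate worker consumes and
+    // persists it asynchronously, keeping the producer loosely coupled from downstream state.
+    public async Task PublishAsync(GameResultPosted gameResult, string correlationId)
+    {
+        var message = new ServiceBusMessage(JsonSerializer.Serialize(gameResult))
+        {
+            ContentType = "application/json",
+            CorrelationId = correlationId // propagated for end-to-end tracing
+        };
+
+        await _sender.SendMessageAsync(message);
+    }
+}
+```
+
+A corresponding worker (for example, one built on `ServiceBusProcessor`) would dequeue and persist these events at its own pace, so a slow or unavailable downstream dependency delays background processing rather than failing the client request.
+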
+ +### Design Recommendations + +- Key functionality should be deployed and managed as independent loosely-coupled microservices with event-driven interaction through well-defined interfaces (synchronous and asynchronous). + - The definition of microservice boundaries should consider and align with critical user-flows. + +- Use event-driven asynchronous communication where possible to support sustainable scale and optimal performance. + +- Separate the delivery of static and dynamic content to users and deliver relevant content from a cache to reduce load on backend services and optimize performance for end-users. + +- Given the strong recommendation (Network and Connectivity design area) to use Azure Front Door for global routing and WAF purposes, it is recommended to prioritize the use of Azure Front Door caching capabilities unless gaps exist. + +## Application-Level Resiliency Patterns and Error Handling + +A reliable application architecture is fundamental to maximizing reliability; however, if application code is not also developed with resiliency in mind, overall reliability will be severely constrained. It is therefore critical that application code be designed and developed to be resilient, ensuring that the application can respond to failure, which is ultimately an unavoidable characteristic of highly distributed multi-tenant cloud environments like Azure. + +More specifically, all application components should be designed from the ground-up to apply key resiliency patterns for self-healing, such as [retries with back-off](https://docs.microsoft.com/dotnet/architecture/microservices/implement-resilient-applications/implement-http-call-retries-exponential-backoff-polly) and [circuit breaker](https://docs.microsoft.com/dotnet/architecture/microservices/implement-resilient-applications/implement-circuit-breaker-pattern). Such patterns go to great lengths to transparently handle transient faults such as network packet loss, or the temporary loss of a downstream dependency. Ultimately, it is paramount that the application code cater for as many failure scenarios as possible in order to maximize service availability and reliability. + +When issues are not transient in nature and cannot be fully mitigated within application logic, it becomes the role of the health model and operational wrappers to take corrective action. However, for this to happen effectively, it is essential that application code incorporate proper instrumentation and logging to inform the health model and facilitate subsequent troubleshooting or root cause analysis when required. More specifically, application code should be implemented to facilitate [Distributed Tracing](https://docs.microsoft.com/dotnet/core/diagnostics/distributed-tracing-concepts), by providing the caller with a comprehensive error message that includes a correlation ID when a failure occurs. + +> Tools like [Azure Application Insights](https://docs.microsoft.com/azure/azure-monitor/app/distributed-tracing) can help significantly to query, correlate, and visualize application traces. + +### Design Considerations + +- Vendor provided SDKs, such as the Azure service SDKs, will typically provide built-in resiliency capabilities like retry mechanisms. + +- It is not uncommon for application responses to transient issues to cause cascading failures. + - For example, retrying without an appropriate back-off when a service is being throttled will likely exacerbate the issue (see the minimal retry sketch below).
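+
+To make the back-off consideration above concrete, the following is an illustrative sketch only (not code from the reference implementation), assuming the [Polly](http://www.thepollyproject.org/) library for C#; the status codes treated as transient and the retry counts are assumptions:
+
+```csharp
+using System;
+using System.Net.Http;
+using Polly;
+
+public static class ResiliencePolicies
+{
+    // Retry transient HTTP failures with exponential back-off plus jitter so that
+    // retries do not amplify load on an already throttled or degraded dependency.
+    public static IAsyncPolicy<HttpResponseMessage> TransientHttpRetry(int maxRetries = 3)
+    {
+        var jitter = new Random();
+
+        return Policy
+            .HandleResult<HttpResponseMessage>(response =>
+                (int)response.StatusCode == 429 || (int)response.StatusCode >= 500)
+            .Or<HttpRequestException>()
+            .WaitAndRetryAsync(
+                maxRetries,
+                attempt => TimeSpan.FromSeconds(Math.Pow(2, attempt))        // 2s, 4s, 8s ...
+                         + TimeSpan.FromMilliseconds(jitter.Next(0, 250)));  // jitter
+    }
+}
+
+// Hypothetical usage:
+// await ResiliencePolicies.TransientHttpRetry().ExecuteAsync(() => httpClient.GetAsync(requestUri));
+```
+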
+ +- Retry delays can be linearly spaced, or increase exponentially to 'backoff' via growing delays. + +- [Queue-Based Load Leveling](https://docs.microsoft.com/azure/architecture/patterns/queue-based-load-leveling) introduces a buffer between consumers and requested resources to ensure consistent load levels. + - As consumer requests are enqueued, a worker process dequeues the requests and processes them against the requested resource at a pace set by the worker and the requested resource's ability to process the requests. + - If consumers expect replies to their requests, a separate response mechanism will also need to be implemented. + +- The [Circuit Breaker](https://docs.microsoft.com/azure/architecture/patterns/circuit-breaker) design pattern provides stability by either waiting for recovery, or quickly rejecting requests rather than blocking while waiting for an unavailable remote service or resource. + +- The [Bulkhead](https://docs.microsoft.com/azure/architecture/patterns/bulkhead) design pattern strives to partition service instances into groups based on load and availability requirements, isolating failures to sustain service functionality. + +- The [Saga](https://docs.microsoft.com/azure/architecture/reference-architectures/saga/saga) pattern can be used to manage data consistency across microservices with independent datastores by ensuring services update each other through defined event or message channels. + - Each service performs local transactions to update its own state and publishes an event to trigger the next local transaction in the saga. + - If a service update fails, the saga executes compensating transactions to counteract preceding service update steps. + - Individual service update steps can themselves implement resiliency patterns, such as retry. + +### Design Recommendations + +- Design and develop application code to anticipate and handle failures. + +- Use vendor provided SDKs, such as the Azure SDKs, to connect to dependent services. + - Leverage the resiliency capabilities provided by utilized SDKs instead of re-implementing resiliency functionality. + - Ensure a suitable back-off strategy is applied when retrying failed dependency calls to avoid a self-inflicted DDoS scenario. + +- Define **common engineering criteria** for all application microservice teams to drive consistency and acceleration regarding the use of application-level resiliency patterns. + - Developers should familiarize themselves with [common software engineering patterns](https://docs.microsoft.com/azure/architecture/patterns/) for resilient applications. + +- Implement resiliency patterns using proven standardized packages, such as [Polly for C#](http://www.thepollyproject.org/) or [Sentinel for Java](https://github.com/alibaba/Sentinel). + +- Implement [Health Endpoint Monitoring](https://docs.microsoft.com/azure/architecture/patterns/health-endpoint-monitoring) by exposing functional checks within application code through health endpoints which external monitoring solutions can poll to retrieve application component health statuses. + - Responses should be interpreted alongside key operational metrics to inform application health and trigger operational responses, such as raising an alert or performing a compensating roll-back deployment. + +- Implement [Queue-Based Load Leveling](https://docs.microsoft.com/azure/architecture/patterns/queue-based-load-leveling) to smooth out intermittent heavy loads which can cause requested resources to time out or fail.
+  - Apply a prioritized ordering so that the most important activities are performed first.
+
+- Implement the [Retry](https://docs.microsoft.com/azure/architecture/patterns/retry) pattern to enable application code to handle transient failures elegantly and transparently.
+  - Cancel if the fault is unlikely to be transient and is unlikely to succeed if the operation is re-attempted.
+  - Retry if the fault is unusual or rare and the operation is likely to succeed if attempted again immediately.
+  - Retry after a delay if the fault is caused by a condition that may need a short time to recover, such as network connectivity or high load failures.
+  - Apply a suitable 'back-off' strategy with growing retry delays.
+
+- Implement the [Circuit Breaker](https://docs.microsoft.com/azure/architecture/patterns/circuit-breaker) design pattern to handle faults that might take a variable amount of time to recover from when connecting to a remote service or resource.
+
+- Use [Throttling](https://docs.microsoft.com/azure/architecture/patterns/throttling) to control the consumption of resources used by application components, protecting them from becoming over-encumbered.
+  - When a resource reaches a load threshold, it should safeguard its availability by deferring lower-importance operations and degrading non-essential functionality so that essential functionality can continue until sufficient resources are available to return to normal operation.
+
+- Consider the [Saga](https://docs.microsoft.com/azure/architecture/reference-architectures/saga/saga) pattern for scenarios where data consistency needs to be ensured across microservice boundaries.
+  - Roll back or compensate if one of the operations in the sequence fails.
+
+- Use correlation IDs for all trace events and log messages to tie them to a given request.
+  - Return correlation IDs to the caller for all calls, not just failed requests.
+
+- Use [structured logging](https://stackify.com/what-is-structured-logging-and-why-developers-need-it/) for all log messages.
+
+- Select a unified operational data sink for application traces, metrics, and logs to enable operators to seamlessly debug issues.
+  - Ensure operational data is used in conjunction with business requirements to inform an [application health model](./Health-Modeling.md).
+
+---
+
+|Previous Page|Next Page|
+|--|--|
+|[AlwaysOn Design Areas](./Design-Areas.md)|[Application Platform](./App-Platform.md)
+
+---
+
+|Design Methodology|
+|--|
+|[How to use the AlwaysOn Design Methodology](./README.md)
+|[AlwaysOn Design Principles](./Principles.md)
+|[AlwaysOn Design Areas](./Design-Areas.md)
+|[Application Design](./App-Design.md)
+|[Application Platform](./App-Platform.md)
+|[Data Platform](./Data-Platform.md)
+|[Health Modeling and Observability](./Health-Modeling.md)
+|[Deployment and Testing](./Deployment-Testing.md)
+|[Networking and Connectivity](./Networking.md)
+|[Security](./Security.md)
+|[Operational Procedures](./Operational-Procedures.md)
+
+---
+
+[AlwaysOn | Documentation Inventory](/docs/README.md) diff --git a/docs/design-methodology/App-Platform.md b/docs/design-methodology/App-Platform.md new file mode 100644 index 00000000..a1e644ff --- /dev/null +++ b/docs/design-methodology/App-Platform.md @@ -0,0 +1,485 @@ +# Application Platform
+
+The selection of an appropriate application hosting platform is a critical decision which has reverberations across the AlwaysOn design areas and encompassed design decisions.
Azure ultimately provides several highly available [computational platforms](https://docs.microsoft.com/azure/architecture/guide/technology-choices/compute-decision-tree) that differ in capability and complexity. It is therefore essential that key non-functional requirements surrounding reliability, availability, performance, and security are fully considered alongside other decision factors such as scalability, cost, operability, and complexity. For example, the scale-limits of a particular technology will have a key bearing on suitability as well as the overall AlwaysOn application design in terms of scale-unit definitions.
+
+This design area will therefore explore requisite decision factors and provide recommendations related to the selection, design, and configuration of an appropriate application hosting platform for an AlwaysOn application on Azure.
+
+- [Programming Language Selection](#programming-language-selection)
+- [Containerization](#containerization)
+- [Container Orchestration and Kubernetes](#container-orchestration-and-kubernetes)
+- [Serverless Compute](#serverless-compute)
+- [Asynchronous Messaging](#asynchronous-messaging)
+
+> An AlwaysOn application can use more than one application platform in parallel to support multiple composite workloads and microservices with distinct platform requirements.
+
+## Programming Language Selection
+
+Selecting the right programming languages and frameworks is a critical design decision for an AlwaysOn solution. Typically, this decision is driven by the availability of development skills or by the use of standardized technologies within an organization. However, given the reliability aspirations of an AlwaysOn application, it is essential to also evaluate the performance and resilience aspects of different languages/frameworks as well as the capability differences within required Azure SDKs.
+
+### Design Considerations
+
+- There are sometimes significant differences in the capabilities offered by Azure service SDKs in different languages, and this may influence the selection of an Azure service or programming language.
+  - For instance, if Cosmos DB is a significant dependency, 'Go' may not be an appropriate development language since there is no first-party SDK.
+
+- New features are typically added to the .NET and Java libraries first, and there can be a delay in feature availability for other [supported languages](https://azure.github.io/azure-sdk/).
+
+- An AlwaysOn application can use multiple programming languages or frameworks in parallel to support multiple composite workloads with distinct requirements.
+  - However, significant technology sprawl should be avoided since it introduces management complexity and operational challenges.
+
+### Design Recommendations
+
+- Evaluate all relevant Azure SDKs to confirm requisite capabilities are available for selected programming languages, ensuring alignment with non-functional requirements (see the sketch after this list).
+
+- Optimize the selection of programming languages and frameworks at the microservice level; embrace multiple technologies where appropriate.
+  - Avoid extensive technology sprawl to prevent unnecessary operational complexity.
+
+- Prioritize the .NET SDK to optimize reliability and performance since .NET Azure SDKs typically provide additional capabilities and receive new features first.
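+
+As an illustration of this evaluation step, the following is a minimal sketch, assuming the Azure Cosmos DB SDK for Python (`azure-cosmos`), that exercises a basic point-read path when validating SDK capability coverage for a candidate language. The endpoint, key, database, container, and item identifiers are placeholders rather than values from the reference implementation.
+
+```python
+from azure.cosmos import CosmosClient, exceptions
+
+# Placeholder values - substitute the endpoint and key for the target environment.
+ENDPOINT = "https://<account-name>.documents.azure.com:443/"
+KEY = "<account-key>"
+
+client = CosmosClient(ENDPOINT, credential=KEY)
+container = client.get_database_client("catalog").get_container_client("items")
+
+try:
+    # Point read by id and partition key - one of the capabilities worth confirming per SDK.
+    item = container.read_item(item="item-1", partition_key="item-1")
+    print(item["id"])
+except exceptions.CosmosResourceNotFoundError:
+    print("Item not found")
+```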
+
+## Containerization
+
+Containerization allows developers to create and deploy applications faster and more reliably by bundling application code together with related configuration files, libraries, and dependencies required for it to run. This single software package, the *container*, runs on a shared kernel abstracted from the host operating system, and as a result is highly portable, capable of running consistently across different infrastructure platforms or cloud providers.
+
+### Design Considerations
+
+- Containerization has become a major trend in software development since it provides measurable benefits for developers and operations teams as well as optimizing infrastructure utilization. More specifically, the benefits of containerizing application components include:
+  - **Improved infrastructure utilization**: Containers do not include operating system images, so they require fewer system resources. Multiple containers can therefore be hosted on the same virtualized infrastructure, and this helps to optimize resource utilization by consolidating on fewer resources with higher container density.
+  - **Portability**: Including all software dependencies within the container ensures that it will work across different operating systems regardless of runtimes or library versions. Containerized applications are therefore easier to move between application platforms due to the standardized container format.
+  - **Faster scaling operations**: Containers are lightweight and do not suffer from the slow start-up and shutdown times afflicting virtual machines, and since container images are pre-built, the start-up activity can be minimized to focus only on bootstrapping the application.
+  - **Simplified management**: The consistent portability and ephemeral nature of containers provides a simplified infrastructure management experience compared to traditional virtualized hosting.
+  - **Agile development**: Containers support accelerated development, test, and production cycles through consistent operation and less overhead.
+- The drawbacks of containerizing application components include:
+  - **Complex monitoring**: Monitoring services can find it harder to access applications running inside a container. Third-party software is typically required to collect and store container state indicators, such as CPU usage or RAM usage.
+  - **Security**: The hosting platform OS kernel is shared across multiple containers, creating a single point of attack. However, the risk of host VM access is limited since containers are isolated from the underlying operating system.
+
+- Containerization has proven to be an excellent option for packaging applications across different development languages, providing an abstraction layer for application code and its dependencies to achieve separation from the underlying hosting platform.
+
+- Containerization enables workloads to run on Azure without application code needing to be re-written.
+  - Provides a good opportunity to modernize legacy applications without significant code change, and can therefore be suitable for 'lift and shift' scenarios depending on the considered application frameworks and external dependencies.
+
+- While it is possible to store data within a running container's filesystem, the data will not persist when the container is recreated, so instead persistence is typically achieved by 'mounting' external storage; a brief sketch follows these considerations.
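+
+To illustrate the externalized-state point above, the following is a minimal sketch, assuming the Docker SDK for Python (`docker` package) and a local container engine, that runs a container with an external volume mount so data outlives the container itself; the image and paths are illustrative placeholders.
+
+```python
+import docker
+
+client = docker.from_env()
+
+# Mount external storage into the container so state survives container re-creation.
+container = client.containers.run(
+    "nginx:stable",                        # illustrative image
+    detach=True,
+    volumes={
+        "/data/app-state": {               # host or externally backed path
+            "bind": "/var/lib/app-state",  # path inside the container
+            "mode": "rw",
+        }
+    },
+)
+
+print(container.id)
+```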
+
+### Design Recommendations
+
+- Containerize all application components, using container images as the primary model for application deployment packages.
+- Prioritize Linux-based container runtimes when possible.
+- Avoid persisting state/data within a container since containers should be immutable and replaceable with short lifecycles.
+
+## Container Orchestration and Kubernetes
+
+There are several Azure application platforms capable of effectively hosting containers:
+
+- [Azure Kubernetes Service (AKS)](https://azure.microsoft.com/services/kubernetes-service/)
+- [Azure Container Instances (ACI)](https://azure.microsoft.com/services/container-instances/)
+- [Azure App Service](https://azure.microsoft.com/services/app-service/containers/)
+- [Azure Service Fabric](https://azure.microsoft.com/services/service-fabric/)
+- [Azure Red Hat OpenShift](https://azure.microsoft.com/services/openshift/)
+
+There are advantages and disadvantages associated with each of these Azure container platforms which should be analyzed in the context of business requirements to inform an optimal technical choice; each platform is an optimal choice for certain scenarios. However, given that the principles underpinning the AlwaysOn design methodology strive to optimize reliability, scalability, and performance, it is strongly recommended to prioritize the use of Azure Kubernetes Service.
+
+Azure Kubernetes Service (AKS) is Microsoft Azure's native managed Kubernetes service which allows for rapid Kubernetes cluster provisioning without complex cluster administration activities, and enhances standard Kubernetes with a rich feature set that includes advanced networking and identity capabilities. It is important to note that AKS is the target application compute platform across Azure engineering teams within Microsoft, as well as other Microsoft engineering organizations, such as Xbox and LinkedIn.
+
+For web and API-based workload scenarios, Azure App Service offers a feasible alternative to AKS, providing a low-friction container platform without the complexity of Kubernetes.
+
+> The design considerations and recommendations within this section will therefore focus on optimal AKS usage as well as App Service for low-scale scenarios.
+
+### Design Considerations
+
+**Azure Kubernetes Service**
+
+- There are many different container orchestrators, but Kubernetes has become the clear market leader and is best supported across the open source community and major cloud providers.
+  - Kubernetes expertise is readily available within the employment market.
+  - Kubernetes has a steep learning curve, so if development teams are new to it, non-trivial engineering investment will be required to set up and maintain a Kubernetes environment in a secure and reliable way.
+  - Kubernetes as well as managed Kubernetes offerings like AKS are widely available and can address concerns regarding vendor lock-in.
+
+- AKS provides a [control plane](https://kubernetes.io/docs/concepts/overview/components/) that is managed by Microsoft.
+  - By default the control plane of AKS is provided free of charge, but without any guaranteed SLA.
+  - Customers only manage and pay for the worker nodes which form the cluster.
+
+- The optional [AKS Uptime SLA](https://docs.microsoft.com/azure/aks/uptime-sla) provides availability guarantees for the Kubernetes control plane.
+  - 99.95% availability of the Kubernetes API server endpoint for AKS Clusters that use Azure Availability Zones.
+  - 99.9% availability for AKS Clusters that do not use Azure Availability Zones.
+
+- Some foundational configuration decisions have to be made upfront and cannot be changed without re-deploying an AKS cluster.
+  - Selection between public and private AKS clusters.
+  - Enabling Azure Network Policy.
+  - Azure AD integration and the use of Managed Identities for AKS instead of Service Principals.
+
+- AKS supports [three minor versions of Kubernetes](https://docs.microsoft.com/azure/aks/supported-kubernetes-versions).
+  - When a new minor version is introduced, the oldest supported minor version and patch releases are retired.
+  - The Kubernetes community releases minor versions roughly every three months.
+  - [AKS Kubernetes releases](https://docs.microsoft.com/azure/aks/supported-kubernetes-versions?tabs=azure-cli#aks-kubernetes-release-calendar) are aligned with the Kubernetes community and supported for 12 months.
+
+- AKS supports [updating node images](https://docs.microsoft.com/azure/aks/node-image-upgrade) to the newest OS and runtime versions without updating the Kubernetes version of the cluster or node pool.
+  - The AKS engineering team provides new images every week with the latest updates, including Linux or Windows patches.
+
+- AKS supports different [auto-upgrade channels](https://docs.microsoft.com/azure/aks/upgrade-cluster#set-auto-upgrade-channel) to upgrade AKS clusters to newer versions of Kubernetes and/or newer node images automatically once available.
+  - [Planned Maintenance](https://docs.microsoft.com/azure/aks/planned-maintenance) can be used to define maintenance windows for these operations.
+
+- AKS supports different network plugins. The [Azure CNI plugin](https://docs.microsoft.com/azure/aks/concepts-network#compare-network-models) is required to enable certain capabilities within AKS, such as Windows-based node pools or Kubernetes Network Policies.
+
+- AKS differentiates between [system node pools](https://docs.microsoft.com/azure/aks/use-system-pools#system-and-user-node-pools) and user node pools.
+  - AKS prefers scheduling system pods on node pools that are labelled as system.
+  - User node pools allow you to [scale down to 0](https://docs.microsoft.com/azure/aks/scale-cluster#scale-user-node-pools-to-0).
+
+- The [AKS Stop/Start cluster feature](https://docs.microsoft.com/azure/aks/start-stop-cluster) allows an AKS cluster in dev/test scenarios to be paused while maintaining cluster configuration, saving time and cost compared to re-provisioning.
+
+- Azure Monitor for containers (Container Insights) provides a seamless onboarding experience and enables various monitoring capabilities out of the box, as well as more advanced scenarios via its built-in [Prometheus scraping](https://docs.microsoft.com/azure/azure-monitor/insights/container-insights-prometheus-integration) support.
+
+- AKS offers integration with Azure AD to enable the use of Managed Identities for AKS as well as for node pools and individual pods, Role-Based Access Control (RBAC) using Azure AD credentials, and [authentication with Azure Container Registry (ACR)](https://docs.microsoft.com/azure/aks/cluster-container-registry-integration).
+
+- Default 'vanilla' Kubernetes requires significant configuration to ensure a suitable security posture for business-critical application scenarios.
+  - AKS addresses various security risks out of the box, such as support for private clusters, auditing and logging into Log Analytics, and hardened node images.
+  - Runtime Application Self-protection (RASP) tooling, such as Aquasec, can be deployed to AKS clusters to provide additional security functionality.
+
+- [Azure Policy](https://docs.microsoft.com/azure/aks/use-pod-security-on-azure-policy) can help to apply at-scale enforcements and safeguards to AKS clusters in a consistent centralized manner.
+  - Azure Policy can control which functions pods are granted and whether running pods contradict policy. This access is defined through built-in policies provided by the [Azure Policy Add-on for AKS](https://docs.microsoft.com/azure/governance/policy/concepts/policy-for-kubernetes).
+
+- AKS has certain [scale limits](https://docs.microsoft.com/azure/azure-resource-manager/management/azure-subscription-service-limits#azure-kubernetes-service-limits), such as the number of nodes and number of node pools per cluster, as well as the number of clusters per subscription.
+
+- Using [Azure Spot VMs](https://azure.microsoft.com/services/virtual-machines/spot/) for AKS node pools takes advantage of unused Azure capacity at a significant cost saving.
+
+**Azure App Service**
+
+- SNAT port exhaustion is a common failure scenario with Azure App Service, which can be predicted through load testing while monitoring ports using Azure Diagnostics.
+  - SNAT ports are used when making outbound connections to public IP addresses; a connection re-use sketch follows these considerations.
+
+- TCP port exhaustion is a further common failure scenario, which occurs when the sum of outbound connections from a given worker exceeds the capacity. The number of available TCP ports depends on the size of the worker, as captured in the following table:
+
+  |Worker size|Small (B1, S1, P1, I1)|Medium (B2, S2, P2, I2)|Large (B3, S3, P3, I3)|
+  |---------|---------|---------|---------|
+  |TCP ports|1920|3968|8064|
+
+- Azure App Service has a default limit of 30 instances per App Service Plan.
+  - This default limit can be increased by opening a support ticket if the App Service routinely uses 15 or more instances.
+
+- Per-app scaling can be enabled at the App Service Plan level to allow an application to scale independently from the App Service plan that hosts it. For example, an App Service Plan can be scaled to 10 instances, but an app can be set to use only 5.
+  - Apps are allocated to available nodes using a best-effort approach for an even distribution. While an even distribution is not guaranteed, the platform will make sure that two instances of the same app are not hosted on the same worker.
+
+- There are a number of events that can lead App Service workers to restart, such as content deployment, App Settings changes, and Virtual Network integration configuration changes.
+
+- App Service plan autoscale will scale out if any rule within the profile is met, but will only scale in if all rules within the profile are met.
+
+- The App Service Premium (v3) Plan has a 20% discount versus comparable Pv2 configurations.
+  - Reserved Instance commitment (1Y, 3Y, Dev/Test) discounts are available for App Services running in the Premium v3 plan.
+
+- Diagnostic logging provides the ability to ingest application and platform level logs into either Log Analytics, Azure Storage, or a third-party tool via Event Hub.
+
+- Application performance monitoring with Application Insights provides deep insights into application performance.
+  - For Linux Plans a code-based enablement (SDK) is required.
+  - For Windows Plans a 'codeless deployment' approach is possible to quickly get insights without changing any code.
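+
+As a concrete illustration of the SNAT consideration above, the following is a minimal sketch, assuming Python and the `requests` library, of re-using a pooled HTTP session for outbound calls rather than opening a new connection per request; the downstream endpoint and helper name are placeholders.
+
+```python
+import requests
+
+# A single module-level session re-uses underlying TCP connections (keep-alive),
+# which helps conserve SNAT ports when calling public endpoints from App Service workers.
+session = requests.Session()
+adapter = requests.adapters.HTTPAdapter(pool_connections=10, pool_maxsize=10)
+session.mount("https://", adapter)
+
+
+def get_order(order_id: str) -> dict:
+    # Placeholder downstream endpoint - prefer Private/Service Endpoints where possible.
+    response = session.get(f"https://api.example.com/orders/{order_id}", timeout=5)
+    response.raise_for_status()
+    return response.json()
+```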
+ +### Design Recommendations + +**Azure Kubernetes Service** + +- Use Azure Kubernetes Service (AKS) as the primary application hosting platform where requirements allow. + +- Deploy [AKS clusters across different Azure regions](https://docs.microsoft.com/azure/aks/operator-best-practices-multi-region#plan-for-multiregion-deployment) as a scale-unit to maximize reliability and availability. + +- Configure the use of AKS node pools to maximize reliability. + - Utilize Virtual Machine Scale Set (VMSS) with Standard Load Balancer (SLB) configuration format. + - Use [Availability Zones](https://docs.microsoft.com/azure/aks/availability-zones) to maximize resilience within an Azure region by distributing AKS control plane and agent nodes across physically separate datacenters. + - Where co-locality latency requirements exist, either a VMSS-based AKS deployment within a single zone or [proximity placement groups](https://docs.microsoft.com/azure/aks/reduce-latency-ppg) should be used to minimize inter-node latency. + - Ensure the System node pool is isolated from application workloads. + - Use dedicated node pools for infrastructure components and tools that require high resource utilization, such as Istio, to avoid noisy neighbor scenarios. + - Alternatively, ensure special scale or load behavior is defined. + - Separate distinct application workloads to dedicated node pools based on workload requirements, considering requirements for specialized infrastructure resources such as GPU, high memory VMs, or Spot VMs. + - Consider scaling user node pools to 0. + - Avoid deploying large numbers of node pools to reduce additional management overhead. + - Use [taints and tolerations](https://docs.microsoft.com/azure/aks/operator-best-practices-advanced-scheduler#provide-dedicated-nodes-using-taints-and-tolerations) to provide dedicated nodes and limit resource intensive applications. + - For high scale scenarios, consider the use of [Virtual Nodes](https://docs.microsoft.com/azure/aks/virtual-nodes-cli) ([vKubelet](https://github.com/virtual-kubelet/virtual-kubelet)) with ACI for extensive and rapid scale. + - Consider using [Azure Spot VMs](https://docs.microsoft.com/azure/aks/spot-node-pool) for low-criticality workloads that can handle interruptions, early terminations, or evictions. + - For example, development and testing environments may be good candidates for a spot node pool. + - Evaluate application affinity and anti-affinity requirements and configure the appropriate colocation of containers on nodes. + +- Avoid modifying resources within the [node resource group](https://docs.microsoft.com/azure/aks/faq#why-are-two-resource-groups-created-with-aks) ('MC_') + - If absolutely necessary, changes should only be done at [cluster creation time](https://docs.microsoft.com/azure/aks/faq#can-i-provide-my-own-name-for-the-aks-node-resource-group), or with assistance from Azure Support. + +- Enable [cluster autoscaler](https://docs.microsoft.com/azure/aks/cluster-autoscaler) to automatically adjust the number of agent nodes in response to resource constraints. + +- Utilize the [Horizontal pod autoscaler](https://docs.microsoft.com/azure/aks/concepts-scale#horizontal-pod-autoscaler) to adjust the number of pods in a deployment depending on CPU utilization or other selected metrics. + +- Define [pod resource requests and limits](https://docs.microsoft.com/azure/aks/developer-best-practices-resource-management#define-pod-resource-requests-and-limits) in application deployment manifests. 
+ +- Utilize the [AKS Uptime SLA](https://docs.microsoft.com/azure/aks/uptime-sla) for production clusters to maximize Kubernetes API endpoint availability guarantees. + +- Ensure proper selection of network plugin based on network requirements and cluster sizing. + - Use [Azure Network Policies](https://docs.microsoft.com/azure/aks/use-network-policies) or Calico to control traffic between pods. + - This requires the CNI Network Plug-in. + +- Harden the AKS cluster to remove critical security risks associated with Kubernetes deployments. + - Use [Pod Identities](https://docs.microsoft.com/azure/aks/operator-best-practices-identity#use-pod-identities) and [Secrets Store CSI Driver](https://github.com/Azure/secrets-store-csi-driver-provider-azure#usage) with [Azure Key Vault](https://azure.microsoft.com/services/key-vault/) to protect secrets, certificates, and connection strings. + - Use [Managed Identities](https://docs.microsoft.com/azure/aks/use-managed-identity) to avoid having to manage and rotate service principal credentials. + - Utilize [AAD integration](https://docs.microsoft.com/azure/aks/managed-aad) to take advantage of centralized account management and passwords, application access management, and identity protection. + - Use Kubernetes RBAC with AAD for [least privilege](https://docs.microsoft.com/azure/aks/azure-ad-rbac), and minimize granting administrator privileges to protect configuration and secrets access. + - Limit access to the [Kubernetes cluster configuration](https://docs.microsoft.com/azure/aks/control-kubeconfig-access) file with Azure role-based access control. + - Limit access to [actions that containers can perform](https://docs.microsoft.com/azure/aks/developer-best-practices-pod-security#secure-pod-access-to-resources). + - Provide the least number of permissions, and avoid the use of root / privileged escalation. + - Evaluate the use of the built-in [AppArmor security module](https://docs.microsoft.com/azure/aks/operator-best-practices-cluster-security#app-armor) to limit actions that containers can perform such as read, write, or execute, or system functions such as mounting filesystems. + - Evaluate the use of the [seccomp (secure computing)](https://docs.microsoft.com/azure/aks/operator-best-practices-cluster-security#secure-computing), which works at the process level to limit the process calls that containers can perform. + +- Utilize [Azure Monitor and Application Insights](https://docs.microsoft.com/azure/azure-monitor/insights/container-insights-overview) to centrally collect metrics, logs, and diagnostics from AKS resources for troubleshooting purposes. + - Enable and review [Kubernetes master node logs](https://docs.microsoft.com/azure/aks/view-master-logs). + - Configure the [scraping of Prometheus metrics](https://docs.microsoft.com/azure/azure-monitor/insights/container-insights-prometheus-integration) with Azure Monitor for containers. + +- Subscribe to the public [AKS Roadmap and Release Notes](https://github.com/azure/aks) on GitHub to stay up-to-date on upcoming changes, improvements, and most importantly Kubernetes version releases or the deprecation of old releases. + +- Regularly upgrade to a supported version of Kubernetes. + - Establish a governance process to check and upgrade as needed to not fall out of support. + - Leverage the AKS Cluster auto-upgrade with Planned Maintenance. + - Regularly process node image updates to remain current with new AKS images. 
+ +- Ensure AKS subscription [scale limits](https://docs.microsoft.com/azure/azure-resource-manager/management/azure-subscription-service-limits#azure-kubernetes-service-limits) are appropriately considered when designing the AKS deployment model to ensure requisite scalability. + +- Apply configuration guidance provided within the [AKS security baseline](https://docs.microsoft.com/security/benchmark/azure/baselines/aks-security-baseline). + +- Consider and apply the guidance provided within the [AKS checklist](https://www.the-aks-checklist.com/) to ensure alignment with Well-Architected best practice guidance. + +- Leverage a declarative deployment approach using either Terraform, ARM or BICEP. + - Ensure all deployments are repeatable and traceable within a source-code repository that can be combined with a GitOps based approach. + +- Store container images within [Azure Container Registry](https://azure.microsoft.com/services/container-registry/). + - Enable [geo-replication](https://docs.microsoft.com/azure/aks/operator-best-practices-multi-region#enable-geo-replication-for-container-images) to replicate container images across all leveraged AKS regions. + - Enable [Azure Defender for container registries](https://docs.microsoft.com/azure/security-center/defender-for-container-registries-introduction) to provide vulnerability scanning for container images. + - Authenticate using Azure AD to access Azure Container Registry. + + - Establish a consistent reliability and security baseline for AKS cluster and [pod](https://docs.microsoft.com/azure/aks/use-pod-security-on-azure-policy) configurations using [Azure Policy](https://docs.microsoft.com/azure/governance/policy/overview). + - Use the [Azure Policy Add-on for AKS](https://docs.microsoft.com/azure/governance/policy/concepts/policy-for-kubernetes) to control pod functions, such as root privileges, and disallow pods which do not conform to policy. + - Policy assignments should be enforced at a subscription scope or higher to drive consistency across development teams. + + > When deploying into an Azure Landing Zone, requisite Azure Policy to ensure the consistent reliability and security should be delivered by the Enterprise Scale platform through policy assignments within the Landing Zone definition. + + > The [foundational reference implementation](https://github.com/Azure/AlwaysOn/blob/main/docs/reference-implementation/README.md) provides a suite of baseline AlwaysOn policies to drive recommended reliability and security configurations. + +**Azure App Service** + +- For lower scale workload scenarios Azure App Services can provide a feasible alternative to AKS, without the complexities associated with administering Kubernetes. + - Consider and plan for future scalability requirements and application growth so that a strategic technology decision can be applied from the start, avoiding future technical migration debt as the solution grows. + - If the lack of requisite Kubernetes expertise presents an unacceptable delivery risk, consider Azure App Service as an alternative container platform. + +- Leverage the container based deployment model for App Service Plans. + +- Use Premium V3 plans with 2 or more worker instances for high availability. + +- Use Linux App Service Plans to optimize performance and costs. 
+
+- Deploy App Service Plans in an [Availability Zone configuration](https://azure.github.io/AppService/2021/08/25/App-service-support-for-availability-zones.html) to ensure worker nodes are distributed across zones within a region.
+
+- Deploy Azure App Service Plans across multiple regions as a scale-unit, using multiple scale-units deployed within a single region to navigate the default limit of 30 instances per App Service Plan.
+  - Consider opening a support ticket to increase the maximum number of workers to twice the instance count required to serve normal peak load.
+
+- Evaluate TCP and SNAT port usage to avoid outbound connection errors.
+  - Predictively detect SNAT port exhaustion through load testing while monitoring ports using Azure Monitor; if SNAT errors occur, it is necessary to either scale across more/larger workers, or implement coding practices to help preserve and re-use SNAT ports, such as connection pooling or the lazy loading of resources.
+
+  - It is recommended not to exceed 100 simultaneous outbound connections to a public IP Address per worker, and to avoid communicating with downstream services via public IP addresses when a [Private Endpoint](https://docs.microsoft.com/azure/private-link/private-endpoint-overview) or [Service Endpoint](https://docs.microsoft.com/azure/virtual-network/virtual-network-service-endpoints-overview) could be used.
+
+- Avoid unnecessary worker restarts.
+  - Make changes within a deployment slot other than the slot currently configured to accept production traffic. After workers are recycled and warmed up, a 'swap' can be performed without unnecessary downtime.
+
+- Enable [AutoHeal](https://azure.github.io/AppService/2021/04/21/Announcing-Autoheal-for-Azure-App-Service-Linux.html) to automatically recycle unhealthy workers.
+
+- Enable [Health Check](https://aka.ms/appservicehealthcheck) to identify non-responsive workers.
+  - While any health check is better than none at all, the logic behind endpoint tests should assess all critical downstream dependencies to ensure overall health.
+
+- Enable [AutoScale](https://docs.microsoft.com/azure/azure-monitor/platform/autoscale-get-started?toc=/azure/app-service/toc.json) to ensure adequate resources are available to service requests.
+  - Use a scale-out and scale-in rule combination to ensure auto-scale can take action to both scale out and scale in.
+  - Understand the behavior of multiple scaling rules in a single profile.
+
+- Enable [Diagnostic Logging](https://docs.microsoft.com/Azure/app-service/troubleshoot-diagnostic-logs) to provide insight into application and platform behavior.
+
+- Enable [Application Insights Alerts](https://docs.microsoft.com/Azure/azure-monitor/app/azure-web-apps) to be made aware of fault conditions.
+
+- Review [Azure App Service diagnostics](https://docs.microsoft.com/azure/app-service/overview-diagnostics) to ensure common problems are addressed.
+  - It is a good practice to regularly review service-related diagnostics and recommendations, taking action as appropriate.
+
+- Evaluate [per-app scaling](https://docs.microsoft.com/azure/app-service/manage-scale-per-app) for high-density hosting on Azure App Service Plans.
+
+## Serverless Compute
+
+Serverless computing provides compute resources on-demand and eliminates the need to manage infrastructure altogether; the cloud provider automatically provisions, scales, and manages resources required to run deployed application code.
+
+Microsoft Azure provides several serverless compute platforms:
+
+- [Azure Functions](https://docs.microsoft.com/azure/azure-functions/functions-overview): Application logic is implemented as distinct blocks of code ("functions") which run in response to events, such as an HTTP request or queue message, with each function scaling as necessary to meet demand.
+- [Azure Logic Apps](https://docs.microsoft.com/azure/logic-apps/logic-apps-overview): Platform for creating and running automated workflows that integrate various apps, data sources, services, and systems. Similar to Azure Functions, there are built-in triggers for event-driven processing, but instead of deploying application code, Logic Apps can be composed using a graphical user interface which supports code blocks like conditionals and loops.
+- [Azure API Management](https://azure.microsoft.com/services/api-management/): Publish, secure, transform, maintain, and monitor APIs using the Consumption tier.
+- [Power Apps & Power Automate](https://docs.microsoft.com/powerapps/powerapps-overview): Provides a 'low-code/no-code' development experience, with simple workflow logic and integrations configurable through connections in a user interface. Developed Power Apps can subsequently be deployed to a Microsoft 365 tenant and consumed from either a web browser or the Power Apps mobile client.
+
+In the context of a reliable AlwaysOn application platform, serverless technologies provide a near-zero friction development and operational experience, which can be highly valuable for simple business process scenarios. However, this relative simplicity comes at the cost of flexibility in terms of scalability, reliability, and performance, which is likely unacceptable for most business-critical application scenarios.
+
+The AlwaysOn design methodology therefore positions serverless technologies as an alternative platform for simple business process scenarios which do not share the same stringent business requirements as critical system flows.
+
+> The design considerations and recommendations within this section will focus on optimal Azure Functions and Azure Logic Apps usage as alternative platforms for non-critical workflow scenarios.
+
+### Design Considerations
+
+**Azure Functions**
+
+- In most cases Azure Functions do not require additional code to call external services or to enable external events to trigger function execution, since these can be achieved with Azure Functions [Bindings](https://docs.microsoft.com/azure/azure-functions/functions-triggers-bindings).
+  - Azure Functions supports multiple triggers, such as the HTTP trigger, and bindings for Azure services, such as Azure Cosmos DB, Azure Service Bus, and Azure Blob Storage.
+
+- There are 3 [hosting plans](https://docs.microsoft.com/azure/azure-functions/functions-scale) available for Azure Functions:
+  - *Consumption* is the fully serverless pay-per-use option, with instances dynamically added and removed based on the number of incoming events; underlying compute resources are charged only when running.
+  - *Premium* uses a Premium SKU App Service plan to host functions and allows the configuration of compute instance size. Additionally, it is possible to set up a number of pre-warmed instances to eliminate cold starts.
+    - There will always be at least one billed instance in the Premium plan.
+  - *Dedicated* is the least serverless option as it is tied to a provisioned App Service plan or App Service Environment.
Autoscale can be enabled, but scale operations are slower than with the Consumption and Premium plans.
+
+- Fully serverless hosting options, which help optimize costs by de-provisioning allocated resources when workloads are not running, may incur "cold start" delays, especially for applications comprised of many files, such as Node.js or PHP applications.
+
+- [Azure Function Premium](https://docs.microsoft.com/azure/azure-functions/functions-premium-plan#region-max-scale-out) has a limit of 100 instances in the Windows tier and 20 instances in the Linux tier.
+
+**Azure Logic Apps**
+
+- There are 3 [deployment modes](https://docs.microsoft.com/azure/logic-apps/single-tenant-overview-compare) available for Azure Logic Apps:
+  - *Consumption* is the fully serverless pay-per-use model, with Azure managing the infrastructure which is shared across multiple tenants.
+    - A single logic app can have only one workflow.
+  - *Consumption (ISE)* uses the dedicated Integration Service Environment (ISE) to privately host logic apps.
+    - A single logic app can have only one workflow.
+  - *Standard* uses the containerized single-tenant Azure Logic Apps runtime based on Azure Functions.
+    - Each logic app can have multiple stateful and stateless workflows.
+
+- Similar to Azure Functions, there are built-in triggers for event-driven processing; however, instead of deploying application code, Logic Apps can be composed using a graphical user interface which supports blocks like conditionals and loops.
+
+- With the Standard deployment model, default limits can be modified; however, some limits have upper maximums.
+  - For Consumption plans, Azure manages the default configuration limits, but some values can be changed through the creation of a support ticket.
+
+### Design Recommendations
+
+**Azure Functions**
+
+- Consider Azure Functions for simple business process scenarios which do not have the same stringent business requirements as business-critical system flows.
+  - Low-criticality scenarios can also be hosted as separate containers within AKS to drive consistency, provided affinity and anti-affinity requirements are fully considered when collocating containers on nodes.
+
+- Azure Functions should perform distinct operations that run as fast as possible.
+  - This makes them very flexible and highly scalable, ensuring they will work well in event-driven workloads with short-lived processes.
+
+- Use the Premium Function Linux hosting plan to maximize reliability and performance while maintaining the serverless promise.
+
+- Take a scale-unit approach to navigate the resource limit of 20 Linux nodes.
+
+- When using the HTTP trigger to expose an external endpoint, protect the HTTP endpoint from common external attack vectors using a Web Application Firewall (WAF); a minimal HTTP-triggered function sketch follows these recommendations.
+
+- For internal workloads, consider the use of Service Endpoints or Private Endpoints to restrict access to private Virtual Networks.
+  - If required, use Private Endpoints to mitigate data exfiltration risks, such as malicious admin scenarios.
+
+- Treat Azure Functions code just like any other code; subject it to code scanning tools that are integrated with AlwaysOn CI/CD pipelines.
+
+**Azure Logic Apps**
+
+- Use the Standard deployment mode to ensure a single-tenant deployment and mitigate 'noisy neighbor' scenarios.
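+
+Returning to the Azure Functions recommendations above, and in particular the HTTP trigger guidance, the following is a minimal sketch of an HTTP-triggered function using the Azure Functions Python library (`azure-functions`); the route, parameter, and response shape are illustrative, and the accompanying `function.json` trigger binding is assumed rather than shown.
+
+```python
+import json
+import logging
+
+import azure.functions as func
+
+
+def main(req: func.HttpRequest) -> func.HttpResponse:
+    """HTTP-triggered function; the trigger binding is defined in function.json (assumed)."""
+    logging.info("Processing order request")
+
+    order_id = req.params.get("orderId")
+    if not order_id:
+        return func.HttpResponse("Missing 'orderId' parameter", status_code=400)
+
+    # Illustrative payload - real processing would call downstream services via bindings or SDKs.
+    body = json.dumps({"orderId": order_id, "status": "accepted"})
+    return func.HttpResponse(body, status_code=202, mimetype="application/json")
+```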
+
+## Asynchronous Messaging
+
+The recommendation for a loosely-coupled microservice architecture relies heavily on asynchronous messaging to provide requisite integration between application components through a well-defined message bus. Azure provides several native messaging services which can be used to facilitate asynchronous message interfaces, including **Azure Event Hub**, **Azure Service Bus**, **Azure Storage Queues**, and **Azure Event Grid**.
+
+This section will therefore explore key decision factors when selecting an appropriate Azure messaging service, and how to optimally configure each service in the context of a mission-critical workload scenario to maximize reliability.
+
+### Design Considerations
+
+- More than one type of messaging service can be used by an application for different workload scenarios.
+
+- There are many factors to consider when selecting an optimal messaging service for a specific scenario.
+  - [Asynchronous Messaging on Azure](https://docs.microsoft.com/azure/architecture/guide/technology-choices/messaging)
+  - [Azure Storage Queues vs Azure Service Bus Queues](https://docs.microsoft.com/azure/service-bus-messaging/service-bus-azure-and-service-bus-queues-compared-contrasted)
+  - [Comparison between Event Grid, Event Hubs and Service Bus](https://docs.microsoft.com/azure/event-grid/compare-messaging-services)
+  - [Azure Messaging Services - How to Choose the Right Messaging Technology in Azure](https://medium.com/walmartglobaltech/azure-messaging-services-how-to-choose-the-right-messaging-technology-in-azure-eab610b5e986)
+
+**Azure Event Hub**
+
+- Azure Event Hubs is designed as an **event streaming** service that can scale to millions of messages (i.e., "events") per second.
+
+- Event Hubs supports message sizes of up to 1 MB.
+
+- Event Hubs supports Availability Zones (AZs) for zonal redundancy within supported regions in all pricing tiers.
+
+- Azure Event Hub Namespaces support configurable failover replication between regions; however, only configuration metadata is replicated, **not the messages themselves**.
+  - Configuration that is being replicated includes: Event Hubs (inside a Namespace), Consumer Groups, Namespace Authorization Rules, and Event Hubs Authorization Rules.
+
+- Azure Event Hubs can be configured to [capture raw data into Azure Blob Storage or Azure Data Lake Storage](https://docs.microsoft.com/azure/event-hubs/event-hubs-capture-overview).
+  - Event Hub Capture to Azure Storage or Data Lake Storage records the captured data within .avro files.
+
+- Data can be processed from Event Hubs by a client using the [Event Processor SDK](https://docs.microsoft.com/azure/event-hubs/event-processor-balance-partition-load), or by [Azure Stream Analytics](https://docs.microsoft.com/azure/event-hubs/process-data-azure-stream-analytics).
+
+- Event Hubs can be deployed as a dedicated-tier Event Hubs cluster for high throughput and a 99.99% SLA.
+  - Premium tier also offers a 99.99% SLA.
+  - The Basic and Standard tiers for a single tenant provide a 99.95% SLA.
+
+- Event Hubs provides an Apache Kafka-compatible [messaging interface](https://docs.microsoft.com/azure/event-hubs/event-hubs-for-kafka-ecosystem-overview).
+
+**Azure Service Bus**
+
+- Azure Service Bus provides **reliable asynchronous message delivery** that requires polling.
+
+- Service Bus supports Availability Zones (AZs) for zonal redundancy within supported regions in the Premium tier.
+
+- Service Bus supports message sizes of 256 KB in the Basic and Standard tiers and 1 MB in the Premium tier.
+
+- Service Bus offers an SLA of 99.9% on send and receive operations.
+
+**Azure Storage Queues**
+
+- Azure Storage Queues provide a simple messaging solution which can be accessed using a REST API.
+
+- Storage Queues support message sizes of up to 64 KB.
+
+- Storage queues do not guarantee message ordering.
+
+- Maximum throughput is limited to up to 20,000 messages per second (assuming 1 KB message size) per storage account.
+  - Target throughput for a single queue (1 KB messages) is limited to 2,000 messages per second.
+  - Throttling will occur if this limit is reached.
+
+- Storage queues support a total size of up to 500 TB.
+
+- Through the geo-replication feature of Azure Storage Accounts, Storage Queues can be configured to asynchronously replicate to another region.
+
+- Storage Queues provide the same SLA as their underlying Storage Accounts.
+  - 99.99% SLA on read requests for RA-GRS and 99.9% for write requests.
+
+**Azure Event Grid**
+
+- Azure Event Grid is designed as an **event distribution** service for reactive programming.
+
+- Event Grid integrates with many Azure services as event sources.
+  - For example, Event Grid can be configured to react to status changes on Azure resources.
+
+- Event Grid supports message sizes of up to 1 MB.
+
+- Event Grid provides an SLA of 99.99% for message publishing.
+
+### Design Recommendations
+
+- Prioritize the use of Event Hub for scenarios which require high throughput and which can work with message ordering on a per-partition basis.
+
+- Use Service Bus for scenarios requiring a higher QoS and message guarantees, such as those implemented through two-phase commits.
+  - Use Service Bus Premium in a zone-redundant configuration to provide high availability within a region.
+
+- Consider Storage Queues when message geo-replication is required, provided the message size is less than 64 KB.
+  - Use an AZ-redundant tier for the underlying Storage Account (ZRS or GZRS).
+
+- Use Event Grid for scenarios where services need to react to changes in another service/component.
+
+---
+
+|Previous Page|Next Page|
+|--|--|
+|[Application Design](./App-Design.md)|[Data Platform](./Data-Platform.md)
+
+---
+
+|Design Methodology|
+|--|
+|[How to use the AlwaysOn Design Methodology](./README.md)
+|[AlwaysOn Design Principles](./Principles.md)
+|[AlwaysOn Design Areas](./Design-Areas.md)
+|[Application Design](./App-Design.md)
+|[Application Platform](./App-Platform.md)
+|[Data Platform](./Data-Platform.md)
+|[Health Modeling and Observability](./Health-Modeling.md)
+|[Deployment and Testing](./Deployment-Testing.md)
+|[Networking and Connectivity](./Networking.md)
+|[Security](./Security.md)
+|[Operational Procedures](./Operational-Procedures.md)
+
+---
+
+[AlwaysOn | Documentation Inventory](/docs/README.md) diff --git a/docs/design-methodology/Data-Platform.md b/docs/design-methodology/Data-Platform.md new file mode 100644 index 00000000..58b6e808 --- /dev/null +++ b/docs/design-methodology/Data-Platform.md @@ -0,0 +1,803 @@ +# Data platform
+
+The selection of an effective application data platform is a further crucial decision area which has far-reaching implications across other AlwaysOn design areas. Azure ultimately offers a multitude of relational, non-relational, and analytical data platforms which differ greatly in capability.
It is therefore essential that key non-functional requirements be fully considered alongside other decision factors such as consistency, operability, cost, and complexity. For example, the ability to operate in a multi-write configuration will have a critical bearing on suitability for a globally available platform.
+
+This section will therefore expand on the AlwaysOn application design, providing key considerations and recommendations aligned with critical design themes to inform the selection of an optimal data platform.
+
+- [The Four Vs of Big Data](#the-four-vs-of-big-data)
+- [Globally Distributed Multi-Write Datastore](#globally-distributed-multi-write-datastore)
+- [Relational Data Technologies](#relational-data-technologies)
+- [Caching for Hot Tier Data](#caching-for-hot-tier-data)
+- [Analytical Scenarios](#analytical-scenarios)
+
+## The Four Vs of Big Data
+
+The 'Four Vs of Big Data' provide a framework to better understand requisite characteristics for an AlwaysOn data platform, and how data can be used to maximize business value. This section will therefore explore how the Volume, Velocity, Variety, and Veracity characteristics can be applied at a conceptual level to help design an AlwaysOn data platform using appropriate data technologies.
+
+- **V**olume: the amount of data coming in, which informs storage capacity and tiering requirements - i.e., the size of the dataset.
+- **V**elocity: the speed at which data is processed, either as batches or continuous streams - i.e., the rate of flow.
+- **V**ariety: the organization and format of data, capturing structured, semi-structured, and unstructured formats - i.e., data across multiple stores or types.
+- **V**eracity: includes the provenance and curation of considered data sets for governance and data quality assurance - i.e., accuracy of the data.
+
+### Design Considerations
+
+**Volume**
+
+- Existing (if any) and expected future data volumes based on forecasted data growth rates aligned with business objectives and plans.
+  - Data volume should encompass the data itself as well as indexes, logs, telemetry, and other applicable datasets.
+  - Large business-critical and mission-critical applications typically generate and store high volumes (GB and TB) on a daily basis.
+  - There can be significant cost implications associated with data expansion.
+
+- Data volume may fluctuate due to changing business circumstances or housekeeping procedures.
+
+- Data volume can have a profound impact on data platform query performance.
+
+- There can be a profound impact associated with reaching data platform volume limits.
+  - *Will it result in downtime? And if so, for how long?*
+  - *What are the mitigation procedures? And will mitigation require application changes?*
+  - *Will there be a risk of data loss?*
+
+- Features such as Time to Live (TTL) can be used to manage data growth by automatically deleting records after an elapsed time, measured from either record creation or modification.
+  - For example, Azure Cosmos DB provides a built-in [TTL](https://docs.microsoft.com/azure/cosmos-db/sql/time-to-live) capability.
+
+**Velocity**
+
+- The speed with which data is emitted from various application components, and the throughput requirements for how fast data needs to be committed and retrieved, are critical to determining an optimal data technology for key workload scenarios.
+  - The nature of throughput requirements will differ by workload scenario, such as those that are read-heavy or write-heavy.
+ - For example, analytical workloads will typically need to cater to a large read throughput. + - *What is the required throughput? And how is throughput expected to grow?* + - *What are the data latency requirements at P50/P99 under reference load levels?* + +- Capabilities such as supporting a lock-free design, index tuning, and consistency policies are critical to achieving high-throughput. + - Optimizing configuration for high throughput incurs trade-offs which should be fully understood. + - Load-levelling persistence and messaging patterns, such as CQRS and Event Sourcing, can be used to further optimize throughput. + +- Load levels will naturally fluctuate for many application scenarios, with natural peaks requiring a sufficient degree of elasticity to handle variable demand while maintaining throughput and latency. + - Agile scalability is key to effectively support variable throughput and load levels without overprovisioning capacity levels. + - Both read and write throughput must scale according to application requirements and load. + - Both vertical and horizontal scale operations can be applied to respond to changing load levels. + +- The impact of throughput dips can vary based on workload scenario. + - *Will there be connectivity disruption?* + - *Will individual operations return failure codes while the control plane continues to operate?* + - *Will the data platform activate throttling, and if so for how long?* + +- The fundamental AlwaysOn application design recommendation to use an active-active geographical distribution introduces challenges where data consistency is concerned. + - There is a trade-off between consistency and performance with regards to full ACID transactional semantics and traditional locking behavior. + - Minimizing write latency will come at the cost of data consistency. + +- In a multi-region write configuration, changes will need to be synchronized and merged between all replicas, with conflict resolution where required, and this may impact performance levels and scalability. + +- Read-only replicas (intra-region and inter-region) can be used to minimize roundtrip latency as well as distributing traffic to boost performance, throughput, availability, and scalability. + +- A caching layer can be used to increase read throughput to improve user experience and end-to-end client response times. + - Cache expiration times and policies need to be considered to optimize data recentness. + +**Variety** + +- The data model, data types, data relationships, and intended query model will strongly affect data platform decisions. + - *Does the application require a relational data model or can it support a variable-schema or non-relational data model?* + - *How will the application query data? And will queries depend on database-layer concepts such as relational joins? Or does the application provide such semantics?* + +- The nature of datasets considered by the application can be varied, from unstructured content such as images and videos, or more structured files such as CSV and Parquet. + - Composite application workloads will typically have distinct datasets and associated requirements. + +- In addition to relational or non-relational data platforms, graph or key-value data platforms may also be suitable for certain data workloads. + - Some technologies cater to variable-schema data models, where data items are semantically similar and/or stored and queried together but differ structurally. 
+ +- In a microservice architecture, individual application services can be built with distinct scenario-optimized datastores rather than depending on a single monolithic datastore. + - Design patterns such as [SAGA](https://docs.microsoft.com/azure/architecture/reference-architectures/saga/saga) can be applied to manage consistency and dependencies between different datastores. + - Inter-database direct queries can impose co-location constraints. + - The use of multiple data technologies will add a degree of management overhead to maintain encompassed technologies. + +- The feature-sets for each Azure service differ across languages, SDKs, and APIs, which can greatly impact the level of configuration tuning that can be applied. + +- Capabilities for optimized alignment with the data model and encompassed data types will strongly influence data platform decisions. + - Query layers such as stored procedures and object-relational mappers. + - Language-neutral query capability, such as a secured REST API layer. + - Business continuity capabilities, such as backup and restore. + +- Analytical datastores typically support polyglot storage for various types of data structures. + - Analytical runtime environments, such as Apache Spark, may have integration restrictions to analyze polyglot data structures. + +- In an enterprise context, the use of existing processes and tooling, as well as the continuity of skills, can have a significant bearing on the data platform design and selection of data technologies. + +**Veracity** + +- Several factors must be considered to validate the accuracy of data within an AlwaysOn application, and the management of these factors can have a significant bearing on the design of the data platform. + - Data consistency. + - Platform security features. + - Data governance. + - Change management and schema evolution. + - Dependencies between datasets. + +- In any distributed application with multiple data replicas there is a trade-off between consistency and latency, as expressed in the [CAP](https://en.wikipedia.org/wiki/CAP_theorem) and [PACELC](https://en.wikipedia.org/wiki/PACELC_theorem) theorems. + - When readers and writers are distinctly distributed, an application must choose to return either the fastest-available version of a data item, even if it is out of date compared to a just-completed write (update) of that data item in another replica, or the most up-to-date version of the data item, which may incur additional latency to determine and obtain the latest state. + - Consistency and availability can be configured at platform level or at individual data request level. + - *What is the user experience if data was to be served from a replica closest to the user which does not reflect the most recent state of a different replica? i.e. can the application support possibly serving out-of-date data?* + +- In a multi-region write context, when the same data item is changed in two separate write-replicas before either change can be replicated, a conflict is created which must be resolved. + - Standardized conflict resolution policies, such as "Last One Wins", or a custom strategy with custom logic can be applied. + +- The implementation of security requirements may adversely impact throughput or performance. + +- Encryption at-rest can be implemented in the application layer using client-side encryption and/or the data layer using server-side encryption if required. 
+ +- Azure supports various [encryption models](https://docs.microsoft.com/azure/security/fundamentals/encryption-overview), including server-side encryption that uses service-managed keys, customer-managed keys in Key Vault, or customer-managed keys on customer-controlled hardware. + - With client-side encryption, keys can be managed in Key Vault or another secure location. + +- MACsec (IEEE 802.1AE MAC) data-link encryption is used to secure all traffic moving between Azure datacenters on the Microsoft backbone network. + - Packets are encrypted and decrypted on the devices before being sent, preventing physical 'man-in-the-middle' or snooping/wiretapping attacks. + +- Authentication and authorization to the data plane and control plane. + - *How will the data platform authenticate and authorize application access and operational access?* + +- Observability through monitoring platform health and data access. + - *How will alerting be applied for conditions outside acceptable operational boundaries?* + +### Design Recommendations + +**Volume** + +- Ensure future data volumes associated with organic growth are not expected to exceed data platform capabilities. + - Forecast data growth rates aligned to business plans and use established rates to inform ongoing capacity requirements. + - Compare aggregate and per-data record volumes against data platform limits. + - If there is a risk of limits being reached in exceptional circumstances, ensure operational mitigations are in place to prevent downtime and data loss. + +- Monitor data volume and validate it against a capacity model, considering scale limits and expected data growth rates. + +- Define application data tiers to classify datasets based on usage and criticality to facilitate the removal or offloading of older data. + - Consider classifying datasets into 'hot', 'warm', and 'cold' ('archive') tiers. + - For example, the foundational reference implementations leverage Cosmos DB to store 'hot' data which is actively used by the application, while Azure Storage is used for 'cold' operations data for analytical purposes. + +- Configure housekeeping procedures to optimize data growth and drive data efficiencies, such as query performance, as well as managing data expansion. + - Configure Time-To-Live (TTL) expiration for data which is no-longer required and has no long-term analytical value. + - Validate that old data can be safely tiered to secondary storage, or deleted outright, without an adverse impact to the application. + - Offload non-critical data to secondary cold storage, but maintain it for analytical value and to satisfy audit requirements. + - Collect data platform telemetry and usage statistics to enable DevOps teams to continually evaluate housekeeping requirements and 'right-size' datastores. + +- In-line with a microservice application design, consider the use of multiple different data technologies in-parallel, with optimized data solutions for specific workload scenarios and volume requirements. + - Avoid creating a single monolithic datastore where data volume from expansion can be hard to manage. + +**Velocity** + +- The data platform must inherently be designed and configured to support high-throughput, with workloads separated into different contexts to maximize performance using scenario optimized data solutions. + - Ensure read and write throughput for each data scenario can scale according to expected load patterns, with sufficient tolerance for unexpected variance. 
+ - Separate different data workloads, such as transactional and analytical operations, into distinct performance contexts. + +- Load-level through the use of asynchronous non-blocking messaging, for example using the [CQRS](https://docs.microsoft.com/azure/architecture/patterns/cqrs) or [Event Sourcing](https://docs.microsoft.com/azure/architecture/patterns/event-sourcing) patterns. + - There might be latency between write requests and when new data becomes available to read, which may have an impact on the user experience. + - This impact must be understood and acceptable in the context of key business requirements. + +- Ensure agile scalability to support variable throughput and load levels. + - If load levels are highly volatile, consider overprovisioning capacity levels to ensure throughput and performance is maintained. + - Test and validate the impact to composite application workloads when throughput cannot be maintained. + +- Prioritize Azure-native data services with automated scale-operations to facilitate a swift response to load-level volatility. + - Configure autoscaling based on service-internal and application-set thresholds. + - Scaling should initiate and complete in timeframes consistent with business requirements. + - For scenarios where manual interaction is necessary, establish automated operational 'playbooks' that can be triggered rather than conducting manual operational actions. + - Consider whether automated triggers can be applied as part of subsequent engineering investments. + +- Monitor application data read and write throughput against P50/P99 latency requirements and align to an application capacity model. + +- Excess throughput should be gracefully handled by the data platform or application layer and captured by the health model for operational representation. + +- Implement caching for 'hot' data scenarios to minimize response times. + - Apply appropriate policies for cache expiration and house-keeping to avoid runaway data growth. + - Expire cache items when the backing data changes. + - If cache expiration is strictly Time-To-Live (TTL) based, the impact and customer experience of serving outdated data needs to be understood. + +**Variety** + +- In alignment with the principle of a cloud- and Azure-native design, it is highly recommended to prioritize managed Azure services to reduce operational and management complexity, as well as taking advantage of Microsoft's future platform investments. + +- In alignment with the application design principle of loosely coupled microservice architectures, allow individual services to use distinct data stores and scenario-optimized data technologies. + - Identify the types of data structure the application will handle for specific workload scenarios. + - Avoid creating a dependency on a single monolithic datastore. + - Consider the [SAGA](https://docs.microsoft.com/azure/architecture/reference-architectures/saga/saga) design pattern where dependencies between datastores exist. + +- Validate that required capabilities are available for selected data technologies. + - Ensure support for required languages and SDK capabilities. Not every capability is available for every language/SDK in the same fashion. + +- Validate technology scale-limits and define scale-units to align with expected growth rates. + - Ensure scale operations align with storage, performance, and consistency requirements. 
+ - When a new scale-unit is introduced, underlying data may need to be replicated which will take time and likely introduce a performance penalty while replication occurs. So ensure these operations are performed outside of critical business hours if possible. + +**Veracity** + +- Adopt a multi-region data platform design and distribute replicas across regions for maximum reliability, availability, and performance by moving data closer to application endpoints. + - Distribute data replicas across Availability Zones (AZs) within a region (or use zone-redundant service tiers) to maximize intra-region availability. + +- Where consistency requirements allow for it, use a multi-region write data platform design to maximize overall global availability and reliability. + - Consider business requirements for conflict resolution when the same data item is changed in two separate write replicas before either change can be replicated and thus creating a conflict. + - Use standardized conflict resolution policies such as "Last one wins" where possible + - If a custom strategy with custom logic is required, ensure CI/CD DevOps practices are applied to manage custom logic. + +- Test and validate backup and restore capabilities, as well as failover operations through chaos testing within continuous delivery processes. + +- Run performance benchmarks to ensure throughput and performance requirements are not impacted by the inclusion of required security capabilities, such as encryption. + - Ensure continuous delivery processes consider load testing against known performance benchmarks. + +- When applying encryption, it is strongly recommended to use service-managed encryption keys as a way of reducing management complexity. + - If there is a specific security requirement for customer-managed keys, ensure appropriate key management procedures are applied to ensure availability, backup, and rotation of all considered keys. + +> In an Enterprise Scale context, it is critical that an application centric approach be applied for the provisioning and operation of data platform components in an AlwaysOn application design. +> More specifically, to maximize reliability it is critical that individual data platform components appropriately respond to application health through operational actions which may include other application components. For example, in a scenario where additional data platform resources are needed, scaling the data platform along with other application components according to a capacity model will likely be required, potentially through the provision of additional scale units. This approach will ultimately be constrained if there is a hard dependency of a centralized operations team to address issues related to the data platform in isolation. +> Ultimately, the use of centralized data services (i.e. Central IT DBaaS) introduces operational bottlenecks that significantly hinder agility through a largely uncontextualized management experience, and should be avoided in a mission-critical or business-critical context. + +### Additional references + +Additional data-platform guidance is available within the Azure Application Architecture Guide. 
+ +- [Azure Data Store Decision Tree](https://docs.microsoft.com/azure/architecture/guide/technology-choices/data-store-decision-tree) +- [Criteria for choosing a Data Store](https://docs.microsoft.com/azure/architecture/guide/technology-choices/data-store-considerations) +- [Non-Relational Data Stores](https://docs.microsoft.com/azure/architecture/data-guide/big-data/non-relational-data) +- [Relational OLTP Data Stores](https://docs.microsoft.com/azure/architecture/data-guide/relational-data/online-transaction-processing) + +## Globally distributed multi-write datastore + +To fully accommodate the globally distributed active-active aspirations of an AlwaysOn application design, it is strongly recommended to consider a distributed multi-write data platform, where changes to separate writeable replicas are synchronized and merged between all replicas, with conflict resolution where required. + +> It is important to note that encompassed microservices may not all require a distributed multi-write datastore, so consideration should be given to the architectural context and business requirements of each workload scenario. + +Azure Cosmos DB provides a globally distributed and highly available NoSQL datastore, offering multi-region writes and tunable consistency out-of-the-box. The design considerations and recommendations within this section will therefore focus on optimal Cosmos DB usage. + +### Design considerations + +**Azure Cosmos DB** + +- Azure Cosmos DB stores data within Containers, which are indexed, row-based transactional stores designed to allow fast transactional reads and writes with response times on the order of milliseconds. + +- Cosmos DB supports multiple different APIs with differing feature sets, such as SQL, Cassandra, and Mongo DB. + - The first-party SQL API provides the richest feature set and is typically the API where new capabilities will become available first. + +- Cosmos DB supports [Gateway and Direct connectivity modes](https://docs.microsoft.com/azure/cosmos-db/sql-sdk-connection-modes), where Direct facilitates connectivity over TCP to backend Cosmos DB replica nodes for improved performance with fewer network hops, while Gateway provides HTTPS connectivity to frontend gateway nodes. + - Direct mode is only available when using the SQL API and is currently only supported on .NET and Java SDK platforms. + +- Within Availability Zone enabled regions, Cosmos DB offers [Availability Zone (AZ) redundancy](https://docs.microsoft.com/azure/cosmos-db/high-availability#availability-zone-support) support for high availability and resiliency to zonal failures within a region. + +- Cosmos DB maintains four replicas of data within a single region, and when Availability Zone (AZ) redundancy is enabled, Cosmos DB ensures data replicas are placed across multiple AZs to protect against zonal failures. + - The Paxos consensus protocol is applied to achieve quorum across replicas within a region. + +- A Cosmos DB account can easily be configured to replicate data across multiple regions to mitigate the risk of a single region becoming unavailable. + - Replication can be configured with either single-region writes or multi-region writes. + - With single region writes, a primary 'hub' region is used to serve all writes and if this 'hub' region becomes unavailable, a failover operation must occur to promote another region as writable. + - With multi-region writes, applications can write to any configured deployment region which will replicate changes between all other regions. 
If a region becomes unavailable then the remaining regions will be used to serve write traffic.
+
+- In a multi-region write configuration, [update (insert, replace, delete) conflicts](https://docs.microsoft.com/azure/cosmos-db/conflict-resolution-policies) can occur where writers concurrently update the same item in multiple regions.
+
+- Cosmos DB provides two conflict resolution policies which can be applied to automatically address conflicts.
+  - Last Write Wins (LWW) applies a time-synchronization clock protocol using a system-defined timestamp `_ts` property as the conflict resolution path. In the event of a conflict, the item with the highest value for the conflict resolution path becomes the winner, and if multiple items have the same numeric value then the system selects a winner so that all regions can converge to the same version of the committed item.
+    - With delete conflicts, the deleted version always wins over either insert or replace conflicts regardless of conflict resolution path value.
+    - Last Write Wins is the default conflict resolution policy.
+    - When using the SQL API, a custom numerical property, such as a custom timestamp definition, can be used for conflict resolution.
+  - Custom resolution policies allow for application-defined semantics to reconcile conflicts using a registered merge stored procedure which is automatically invoked when conflicts are detected.
+    - The system provides an exactly-once guarantee for the execution of a merge procedure as part of the commitment protocol.
+    - A custom conflict resolution policy is only available with the SQL API and can only be set at container creation time.
+
+- In a multi-region write configuration, there is a dependency on a single Cosmos DB 'hub' region to perform all conflict resolution, with the Paxos consensus protocol applied to achieve quorum across replicas within the hub region.
+  - The platform provides a message buffer for write conflicts within the hub region to load level and provide redundancy for transient faults.
+  - The buffer is capable of storing a few minutes' worth of write updates requiring consensus.
+
+> The strategic direction of the Cosmos DB platform is to remove this single region dependency for conflict resolution in a multi-region write configuration, utilizing a 2-phase Paxos approach to attain quorum at a global level as well as within a region.
+
+- The primary 'hub' region is determined by the first region Cosmos DB is configured within.
+  - A priority ordering is configured for additional satellite deployment regions for failover purposes.
+
+- The data model and the partitioning of data across logical and physical partitions play a hugely important role in achieving optimal performance and availability.
+
+- When deployed with a single write region, Cosmos DB can be configured for [automatic failover](https://docs.microsoft.com/azure/cosmos-db/autoscale-faq) based on a defined failover priority considering all read region replicas.
+
+- The RTO provided by the Cosmos DB platform is ~10-15 minutes, capturing the elapsed time to perform a regional failover of the Cosmos DB service in the event of a catastrophic disaster impacting the hub region.
+  - This RTO is also relevant in a multi-region write context given the dependency on a single 'hub' region for conflict resolution.
+ - If the 'hub' region becomes unavailable, writes made to other regions will fail after the message buffer fills since conflict resolution will not be able to occur until the service fails over and a new hub region is established. +> The strategic direction of the Cosmos DB platform is to reduce the RTO to ~5 minutes by allowing partition level failovers. + +- Recovery Point Objectives (RPO) and Recovery Time Objectives (RTO) are configurable via consistency levels, with a trade-off between data durability and throughput. + - Cosmos DB provides a minimum RTO of 0 for a relaxed consistency level with multi-region writes or an RPO of 0 for strong consistency with single-write region. + +- Cosmos DB offers a [99.999% SLA](https://azure.microsoft.com/support/legal/sla/cosmos-db/v1_3/) for both read and write availability for Database Accounts configured with multiple Azure regions as writable. + - The SLA is represented by the Monthly Uptime Percentage which is calculated as 100% - Average Error Rate. + - The Average Error Rate is defined as the sum of Error Rates for each hour in the billing month divided by the total number of hours in the billing month, where the Error Rate is the total number of Failed Requests divided by Total Requests during a given one-hour interval. + +- Cosmos DB offers a 99.99% SLA for throughput, consistency, availability, and latency for Database Accounts scoped to a single Azure region when configured with any of the five Consistency Levels. + - A 99.99% SLA also applies to Database Accounts spanning multiple Azure regions configured with any of the four relaxed Consistency Levels. + +- There are two types of throughput that can be provisioned in Cosmos DB, standard and [autoscale](https://docs.microsoft.com/azure/cosmos-db/provision-throughput-autoscale), which are measured using Request Units per second (RU/s). + - Standard throughput allocates resources required to guarantee a specified RU/s value. + - Standard is billed hourly for provisioned throughput. + - Autoscale defines a maximum throughput value, and Cosmos DB will automatically scale up or down depending on application load, between the maximum throughput value and a minimum of 10% of the maximum throughput value. + - Autoscale is billed hourly for the maximum throughput consumed. + +- Static provisioned throughput with a variable workload may result in throttling errors, which will impact perceived application availability. + - Autoscale protects against throttling errors by enabling Cosmos DB to scale up as needed, while maintaining cost protection by scaling back down when load decreases. + +- When Cosmos DB is replicated across multiple regions, the provisioned Request Units (RUs) are billed per region. + +- There is a significant cost delta between a multi-region-write and single-region-write configuration which in many cases may make a multi-master CosmosDB data platform cost prohibitive. + +| | Single Region Read/Write | Single Region Write - Dual Region Read | Dual Region Read/Write | +| --- | --- | --- | --- | +| Cost Model | 1 RU | 2 RU | 4 RU | + +> The delta between single-region-write and multi-region-write is actually less than the 1:2 ratio reflected in the table above. More specifically, there is a cross-region data transfer charge associated with write updates in a single-write configuration which is not captured within the RU costs as with the multi-write configuration. 
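+
+The relative cost model in the table above can be illustrated with a short calculation. The unit price used below is a placeholder assumption rather than a published Azure price, and the cross-region data transfer charge referenced in the note above is deliberately excluded; the sketch only compares configurations at the same provisioned RU/s.
+
+```python
+# Illustrative sketch of the relative RU cost model shown in the table above.
+# The unit price is a placeholder assumption used only to compare configurations.
+ASSUMED_PRICE_PER_100_RUS_PER_HOUR = 0.008  # placeholder, not a published price
+HOURS_PER_MONTH = 730
+
+def monthly_cost(provisioned_rus: int, ru_multiplier: int) -> float:
+    """Approximate monthly cost for a given RU/s and configuration multiplier."""
+    return (provisioned_rus / 100) * ru_multiplier * ASSUMED_PRICE_PER_100_RUS_PER_HOUR * HOURS_PER_MONTH
+
+base_rus = 10_000  # provisioned RU/s per container
+
+for label, multiplier in [
+    ("Single region read/write", 1),
+    ("Single region write, dual region read", 2),
+    ("Dual region read/write (multi-region write)", 4),
+]:
+    # Note: cross-region data transfer charges are not captured by this RU-only view.
+    print(f"{label}: ~${monthly_cost(base_rus, multiplier):,.0f} per month")
+```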
+ +- Consumed storage is billed as a flat rate for the total amount of storage (GB) consumed to host data and indexes for a given hour. + +- `Session` is the default and most widely used [consistency level](https://docs.microsoft.com/azure/cosmos-db/consistency-levels) since data is received in the same order as writes. + +- Cosmos DB supports authentication via either an Azure Active Directory identity or Cosmos DB keys and resource tokens, which provide overlapping capabilities. + +[![Cosmos DB Access Capabilities](https://docs.microsoft.com/azure/cosmos-db/media/how-to-restrict-user-data/operations.png "Cosmos DB Access Capabilities")](./Data-Platform.md) + +- It is possible to disable resource management operations using keys or resource tokens to limit keys and resource tokens to data operations only, allowing for fine-grained resource access control using Azure AD (AAD) Role-Based Access Control (RBAC). + - Restricting control plane access via keys or resource tokens will disable control plane operations for clients using Cosmos DB SDKs and should therefore be thoroughly [evaluated and tested](https://docs.microsoft.com/azure/cosmos-db/role-based-access-control#check-list-before-enabling). + - The `disableKeyBasedMetadataWriteAccess` setting can be configured via [ARM Template](https://docs.microsoft.com/azure/cosmos-db/role-based-access-control#set-via-arm-template) IaC definitions, or via a [Built-In Azure Policy](https://portal.azure.com/#blade/Microsoft_Azure_Policy/PolicyDetailBlade/definitionId/%2Fproviders%2FMicrosoft.Authorization%2FpolicyDefinitions%2F4750c32b-89c0-46af-bfcb-2e4541a818d5). + +- Cosmos DB AAD RBAC support applies to account and resource control plane management operations. + - Application administrators can create role assignments for users, groups, service principals or managed identities to grant or deny access to resources and operations on Cosmos DB resources. + - There are several [Built-in RBAC Roles](https://docs.microsoft.com/azure/cosmos-db/role-based-access-control#built-in-roles) available for role assignment, and [custom RBAC roles](https://docs.microsoft.com/azure/cosmos-db/role-based-access-control#custom-roles) can also be used to form specific [privilege combinations](https://docs.microsoft.com/azure/role-based-access-control/resource-provider-operations#microsoftdocumentdb). + - [Cosmos DB Account Reader](https://docs.microsoft.com/azure/role-based-access-control/built-in-roles#cosmos-db-account-reader-role) enables read-only access to the Cosmos DB resource. + - [DocumentDB Account Contributor](https://docs.microsoft.com/azure/role-based-access-control/built-in-roles#documentdb-account-contributor) enables management of Cosmos DB accounts including keys and role assignments, but does not enable data-plane access. + - [Cosmos DB Operator](https://docs.microsoft.com/azure/role-based-access-control/built-in-roles#cosmos-db-operator) which is similar to DocumentDB Account Contributor, but does not provide the ability to manage keys or role assignments. + +- Cosmos DB resources (accounts, databases, and containers) can be protected against incorrect modification or deletion using [Resource Locks](https://docs.microsoft.com/azure/cosmos-db/resource-locks). + - Resource Locks can be set at the account, database, or container level. + - A Resource Lock set at on a resource will be inherited by all child resources. For example, a Resource Lock set on the Cosmos DB account will be inherited by all databases and containers within the account. 
+ - Resource Locks **only** apply to control plane operations and do **not** prevent data plane operations, such as creating, changing, or deleting data. + - If control plane access is not restricted with `disableKeyBasedMetadataWriteAccess`, then clients will be able to perform control plane operations using account keys. + +- The [Cosmos DB Change Feed](https://docs.microsoft.com/azure/cosmos-db/change-feed) provides a time-ordered feed of changes to data in a Cosmos DB Container. + - The Change Feed only includes insert and update operations to the source Cosmos DB Container; it does not include deletes. + +- The Change Feed can be used to maintain a separate data store from the primary Container used by the application, with ongoing updates to the target data store fed by the Change Feed from the source Container. + - The Change Feed can be used to populate a secondary store for additional data platform redundancy or for subsequent analytical scenarios. + +- If delete operations routinely affect the data within the source Container, then the store fed by the Change Feed will be inaccurate and unreflective of deleted data. + - A [Soft Delete](https://docs.microsoft.com/azure/cosmos-db/sql/change-feed-design-patterns#deletes) pattern can be implemented so that data records are included in the Change Feed. + - Instead of explicitly deleting data records, data records are _updated_ by setting a flag (e.g. `IsDeleted`) to indicate that the item is considered deleted. + - Any target data store fed by the Change Feed will need to detect and process items with a deleted flag set to True; instead of storing soft-deleted data records, the _existing_ version of the data record in the target store will need to be deleted. + - A short Time-To-Live (TTL) is typically used in conjunction with the soft-delete pattern so that Cosmos DB automatically deletes expired data, but only after it is reflected within the Change Feed with the deleted flag set to True. + - Accomplishes the original delete intent whilst also propagating the delete through the Change Feed. + +- Cosmos DB can be configured as an [analytical store](https://docs.microsoft.com/azure/cosmos-db/analytical-store-introduction), which applies a column format for optimized analytical queries to address the complexity and latency challenges that occur with the traditional ETL pipelines. + +- Azure Cosmos DB automatically backs up data at regular intervals without affecting the performance or availability, and without consuming RU/s. + +- Cosmos DB can be configured according to two distinct backup modes. + - [Periodic](https://docs.microsoft.com/azure/cosmos-db/configure-periodic-backup-restore) is the default backup mode for all accounts, where backups are taken at a periodic interval and the data is restored by creating a request with the support team. + - The default periodic backup retention period is 8 hours and the default backup interval is 4 hours, which means only the latest two backups are stored by default. + - The backup interval and retention period are configurable within the account. + - The maximum retention period extends to a month with a minimum backup interval of one hour. + - A role assignment to the Azure "Cosmos DB Account Reader Role" is required to configure backup storage redundancy. + - Two backup copies are included at no extra cost, but additional backups incur additional costs. + - By default, periodic backups are stored within separate Geo-Redundant Storage (GRS) that is not directly accessible. 
+      - Backup storage exists within the primary 'hub' region and is replicated to the paired region through underlying storage replication.
+      - The redundancy configuration of the underlying backup storage account is configurable to [Zone-Redundant Storage or Locally-Redundant Storage](https://docs.microsoft.com/azure/cosmos-db/configure-periodic-backup-restore#backup-storage-redundancy).
+    - Performing a **restore operation requires a [Support Request](https://docs.microsoft.com/azure/cosmos-db/configure-periodic-backup-restore#request-restore)** since customers cannot directly perform a restore.
+      - Before opening a support ticket, the backup retention period should be increased to at least seven days within eight hours of the data loss event.
+    - A restore operation creates a new Cosmos DB account to which data is recovered.
+      - An existing Cosmos DB account cannot be used as the target of a restore.
+      - By default, a new Cosmos DB account with a `-restored` name suffix will be used.
+        - This name can be adjusted, such as by reusing the existing name if the original account was deleted.
+    - If throughput is provisioned at the database level, backup and restore will happen at the database level.
+      - It is not possible to select a subset of containers to restore.
+  - [Continuous](https://docs.microsoft.com/azure/cosmos-db/continuous-backup-restore-introduction) backup mode allows for a restore to any point in time within the last 30 days.
+    - Restore operations can be performed to return to a specific point in time (PITR) with a one-second granularity.
+    - The available window for restore operations is up to 30 days.
+      - It is also possible to restore to the resource instantiation state.
+    - Continuous backups are taken within every Azure region where the Cosmos DB account exists.
+      - Continuous backups are stored within the same Azure region as each Cosmos DB replica, using Locally-Redundant Storage (LRS) or Zone Redundant Storage (ZRS) within regions that support Availability Zones.
+    - A self-service restore can be performed using the [Azure portal](https://docs.microsoft.com/azure/cosmos-db/restore-account-continuous-backup#restore-account-portal) or IaC artefacts such as [ARM templates](https://docs.microsoft.com/azure/cosmos-db/restore-account-continuous-backup#restore-arm-template).
+    - There are several [limitations](https://docs.microsoft.com/azure/cosmos-db/continuous-backup-restore-introduction#current-limitations) with Continuous Backup.
+      - The continuous backup mode is not currently available in a multi-region-write configuration.
+      - Only SQL API and MongoDB API can be configured for Continuous backup at this time.
+      - If a container has TTL configured, restored data that has exceeded its TTL may be _immediately deleted_.
+    - A restore operation creates a new Cosmos DB account for the point-in-time restore.
+    - There is an [additional storage cost](https://docs.microsoft.com/azure/cosmos-db/continuous-backup-restore-introduction#continuous-backup-pricing) for Continuous backups and restore operations.
+
+- Existing Cosmos DB accounts can be migrated from Periodic to Continuous, but not from Continuous to Periodic; migration is one-way and not reversible.
+
+- Each Cosmos DB backup comprises the data itself and configuration details for provisioned throughput, indexing policies, deployment region(s), and container TTL settings.
+  - Backups do not contain [firewall settings](https://docs.microsoft.com/azure/templates/microsoft.documentdb/databaseaccounts?tabs=json#ipaddressorrange-object), [virtual network access control lists](https://docs.microsoft.com/azure/templates/microsoft.documentdb/databaseaccounts/privateendpointconnections), [private endpoint settings](https://docs.microsoft.com/azure/templates/microsoft.documentdb/databaseaccounts/privateendpointconnections), [consistency settings](https://docs.microsoft.com/azure/templates/microsoft.documentdb/databaseaccounts?tabs=json#consistencypolicy-object) (an account is restored with session consistency), [stored procedures](https://docs.microsoft.com/azure/templates/microsoft.documentdb/databaseaccounts/sqldatabases/containers/storedprocedures), [triggers](https://docs.microsoft.com/azure/templates/microsoft.documentdb/databaseaccounts/sqldatabases/containers/triggers), [UDFs](https://docs.microsoft.com/azure/templates/microsoft.documentdb/databaseaccounts/sqldatabases/containers/userdefinedfunctions), or [multi-region settings](https://docs.microsoft.com/azure/templates/microsoft.documentdb/databaseaccounts?tabs=json#Location).
+  - Customers are responsible for re-deploying capabilities and configuration settings which are not restored through Cosmos DB backup.
+  - Azure Synapse Link analytical store data is also not included in Cosmos DB backups.
+
+- It is possible to implement a custom backup and restore capability for scenarios where the Periodic and Continuous approaches are not a good fit.
+  - A custom approach introduces significant costs and additional administrative overhead which should be understood and carefully assessed.
+    - Common restore scenarios should be modelled, such as the corruption or deletion of an account, database, container, or data item.
+    - Housekeeping procedures should be implemented to prevent backup sprawl.
+  - Azure Storage or an alternative data technology can be used, such as an alternative Cosmos DB container.
+    - Azure Storage and Cosmos DB provide native integrations with Azure services such as Azure Functions and Azure Data Factory.
+
+- The Cosmos DB documentation denotes two potential options for implementing custom backups.
+  - The [Cosmos DB change feed](https://docs.microsoft.com/azure/cosmos-db/change-feed) can be used to write data to a separate storage facility.
+    - An [Azure Function](https://docs.microsoft.com/azure/cosmos-db/change-feed-functions) or equivalent application process uses the [Change Feed Processor](https://docs.microsoft.com/azure/cosmos-db/change-feed-processor) to bind to the change feed and process items into storage.
+    - Both continuous and periodic (batched) custom backups can be implemented using the Change Feed.
+    - The Cosmos DB change feed does not yet reflect deletes, so a soft-delete pattern must be applied using a boolean property and TTL.
+      - This pattern will not be required when the change feed provides full-fidelity updates.
+  - The [Azure Data Factory Connector for Cosmos DB](https://docs.microsoft.com/azure/data-factory/connector-azure-cosmos-db) ([SQL API](https://docs.microsoft.com/azure/data-factory/connector-azure-cosmos-db) or [MongoDB API](https://docs.microsoft.com/azure/data-factory/connector-azure-cosmos-db-mongodb-api) connectors) can be used to copy data.
+ - Azure Data Factory (ADF) supports manual execution as well as [Schedule](https://docs.microsoft.com/azure/data-factory/concepts-pipeline-execution-triggers#schedule-trigger), [Tumbling window](https://docs.microsoft.com/azure/data-factory/concepts-pipeline-execution-triggers#tumbling-window-trigger), and [Event-based](https://docs.microsoft.com/azure/data-factory/concepts-pipeline-execution-triggers#event-based-trigger) triggers. + - Provides support for both Storage and Event Grid. + - ADF is primarily suitable for periodic custom backup implementations due to its batch-oriented orchestration. + - It is less suitable for continuous backup implementations with frequent events due to the orchestration execution overhead. + - ADF supports [Azure Private Link](https://docs.microsoft.com/azure/data-factory/data-factory-private-link) for high network security scenarios + +> Azure Cosmos DB is leveraged within the design of many Azure services, so a significant regional outage for Cosmos DB will have a cascading effect across various Azure services within that region. The precise impact to a particular service will heavily depend on how the underlying service design uses Cosmos DB. + +### Design Recommendations + +- In line with microservices application design approach, it is strongly recommended to have a separate datastore instance/type per microservice. + - Separate analytical workloads from application workloads using different data technologies optimized for distinct performance, reliability, and scalability requirements. + +**Azure Cosmos DB** + +- Use Azure Cosmos DB as the primary AlwaysOn data platform where requirements allow. + +- For mission-critical workload scenarios, configure Cosmos DB with a write replica inside each deployment region to reduce latency and provide maximum redundancy. + - Configure the application to [prioritize the use of the local Cosmos DB replica](https://docs.microsoft.com/dotnet/api/microsoft.azure.cosmos.cosmosclientoptions.applicationregion) for writes and reads to optimize application load, performance, and regional RU/s consumption. + - The multi-region-write configuration comes at a significant cost and should be prioritized only for workload scenarios requiring maximum reliability. + +- For less-critical workload scenarios, prioritize the use of single-region-write configuration (when using Availability Zones) with globally distributed read replicas, since this offers a high level of data platform reliability (99.999% SLA for read-, 99.995% SLA for write-operations) at a more compelling price-point. + - Configure the application to use the local Cosmos DB read replica to optimize read performance. + +- Select an optimal 'hub' deployment region where conflict resolution will occur in a multi-region-write configuration, and all writes will be performed in a single-region-write configuration. + - Consider distance relative to other deployment regions and associated latency in selecting a primary region, as well as requisite capabilities such as Availability Zones support. + +- Configure Cosmos DB with [Availability Zone (AZ) redundancy](https://docs.microsoft.com/azure/cosmos-db/high-availability#availability-zone-support) in all deployment regions with AZ support, to ensure resiliency to zonal failures within a region. + +- Use the Cosmos DB native SQL API since it offers the most comprehensive feature set, particularly where performance tuning is concerned. 
+  - Alternative APIs should primarily be considered for migration or compatibility scenarios.
+  - When using alternative APIs, validate that required capabilities are available with the selected language and SDK to ensure optimal configuration and performance.
+
+- Use the Direct connection mode to optimize network performance through direct TCP connectivity to backend Cosmos DB nodes, with a reduced number of network 'hops'.
+
+> The Cosmos DB SLA is calculated by averaging failed requests which may not directly align with a 99.999% reliability tier error budget. When designing for a 99.999% SLO, it is therefore vital to plan for regional and multi-region Cosmos DB write unavailability, ensuring a fallback storage technology is positioned in the event of a failure, such as a persisted message queue for subsequent replay.
+
+- Define a partitioning strategy across both logical and physical partitions to optimize data distribution according to the data model.
+  - Minimize cross-partition queries.
+  - Iteratively [test and validate](https://docs.microsoft.com/azure/cosmos-db/how-to-model-partition-example) the partitioning strategy to ensure optimal performance.
+
+- [Select an optimal partition key](https://docs.microsoft.com/azure/cosmos-db/partitioning-overview#choose-partitionkey).
+  - The partition key cannot be changed after the collection has been created.
+  - The partition key should be a property value which does not change.
+  - Select a partition key which has a high cardinality, with a wide range of possible values.
+  - The partition key should spread RU consumption and data storage evenly across all logical partitions to ensure even RU consumption and storage distribution across physical partitions.
+  - Run read queries against the partition key to reduce RU consumption and latency.
+
+- [Indexing](https://docs.microsoft.com/azure/cosmos-db/index-overview) is also crucial for performance, so ensure index exclusions are used to reduce RU/s and storage requirements.
+  - Only index those fields which are needed for filtering within queries; design indexes for the most-used predicates (a combined partitioning and indexing sketch is provided below).
+
+- Leverage the built-in error handling, retry, and broader reliability capabilities of the [Cosmos DB SDK](https://docs.microsoft.com/azure/cosmos-db/sql/best-practice-dotnet#checklist).
+  - Implement [retry logic](https://docs.microsoft.com/azure/architecture/best-practices/retry-service-specific#cosmos-db) within the SDK on clients.
+
+- Use service-managed encryption keys to reduce management complexity.
+  - If there is a specific security requirement for customer-managed keys, ensure appropriate key management procedures are applied, such as backup and rotation.
+
+- Disable [Cosmos DB key based metadata write access](https://portal.azure.com/#blade/Microsoft_Azure_Policy/PolicyDetailBlade/definitionId/%2Fproviders%2FMicrosoft.Authorization%2FpolicyDefinitions%2F4750c32b-89c0-46af-bfcb-2e4541a818d5) by applying the built-in Azure Policy.
+
+- Enable [Azure Monitor](https://docs.microsoft.com/azure/cosmos-db/monitor-cosmos-db) to gather key metrics and diagnostic logs, such as provisioned throughput (RU/s).
+  - Route Azure Monitor operational data into a Log Analytics workspace dedicated to Cosmos DB and other global resources within the AlwaysOn application design.
+  - Use Azure Monitor metrics to determine if application traffic patterns are suitable for autoscale.
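+
+The following sketch combines the partitioning and indexing recommendations above using the Python SDK (azure-cosmos). The account endpoint, key, container names, and indexing paths are placeholders, and the exact keyword arguments should be validated against the SDK version in use.
+
+```python
+from azure.cosmos import CosmosClient, PartitionKey
+
+# Placeholder endpoint and key; in a real design these would come from secure configuration.
+client = CosmosClient("https://<account>.documents.azure.com:443/", credential="<key>")
+database = client.create_database_if_not_exists(id="app-db")
+
+# Selective indexing: include only the paths used in query predicates to reduce RU/s
+# and storage; everything else is excluded.
+indexing_policy = {
+    "indexingMode": "consistent",
+    "includedPaths": [
+        {"path": "/userId/?"},
+        {"path": "/timestamp/?"},
+    ],
+    "excludedPaths": [{"path": "/*"}],
+}
+
+container = database.create_container_if_not_exists(
+    id="user-events",
+    # High-cardinality, immutable property used as the partition key to spread
+    # RU consumption and storage evenly across logical partitions.
+    partition_key=PartitionKey(path="/userId"),
+    indexing_policy=indexing_policy,
+)
+
+# Reads that supply the partition key value avoid cross-partition fan-out.
+items = container.query_items(
+    query="SELECT * FROM c WHERE c.userId = @userId AND c.timestamp > @since",
+    parameters=[
+        {"name": "@userId", "value": "user-42"},
+        {"name": "@since", "value": 1700000000},
+    ],
+    partition_key="user-42",
+)
+for item in items:
+    print(item["id"])
+```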
+ +- Evaluate application traffic patterns to select an optimal option for [provisioned throughput types](https://docs.microsoft.com/azure/cosmos-db/how-to-choose-offer). + - Consider auto-scale provisioned throughput to automatically level-out workload demand. + +- Evaluate Microsoft [performance tips for Cosmos DB](https://docs.microsoft.com/azure/cosmos-db/performance-tips) to optimize client-side and server-side configuration for improved latency and throughput. + +- When using AKS as the compute platform: For query-intensive workloads, select an AKS node SKU that has accelerated networking enabled to reduce latency and CPU jitters. + +- For single write region deployments, it is strongly recommended to configure Cosmos DB for [automatic failover](https://docs.microsoft.com/azure/cosmos-db/high-availability#multi-region-accounts-with-a-single-write-region-write-region-outage). + +- Load-level through the use of asynchronous non-blocking messaging within system flows which write updates to Cosmos DB. + - Consider patterns such as [Command and Query Responsibility Segregation](https://docs.microsoft.com/azure/architecture/patterns/cqrs) and [Event Sourcing](https://docs.microsoft.com/azure/architecture/patterns/event-sourcing). + +- Configure the Cosmos DB account for continuous backups to obtain a fine granularity of recovery points across the last 30 days. + - Consider the use of Cosmos DB backups in scenarios where contained data or the Cosmos DB account is deleted or corrupted. + - Avoid the use of a custom backup approach unless absolutely necessary. + +- It is strongly recommended to practice recovery procedures on non-production resources and data, as part of standard business continuity operation preparation. + +- Define IaC artefacts to re-establish configuration settings and capabilities in the event of a Cosmos DB backup restore. + +- Evaluate and apply the [Azure Security Baseline](https://docs.microsoft.com/security/benchmark/azure/baselines/cosmos-db-security-baseline#backup-and-recovery) control guidance for Cosmos DB Backup and Recovery. + - [BR-1: Ensure regular automated backups](https://docs.microsoft.com/security/benchmark/azure/baselines/cosmos-db-security-baseline#br-1-ensure-regular-automated-backups) + - [BR-3: Validate all backups including customer-managed keys](https://docs.microsoft.com/security/benchmark/azure/baselines/cosmos-db-security-baseline#br-3-validate-all-backups-including-customer-managed-keys) + - [BR-4, Mitigate risk of lost keys](https://docs.microsoft.com/security/benchmark/azure/baselines/cosmos-db-security-baseline#br-4-mitigate-risk-of-lost-keys) + +- For analytical workloads requiring multi-region availability, leverage the Cosmos DB Analytical Store, which applies a column format for optimized analytical queries. + +## Relational data technologies + +For scenarios with a highly relational data model or dependencies on existing relational technologies, the use of Azure Cosmos DB in a multi-write configuration might not be directly applicable. In such cases, it is vital that leveraged relational technologies are designed and configured to uphold the multi-region active-active aspirations of an AlwaysOn application design. + +Azure provides a variety of managed relational data platforms, including Azure SQL Database and Azure Database for common OSS relational solutions, including MySQL, PostgreSQL, and MariaDB. 
The design considerations and recommendations within this section will therefore focus on the optimal usage of Azure SQL Database and Azure Database OSS flavors to maximize reliability and global availability. + +### Design considerations + +- Whilst relational data technologies can be configured to easily scale read operations, writes are typically constrained to go through a single primary instance, which places a significant constraint on scalability and performance. + +- [Sharding](https://docs.microsoft.com/azure/sql-database/sql-database-elastic-scale-introduction) can be applied to distribute data and processing across multiple identical structured databases, partitioning databases horizontally to navigate platform constraints. + - For example, sharding is often applied in multi-tenant SaaS platforms to isolate groups of tenants into distinct data platform constructs. + +**Azure SQL Database** + +- Azure SQL Database provides a fully managed database engine that is always running on the latest stable version of the SQL Server database engine and underlying Operating System. + - Provides intelligent features such as performance tuning, threat monitoring, and vulnerability assessments. + +- Azure SQL Database provides built-in regional high availability and turnkey geo-replication to distribute read-replicas across Azure regions. + - With geo-replication, secondary database replicas remain read-only until a failover is initiated. + - Up to 4 secondaries are supported in the same or different regions. + - Secondary replicas can also be used for read-only query access to optimize read performance. + - Failover must be initiated manually but can be wrapped in automated operational procedures. + +- Azure SQL Database provides [Auto Failover Groups](https://docs.microsoft.com/azure/azure-sql/database/auto-failover-group-overview?tabs=azure-powershell) which replicates databases to a secondary server and allows for transparent failover in the event of a failure. + - Auto-failover groups support geo-replication of all databases in the group to only one secondary server or instance in a different region. + - Auto-failover groups are not currently supported in the Hyperscale service tier. + - Secondary databases can be used to offload read traffic. + +- Premium or Business Critical service tier database replicas can be [distributed across Availability Zones](https://docs.microsoft.com/azure/azure-sql/database/high-availability-sla) at no extra cost. + - The control ring is also duplicated across multiple zones as 3 gateway rings (GW). + - The routing to a specific gateway ring is controlled by Azure Traffic Manager. + - When using the Business Critical tier, zone redundant configuration is only available when the Gen5 compute hardware is selected. + +- Azure SQL Database offers a baseline 99.99% availability SLA across all of its service tiers, but provides a higher 99.995% SLA for the Business Critical or Premium tiers in regions that support availability zones. + - Azure SQL Database Business Critical or Premium tiers not configured for Zone Redundant Deployments have an availability SLA of 99.99%. + +- When configured with geo-replication, the Azure SQL Database Business Critical tier provides a Recovery Time Objective (RTO) of 30 seconds for 100% of deployed hours. + +- When configured with geo-replication, the Azure SQL Database Business Critical tier has a Recovery point Objective (RPO) of 5 seconds for 100% of deployed hours. 
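+
+As a hedged illustration of how geo-replicated readable secondaries and Auto Failover Groups can be consumed from application code, the sketch below sends writes to the failover group's read-write listener and reads to its read-only listener using `ApplicationIntent=ReadOnly`. The failover group name, database, credentials, and example table are placeholders, and `pyodbc` with the Microsoft ODBC driver is assumed to be available.
+
+```python
+import pyodbc
+
+FOG = "contoso-fog"  # hypothetical failover group name
+
+# The read-write listener always resolves to the current primary, so connection
+# strings do not need to change after a failover.
+WRITE_DSN = (
+    "DRIVER={ODBC Driver 18 for SQL Server};"
+    f"SERVER=tcp:{FOG}.database.windows.net,1433;"
+    "DATABASE=appdb;UID=appuser;PWD=<secret>;Encrypt=yes;"
+)
+
+# The read-only listener targets a readable secondary; ApplicationIntent=ReadOnly is
+# required for the gateway to route the session to a read replica.
+READ_DSN = (
+    "DRIVER={ODBC Driver 18 for SQL Server};"
+    f"SERVER=tcp:{FOG}.secondary.database.windows.net,1433;"
+    "DATABASE=appdb;UID=appuser;PWD=<secret>;Encrypt=yes;"
+    "ApplicationIntent=ReadOnly;"
+)
+
+def execute(dsn: str, sql: str) -> None:
+    """Run a statement that does not return rows (e.g. INSERT/UPDATE)."""
+    with pyodbc.connect(dsn, timeout=5, autocommit=True) as conn:
+        conn.cursor().execute(sql)
+
+def query(dsn: str, sql: str):
+    """Run a read query and return all rows."""
+    with pyodbc.connect(dsn, timeout=5) as conn:
+        return conn.cursor().execute(sql).fetchall()
+
+# Writes go to the primary; reads are offloaded to a secondary replica.
+execute(WRITE_DSN, "UPDATE dbo.Orders SET Status = 'Shipped' WHERE OrderId = 1;")
+rows = query(READ_DSN, "SELECT TOP 10 * FROM dbo.Orders ORDER BY CreatedAt DESC;")
+```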
+
+- The Azure SQL Database Hyperscale tier, when configured with at least two replicas, has an availability SLA of 99.99%.
+
+- Compute costs associated with Azure SQL Database can be reduced using a [Reservation Discount](https://docs.microsoft.com/azure/cost-management-billing/reservations/understand-reservation-charges).
+  - It is not possible to apply reserved capacity for DTU-based databases.
+
+- [Point-in-time restore](https://docs.microsoft.com/azure/azure-sql/database/recovery-using-backups#point-in-time-restore) can be used to return a database and contained data to an earlier point in time.
+
+- [Geo-restore](https://docs.microsoft.com/azure/sql-database/sql-database-recovery-using-backups) can be used to recover a database from a geo-redundant backup.
+
+**Azure Database For PostgreSQL**
+
+- Azure Database For PostgreSQL is offered in three different deployment options:
+  - Single Server, with a 99.99% SLA.
+  - Flexible Server, which offers Availability Zone redundancy, with a 99.99% SLA.
+  - Hyperscale (Citus), with a 99.95% SLA when High Availability mode is enabled.
+
+- [Hyperscale (Citus)](https://docs.microsoft.com/azure/postgresql/tutorial-hyperscale-shard) provides dynamic scalability through sharding without application changes.
+  - Distributing table rows across multiple PostgreSQL servers is key to ensuring scalable queries in Hyperscale (Citus).
+  - Multiple nodes can collectively hold more data than a traditional database, and in many cases can use worker CPUs in parallel to optimize costs.
+
+- [Autoscale](https://techcommunity.microsoft.com/t5/azure-database-support-blog/how-to-auto-scale-an-azure-database-for-mysql-postgresql/ba-p/369177) can be configured through runbook automation to ensure elasticity in response to changing traffic patterns.
+
+- Flexible Server provides cost efficiencies for non-production workloads through the ability to stop/start the server, as well as a burstable compute tier that is suitable for workloads which do not require continuous compute capacity.
+
+- There is no additional charge for backup storage for up to 100% of total provisioned server storage.
+  - Additional consumption of backup storage is charged according to consumed GB/month.
+
+- Compute costs associated with Azure Database for PostgreSQL can be reduced using either a [Single Server Reservation Discount](https://docs.microsoft.com/azure/postgresql/concept-reserved-pricing) or a [Hyperscale (Citus) Reservation Discount](https://docs.microsoft.com/azure/postgresql/concepts-hyperscale-reserved-pricing).
+
+### Design Recommendations
+
+- Flexible Server is recommended for business-critical workloads due to its Availability Zone support.
+
+- When using Hyperscale (Citus) for business-critical workloads, enable High Availability mode to receive the 99.95% SLA guarantee.
+
+- Consider sharding to partition relational databases based on different application and data contexts, helping to navigate platform constraints and to maximize scalability, availability, and fault isolation; a minimal application-level routing sketch is provided below.
+  - This recommendation is particularly relevant when the AlwaysOn application design considers 3 or more Azure regions, since relational technology constraints can significantly hinder globally distributed data platforms.
+  - Sharding is not appropriate for all application scenarios, so a contextualized evaluation is required.
+
+- Prioritize the use of Azure SQL Database where relational requirements exist due to its maturity on the Azure platform and wide array of reliability capabilities.
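+
+A minimal sketch of the application-level shard routing referenced above is shown below. The shard map here is a hypothetical in-memory dictionary; a production implementation would typically hold the map in a dedicated shard-map store and also handle shard rebalancing and connection pooling.
+
+```python
+import hashlib
+
+# Hypothetical shard map: each shard is an independent relational database.
+SHARD_CONNECTION_STRINGS = {
+    0: "Server=tcp:shard0.database.windows.net;Database=appdb;<credentials>",
+    1: "Server=tcp:shard1.database.windows.net;Database=appdb;<credentials>",
+    2: "Server=tcp:shard2.database.windows.net;Database=appdb;<credentials>",
+}
+
+def shard_for_key(sharding_key: str) -> str:
+    """Deterministically map a sharding key (e.g. a tenant or user id) to one shard."""
+    digest = hashlib.sha256(sharding_key.encode("utf-8")).digest()
+    shard_id = int.from_bytes(digest[:4], "big") % len(SHARD_CONNECTION_STRINGS)
+    return SHARD_CONNECTION_STRINGS[shard_id]
+
+# All data access for a given tenant is routed to the same shard.
+print(shard_for_key("tenant-contoso"))
+```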
+ +**Azure SQL Database** + +- Use the Business-Critical service tier to maximize reliability and availability, including access to critical resiliency capabilities. + +- Use the vCore based consumption model to facilitate the independent selection of compute and storage resources, tailored to workload volume and throughput requirements. + - Ensure a defined capacity model is applied to inform compute and storage resource requirements. + - Consider [Reserved Capacity](https://docs.microsoft.com/azure/azure-sql/database/reserved-capacity-overview) to provide potential cost optimizations. + +- Configure the Zone-Redundant deployment model to spread Business Critical database replicas within the same region across Availability Zones. + +- Use [Active Geo-Replication](https://docs.microsoft.com/azure/azure-sql/database/active-geo-replication-overview) to deploy readable replicas within all deployment regions. + +- Use Auto Failover Groups to provide [transparent failover](https://docs.microsoft.com/azure/azure-sql/database/designing-cloud-solutions-for-disaster-recovery) to a secondary region, with geo-replication applied to provide replication to additional deployment regions for read optimization and database redundancy. + - For application scenarios limited to only two deployment regions, the use of Auto Failover Groups should be prioritized. + +- Consider automated operational triggers, based on alerting aligned to the application health model, to conduct failovers to geo-replicated instances in the event of a failure impacting the primary and secondary within the Auto Failover Group. + +> For AlwaysOn applications considering more than 4 deployment regions, serious consideration should be given to application scoped sharding or refactoring the application to support multi-region write technologies, such as Azure Cosmos DB. However, if this is not feasible within the application workload scenario, it is advised to elevate a region within a single geography to a primary status encompassing a geo-replicated instance to more evenly distribute read access. + +- Configure the application to query replica instances for read queries to optimize read performance. + +- Use Azure Monitor and [Azure SQL Analytics](https://docs.microsoft.com/azure/azure-monitor/insights/azure-sql#analyze-data-and-create-alerts) for near real-time operational insights in Azure SQL DB for the detection of reliability incidents. + +- Use Azure Monitor to evaluate usage for all databases to determine if they have been sized appropriately. + - Ensure CD pipelines consider load testing under representative load levels to validate appropriate data platform behavior. + +- Calculate a health metric for database components to observe health relative to business requirements and resource utilization, leveraging [monitoring and alerts](https://docs.microsoft.com/azure/azure-sql/database/monitor-tune-overview) to drive automated operational action where appropriate. + - Ensure key query performance metrics are incorporated so swift action can be taken when service degradation occurs. + +- Optimize queries, tables, and databases using [Query Performance Insights](https://docs.microsoft.com/azure/azure-sql/database/query-performance-insight-use) and common [performance recommendations](https://docs.microsoft.com/azure/azure-sql/database/database-advisor-find-recommendations-portal) provided by Microsoft. 
+ +- [Implement retry logic](https://docs.microsoft.com/azure/azure-sql/database/troubleshoot-common-connectivity-issues) using the SDK to mitigate transient errors impacting Azure SQL Database connectivity. + +- Prioritize the use of service-managed keys when applying server-side Transparent Data Encryption (TDE) for at-rest encryption. + - If customer-managed keys or client-side (AlwaysEncrypted) encryption is required, ensure keys are appropriately resilient with backups and automated rotation facilities. + +- Consider the use of [point-in-time restore](https://docs.microsoft.com/azure/azure-sql/database/recovery-using-backups#point-in-time-restore) as an operational playbook to recover from severe configuration errors. + +**Azure Database For PostgreSQL** + +- Use the [Hyperscale (Citus)](https://docs.microsoft.com/azure/postgresql/concepts-hyperscale-configuration-options) server configuration to maximize availability across multiple nodes. + +- Define a capacity model for the application to inform compute and storage resource requirements within the data platform. + - Consider the [Hyperscale (Citus) Reservation Discount](https://docs.microsoft.com/azure/postgresql/concepts-hyperscale-reserved-pricing) to provide potential cost optimizations. + +## Caching for Hot Tier Data + +An in-memory caching layer can be applied to enhance an AlwaysOn data platform by significantly increasing read throughput and improving end-to-end client response times for hot tier data scenarios. + +Azure provides several services with applicable capabilities for caching key data structures, with Azure Cache for Redis positioned to abstract and optimize data platform read access. This section will therefore focus on the optimal usage of Azure Cache for Redis in scenarios where additional read performance and data access durability is required. + +### Design Considerations + +- A caching layer provides additional data access durability since even in the event of an outage impacting the underlying data technologies, an application data snapshot can still be accessed through the caching layer. + +- In certain workload scenarios, in-memory caching can be implemented within the application platform itself. + +**Azure Cache for Redis** + +- Redis cache is an open source NoSQL key-value in-memory storage system. + +- The Enterprise and Enterprise Flash tiers can be deployed in an active-active configuration across Availability Zones within a region and different Azure regions through geo-replication. + - When deployed across at least 3 Azure regions and 3 or more Availability Zones in each region, with active geo-replication enabled for all Cache instances, Azure Cache for Redis provides an SLA of 99.999% for connectivity to one regional cache endpoint. + - When deployed across 3 Availability Zones within a single Azure region a 99.99% connectivity SLA is provided. + +- The Enterprise Flash tier runs on a combination of RAM and flash non-volatile memory storage, and while this introduces a small performance penalty it also enables very large cache sizes, up to 13TB with clustering. + +- With geo-replication, charges for data transfer between regions will also be applicable in addition to the direct costs associated with cache instances. + +- The Scheduled Updates feature does not include Azure updates or updates applied to the underlying VM operating system. + +- There will be a increase in CPU utilization during a scale-out operation while data is migrated to new instances. 
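+
+The connection handling practices recommended in the next section (retry logic, timeouts, and a singleton connection multiplexer) can be sketched as follows. `redis-py` is assumed as the client library, and the cache host name and access key are placeholders.
+
+```python
+from typing import Optional
+
+import redis
+
+_client: Optional[redis.Redis] = None
+
+def get_cache() -> redis.Redis:
+    """Return a process-wide singleton Redis client (redis-py pools connections internally)."""
+    global _client
+    if _client is None:
+        _client = redis.Redis(
+            host="<cache-name>.redis.cache.windows.net",
+            port=6380,                   # Azure Cache for Redis TLS endpoint
+            password="<access-key>",
+            ssl=True,
+            socket_connect_timeout=2,    # fail fast on connection issues
+            socket_timeout=2,
+            retry_on_timeout=True,       # retry commands that time out
+            health_check_interval=30,    # proactively validate pooled connections
+        )
+    return _client
+
+# Cache-aside usage with an expiration policy to avoid unbounded cache growth.
+cache = get_cache()
+cache.set("catalogue:item-1001", '{"name": "Cloud Design Patterns"}', ex=300)
+print(cache.get("catalogue:item-1001"))
+```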
+
+### Design Recommendations
+
+- Consider an optimized caching layer for 'hot' data scenarios to increase read throughput and improve response times.
+
+- Apply appropriate policies for cache expiration and housekeeping to avoid runaway data growth.
+  - Consider expiring cache items when the backing data changes.
+
+**Azure Cache for Redis**
+
+- Use the Premium or Enterprise SKU to maximize reliability and performance.
+  - For scenarios with extremely large data volumes, the Enterprise Flash tier should be considered.
+  - For scenarios where only passive geo-replication is required, the Premium tier can also be considered.
+
+- Deploy replica instances using geo-replication in an active configuration across all considered deployment regions.
+
+- Ensure replica instances are deployed across Availability Zones within each considered Azure region.
+
+- Use Azure Monitor to evaluate Azure Cache for Redis.
+  - Calculate a health score for regional cache components to observe health relative to business requirements and resource utilization.
+  - Observe and alert on key metrics such as high CPU, high memory usage, high server load, and evicted keys for insights into when to scale the cache.
+
+- Optimize [connection resilience](https://docs.microsoft.com/azure/azure-cache-for-redis/cache-best-practices-connection) by implementing retry logic, timeouts, and using a singleton implementation of the Redis connection multiplexer.
+
+- Configure [scheduled updates](https://docs.microsoft.com/azure/azure-cache-for-redis/cache-administration#schedule-updates) to prescribe the days and times that Redis Server updates are applied to the cache.
+
+## Analytical Scenarios
+
+It is increasingly common for mission-critical applications to consider analytical scenarios as a means to drive additional value from encompassed data flows. Application and operational (AIOps) analytical scenarios therefore form a crucial aspect of an AlwaysOn data platform.
+
+Analytical and transactional workloads require different data platform capabilities and optimizations for acceptable performance within their respective contexts.
+
+| | Analytical | Transactional |
+| --- | --- | --- |
+| Use Case | Analyze very large volumes of data ("big data") | Process very large volumes of individual transactions |
+| Optimized for | Read queries and aggregations over very many records | Near real-time Create/Read/Update/Delete (CRUD) queries over few records |
+| Key Characteristics | - Consolidate from data sources of record<br/>- Column-based storage<br/>- Distributed storage<br/>- Parallel processing<br/>- Denormalized<br/>- Low concurrency reads and writes<br/>- Optimize for storage volume with compression | - Data source of record for application<br/>- Row-based storage<br/>- Contiguous storage<br/>- Symmetrical processing<br/>- Normalized<br/>- High concurrency reads and writes, index updates<br/>- Optimize for fast data access with in-memory storage |
+
+Azure Synapse provides an enterprise analytical platform that brings together relational and non-relational data with Spark technologies, leveraging built-in integration with Azure services such as Azure Cosmos DB to facilitate big data analytics. The design considerations and recommendations within this section will therefore focus on optimal Azure Synapse and Azure Cosmos DB usage for AlwaysOn analytical scenarios.
+
+### Design Considerations
+
+- Traditionally, large-scale analytical scenarios are facilitated by extracting data into a separate data platform optimized for subsequent analytical queries.
+  - The Extract, Transform, and Load (ETL) pipelines used to extract data will consume throughput and impact transactional workload performance.
+  - Running ETL pipelines infrequently to reduce throughput and performance impacts will result in analytical data that is less up-to-date.
+  - ETL pipeline development and maintenance overhead increases as data transformations become more complex.
+  - For example, if source data is frequently changed or deleted, ETL pipelines must account for those changes in the target data for analytical queries through an additive/versioned approach, dump and reload, or in-place changes on the analytical data. Each of these approaches will have derivative impact, such as index re-creation or update.
+
+**Azure Cosmos DB**
+
+- Analytical queries run on Cosmos DB transactional data will typically aggregate across partitions over large volumes of data, consuming significant Request Unit (RU) throughput, which can impact the performance of surrounding transactional workloads.
+
+- The [Cosmos DB Analytical Store](https://docs.microsoft.com/azure/cosmos-db/analytical-store-introduction) provides a schematized, fully isolated, column-oriented data store which enables large-scale analytics on Cosmos DB data from Azure Synapse without impacting Cosmos DB transactional workloads.
+  - When a Cosmos DB Container is enabled as an Analytical Store, a new column store is internally created from the operational data in the Container. This column store is persisted separately from the row-oriented transaction store for the container.
+  - Create, Update and Delete operations on the operational data are automatically synced to the analytical store, so no Change Feed or ETL processing is required.
+  - Data sync from the operational to the analytical store does not consume throughput Request Units (RUs) provisioned on the Container or Database. There is no performance impact on transactional workloads, and the Analytical Store does not require allocation of additional RUs on a Cosmos DB Database or Container.
+  - Auto-Sync is the process by which operational data changes are automatically synced to the Analytical Store. Auto-Sync latency is usually less than two (2) minutes.
+  - Auto-Sync latency can be up to five (5) minutes for a Database with shared throughput and a large number of Containers.
+  - As soon as Auto-Sync completes, the latest data can be queried from Azure Synapse.
+  - Analytical Store storage uses a consumption-based [pricing model](https://azure.microsoft.com/pricing/details/cosmos-db/) which charges for the volume of data and the number of read and write operations. Analytical store pricing is separate from transactional store pricing.
+
+- Using Azure Synapse Link, the Cosmos DB Analytical Store can be queried directly from Azure Synapse.
This enables no-ETL, Hybrid Transactional-Analytical Processing (HTAP) from Synapse, so that Cosmos DB data can be queried together with other analytical workloads from Synapse in near real-time. + +- The Cosmos DB Analytical Store is not partitioned by default. + - For certain query scenarios, performance will improve by [partitioning Analytical Store](https://docs.microsoft.com/azure/cosmos-db/configure-custom-partitioning) data using keys that are frequently used in query predicates. + - Partitioning is triggered by a job in Azure Synapse that runs a Spark notebook using Synapse Link, which loads the data from the Cosmos DB Analytical Store and writes it into the Synapse partitioned store in the primary storage account of the Synapse workspace. + +- [Azure Synapse Analytics SQL Serverless pools can query the Analytical Store](https://docs.microsoft.com/azure/synapse-analytics/sql/query-cosmos-db-analytical-store) through automatically updated views or via `SELECT / OPENROWSET` commands. + +- [Azure Synapse Analytics Spark pools can query the Analytical Store](https://docs.microsoft.com/azure/synapse-analytics/synapse-link/how-to-query-analytical-store-spark-3) through automatically updated Spark tables or the `spark.read` command. + +- Data can also be [copied from the Cosmos DB Analytical Store into a dedicated Synapse SQL pool using Spark](https://docs.microsoft.com/azure/synapse-analytics/synapse-link/how-to-copy-to-sql-pool), so that provisioned Azure Synapse SQL pool resources can be used. + +- Cosmos DB Analytical Store data can be queried with [Azure Synapse Spark](https://docs.microsoft.com/azure/synapse-analytics/synapse-link/how-to-query-analytical-store-spark-3). + - Spark notebooks allow for [Spark dataframe](https://docs.microsoft.com/azure/synapse-analytics/synapse-link/how-to-query-analytical-store-spark-3#load-to-spark-dataframe) combinations to aggregate and transform Cosmos DB analytical data with other data sets, and use other advanced Synapse Spark functionality including writing transformed data to other stores or training AIOps Machine Learning models. + +[![Cosmos DB Analytical Column Store](https://docs.microsoft.com/azure/cosmos-db/media/analytical-store-introduction/transactional-analytical-data-stores.png "Cosmos DB Analytical Column Store")](./Data-Platform.md) + +- The [Cosmos DB Change Feed](https://docs.microsoft.com/azure/cosmos-db/change-feed) can also be used to maintain a separate secondary data store for analytical scenarios. + +**Azure Synapse** + +- [Azure Synapse](https://docs.microsoft.com/azure/synapse-analytics/overview-what-is) brings together analytics capabilities including SQL data warehousing, Spark big data, and Data Explorer for log and time series analytics. + - Azure Synapse uses _linked services_ to define connections to other services, such as Azure Storage. + - Data can be ingested into Synapse Analytics via Copy activity from [supported sources](https://docs.microsoft.com/azure/data-factory/copy-activity-overview?context=/azure/synapse-analytics/context/context&tabs=synapse-analytics#supported-data-stores-and-formats). This permits data analytics in Synapse without impacting the source data store, but adds time, cost and latency overhead due to data transfer. + - Data can also be queried in-place in supported external stores, avoiding the overhead of data ingestion and movement. 
Azure Storage with Data Lake Gen2 is a supported store for Synapse and [Log Analytics exported data can be queried via Synapse Spark](https://techcommunity.microsoft.com/t5/azure-monitor/how-to-analyze-data-exported-from-log-analytics-data-using/ba-p/2547888). + +- [Azure Synapse Studio](https://docs.microsoft.com/azure/synapse-analytics/overview-what-is#unified-experience) unites ingestion and querying tasks. + - Source data, including Cosmos DB Analytical Store data and Log Analytics Export data, are queried and processed in order to support business intelligence and other aggregated analytical use cases. + +[![Azure Synapse Analytics](https://docs.microsoft.com/azure/synapse-analytics/media/overview-what-is/synapse-architecture.png "Azure Synapse Analytics")](./Data-Platform.md) + +### Design Recommendations + +- Ensure analytical workloads do not impact transactional application workloads to maintain transactional performance. + +**Application Analytics** + +- Use Azure Synapse Link with Cosmos DB Analytical Store to perform analytics on Cosmos DB operational data by creating an optimized data store which will not impact transactional performance. + - Enable [Azure Synapse Link](https://docs.microsoft.com/azure/cosmos-db/configure-synapse-link#enable-synapse-link) on Azure Cosmos DB accounts. + - [Create a container enabled for Analytical Store](https://docs.microsoft.com/azure/cosmos-db/configure-synapse-link#create-analytical-ttl), or [enable an existing Container for Analytical Store](https://docs.microsoft.com/azure/cosmos-db/configure-synapse-link#update-analytical-ttl). + - [Connect the Azure Synapse workspace to the Cosmos DB Analytical Store](https://docs.microsoft.com/azure/synapse-analytics/synapse-link/how-to-connect-synapse-link-cosmos-db) to enable analytical workloads in Azure Synapse to query Cosmos DB data. Use a connection string with a [read-only Cosmos DB key](https://docs.microsoft.com/azure/cosmos-db/database-security?tabs=sql-api#primary-keys). + +- Prioritize Cosmos DB Analytical Store with Azure Synapse Link instead of using the Cosmos DB Change Feed to maintain an analytical data store. + - The Cosmos DB Change Feed may be suitable for very simple analytical scenarios. + +**AIOps and Operational Analytics** + +- Create a single Azure Synapse workspace with linked services and data sets for each source Azure Storage account to which operational data from AlwaysOn resources are sent to. + +- Create a dedicated Azure Storage account and use it as the workspace primary storage account to store the Synapse workspace catalog data and metadata. Configure it with hierarchical namespace to enable Azure Data Lake Gen2. + - Maintain separation between the source analytical data and Synapse workspace data and metadata. + - Do not use one of the regional or global Azure Storage accounts to which operational data is sent to. 
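+
+To illustrate the Synapse Link query path described above, the following sketch shows how a Synapse Spark notebook might load Cosmos DB Analytical Store data and aggregate it for an analytical or AIOps use case. The linked service, container, and column names are illustrative placeholders, and the `spark` session is assumed to be provided by the Synapse notebook environment.
+
+```python
+# Minimal sketch: query the Cosmos DB Analytical Store from a Synapse Spark notebook.
+# 'CosmosDbLinkedService' and 'OperationalData' are placeholder names for the linked
+# service and the Analytical Store enabled container configured in the Synapse workspace.
+from pyspark.sql import functions as F
+
+operational_df = (
+    spark.read.format("cosmos.olap")
+    .option("spark.synapse.linkedService", "CosmosDbLinkedService")
+    .option("spark.cosmos.container", "OperationalData")
+    .load()
+)
+
+# Example aggregation executed against the isolated column store, so no Request Units
+# are consumed on the transactional container. Column names are hypothetical.
+requests_by_region = (
+    operational_df.groupBy("region")
+    .agg(
+        F.count("*").alias("requestCount"),
+        F.avg("responseTimeMs").alias("avgResponseMs"),
+    )
+)
+
+requests_by_region.show()
+```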
+ +--- + +|Previous Page|Next Page| +|--|--| +|[Application Platform](./App-Platform.md)|[Health Modeling and Observability](./Health-Modeling.md) + +--- + +|Design Methodology| +|--| +|[How to use the AlwaysOn Design Methodology](./README.md) +|[AlwaysOn Design Principles](./Principles.md) +|[AlwaysOn Design Areas](./Design-Areas.md) +|[Application Design](./App-Design.md) +|[Application Platform](./App-Platform.md) +|[Data Platform](./Data-Platform.md) +|[Health Modeling and Observability](./Health-Modeling.md) +|[Deployment and Testing](./Deployment-Testing.md) +|[Networking and Connectivity](./Networking.md) +|[Security](./Security.md) +|[Operational Procedures](./Operational-Procedures.md) + +--- + +[AlwaysOn | Documentation Inventory](/docs/README.md) diff --git a/docs/design-methodology/Deployment-Testing.md b/docs/design-methodology/Deployment-Testing.md new file mode 100644 index 00000000..c3a28c44 --- /dev/null +++ b/docs/design-methodology/Deployment-Testing.md @@ -0,0 +1,551 @@ +# Deployment and testing + +Application outages are often caused by failed deployments or erroneous releases, which is precisely why the design of Continuous Integration and Continuous Deployment (CI/CD) pipelines to encompass deployment and testing methodologies plays such a critical role in the overall reliability of an AlwaysOn application. + +Moreover, deployment and testing should not be constrained to the delivery of planned application updates, but instead should form the basis for how all application and infrastructure operations are conducted to ensure consistent outcomes for mission-critical workloads. This variety of deployment contexts covering a wide gamut of activities results in a frequent, and often daily, deployment cadence. There is therefore a clear need for the design of CI/CD pipelines to exhibit maximum reliability, since they perform a critical operational function for an AlwaysOn application through: + +- Rigorous Pre-Release Testing: updates should not introduce defects, vulnerabilities, or anything that will jeopardize application health. +- Transparent Deployments: all clients and users should be able to continue application interaction without interruption using a zero-downtime deployment approach. +- Highly Available Operations: deployment and testing processes must themselves be highly available to support overall application reliability. +- End-to-End Automation: manual intervention in the technical execution of deployment and testing operations represents a significant reliability risk. +- Consistent deployment process: Same application artifacts and processes are used to deploy the infrastructure and application code across different environments. + +This section will therefore focus on how to eradicate downtime and maintain application health for deployment operations, providing key considerations and recommendations intended to inform the design of optimal CI/CD pipelines for an AlwaysOn application. 
+ +- [Application Environments](#application-environments) +- [Ephemeral Blue/Green Deployments](#ephemeral-bluegreen-deployments) +- [Infrastructure-As-Code Deployments](#infrastructure-as-code-deployments) +- [DevOps Tooling](#devops-tooling) +- [Branching Strategy](#branching-strategy) +- [Container Registry](#container-registry) +- [Secret Management](#secret-management) +- [Testing](#testing) +- [AI for DevOps](#ai-for-devops) + +## Application Environments + +Before considering deployment processes and associated tooling, it is important to evaluate the application environments required to appropriately validate and stage deployment operations. These environment types will ultimately differ in terms of requisite capabilities and longevity, with some environments reflecting production on a permanent basis, whilst others may be short lived with a reduced level of complexity. Moreover, these environments will be staged during the engineering and release cycle in order to ensure deployment operations are fully tested before released into the production environment. + +This section will therefore explore key considerations and recommendations for application environments in an mission-critical context, covering key design objectives such as developer agility and separation of concerns. + +### Design Considerations + +**Development Environments** + +- Development environments will typically not share the same reliability, capacity, and security requirements as the production AlwaysOn environment. + +- Given the reduced scale, reliability, and security requirements of a development environment, they can more easily coexist within a single subscription. + +- It is likely that engineering teams will require multiple development environments to support the completion of parallel feature development. + +- Development environments must be available when required, but need not exist permanently and typically only exist for short periods of time. + - Keeping environments short lived saves costs, and prevents configuration drift from the code base. + - Development environments often share the lifecycle of a feature branch. + +- Development environments can also encompass the development of Infrastructure-as-Code (IaC) artifacts such as Terraform or Azure Resource Manager (ARM) templates. + +**Staging Environments** + +- Staging environments can vary depending on their intended function within the release cycle. + - They will typically more closely resemble the requirements of the production environment for reliability, capacity, and security. + +- Staging environments can be used for a variety of purposes, but will typically focus on testing and validation, with a multitude of test cycles considered. + - Load and performance testing. + - Chaos testing. + - Integration and build verification testing. + - User acceptance testing. + - Security and penetration testing. + +- Different test functions can be performed within the same environment, and in some cases this will be required. + - For example, for chaos testing to provide meaningful results, the application must first be placed under load to be able to understand how the application responds to injected faults. + - Chaos testing and load testing are therefore typically performed in parallel. + +- During early development cycles and in absence of a production load, a constant synthetic user load against an environment provides realistic metrics and with that valuable health modeling input. 
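+
+As an illustration of the constant synthetic user load noted above, the sketch below models a simple, continuously running user flow with the open-source Locust framework. This is an assumed tooling choice for illustration only (the AlwaysOn reference implementation ships its own user load generator), and the API routes are placeholders.
+
+```python
+# synthetic_load.py - a minimal synthetic user profile for a staging environment.
+from locust import HttpUser, between, task
+
+
+class SyntheticCatalogUser(HttpUser):
+    """Simulates a lightweight, constant browsing flow to generate baseline telemetry."""
+
+    wait_time = between(1, 5)  # seconds between requests, to mimic realistic pacing
+
+    @task(3)
+    def list_items(self):
+        # Hypothetical API route - replace with a real application endpoint.
+        self.client.get("/api/catalog/items")
+
+    @task(1)
+    def view_item(self):
+        self.client.get("/api/catalog/items/42", name="/api/catalog/items/{id}")
+```
+
+Such a profile could be run continuously against a pre-production environment, for example with `locust -f synthetic_load.py --host https://staging.contoso.example --users 50 --spawn-rate 5`, where the host and user counts are again placeholder values.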
+ +**Production Environments** + +- Some applications may consider multiple different production environments to cater to different clients, users, or business functionality. + +### Design Recommendations + +- Ensure all environments reflect the production environment as much as possible, with simplifications applied for lower environments as necessary. + +- Separate production environments from lower environments into a dedicated subscription. This helps to ensure resource utilization in lower-environments does not impact production quotas, and to provide a clear governance boundary and separation of concerns. + - Depending on the scale requirements of the application, multiple production subscriptions might be needed to serve as scale-units. + +- Separate development environments within a distinct subscription context, with all development environments sharing the same subscription. + - Ensure that there is an automated process to deploy code from a feature branch to a development environment. + - Treat development environments as ephemeral, sharing the lifecycle of the associated feature branch. + +- Define the number of staging environments and their purpose within the development and release cycle. + +- Avoid sharing components between environments. + - Possible exceptions are downstream security appliances like firewalls, or source locations for synthetic test data. + +- Ensure at least one staging environment is fully reflective of production to enable production-like testing and validation. + - Capacity within this pre-production environment can flex based on the execution of test activities. + - Use of a constant synthetic user load generation is required to provide a realistic test bed for changes on one of the pre-production environments. + - The AlwaysOn [foundational-online](https://github.com/Azure/AlwaysOn-Foundational-Online) reference implementation provides an example [user load generator](https://github.com/Azure/AlwaysOn-Foundational-Online/src/testing/userload-generator/README.md). + +[![AlwaysOn Azure Subscription Organization](/docs/media/alwayson-subscription-organization.png)](./Deployment-Testing.md) + +## Ephemeral Blue/Green Deployments + +To achieve zero interruptions while performing deployments to an AlwaysOn application, it is strongly recommended to adopt a blue/green deployment approach for production environments in conjunction with ephemeral resources. This allows new application code or resources to be deployed and tested in a new parallel environment, with traffic only transitioned once ready in a phased process before subsequently decommissioning the old environment. + +### Design Considerations + +- A blue/green deployment approach requires a minimum of 2 identical deployment contexts, where an existing deployment (blue) is actively serving user traffic, and a new secondary deployment (green) is established and made ready to receive traffic. + - Once the new deployment is completed and tested, traffic is gradually switched from the blue deployment to the green. + - If the load transfer is successful, that new deployment becomes the new 'active' production environment and the old, now 'inactive' deployment can be decommissioned. + - If there are issues within the new deployment environment, the deployment can be aborted and traffic can either remain in the old 'active' deployment, or be directed back to it. 
+  - This provides a clear fallback plan and minimizes potential for reliability issues, such as having to cut production traffic to rectify faulty deployment issues.
+
+- A blue/green deployment can be implemented at either an application level or at the infrastructure level.
+
+- **Application Level**: New code is deployed to a staging location within the existing infrastructure.
+  - For example, Azure App Service provides this capability through secondary deployment slots which can be swapped after the deployment, while in AKS this can be achieved using a separate pod deployment on each node and updating the service definition.
+  - This approach incurs lower costs and is faster than a full infrastructure-level blue/green deployment.
+
+- **Infrastructure Level**: A deployment containing all infrastructure **and** application components within a deployment scope.
+  - Completely new Azure resources, such as the AKS Cluster and Event Hub in the case of the foundational reference implementation, are established before subsequently deploying application code to the new infrastructure. When the new deployment has been fully tested and validated, traffic can be transitioned through a phased process, and the old infrastructure can then be decommissioned when appropriate.
+  - The advantage of this approach is that all changes within the deployment scope are fully deployed and tested in production before traffic is transitioned between the environments. It is also a much safer approach for any infrastructure-level changes within a release.
+  - Individual deployments may take longer to complete using this methodology, since it takes longer to deploy the infrastructure and application than to deploy the application in isolation.
+  - There is an additional cost associated with an infrastructure-based approach, since two deployment contexts must exist side by side until the deployment is fully complete.
+
+> This infrastructure blue/green approach allows all changes within a deployment scope, both to the infrastructure and the application, to be achieved with zero downtime and maximum confidence. In addition, compatibility with downstream dependencies, such as the Azure platform, resource providers, or IaC modules, can be validated.
+
+- The blue and green environments can be long-living and reused for each deployment, or, as is recommended, treated as ephemeral with new infrastructure deployed for each new release.
+
+- At an infrastructure level, the orchestration of user traffic between the blue and green environments can be controlled using a global load balancer, such as Azure Front Door.
+
+### Design Recommendations
+
+- Utilize a blue/green deployment approach to release all production changes.
+  - Prioritize an infrastructure-level approach in order to achieve zero-downtime deployments and provide one consistent deployment strategy for any kind of change (application-level and/or infrastructure-level).
+  - Use a global load balancer to orchestrate the automated transition of user traffic between the blue and green environments.
+  - Add a green backend endpoint using a low traffic volume/weight, such as 10%.
+  - After verifying that the low traffic volume on green is being handled as expected and application health is maintained, the traffic can be gradually increased in increments until it reaches 100%.
+  - Whilst increasing traffic, a short ramp-up period should be applied to catch faults which may not come to light immediately.
+  - Once all traffic has been migrated to the new green environment, remove the blue backend from the global load balancer service.
+  - Decommission the old and inactive blue environment.
+  - Repeat the process for the next deployment with blue and green reversed.
+
+- While blue and green environments can be reused, it is strongly recommended to deploy new infrastructure for each new deployment.
+  - Treat each regional deployment stamp as ephemeral, with a lifecycle tied to that of a single release.
+
+- Decommission the old and inactive 'blue' environment, ensuring that any connections established while this environment was active are also closed and any queues are drained before removing associated resources.
+  - This will save costs relative to maintaining secondary production infrastructure and will ensure new environments are completely free of configuration drift.
+
+- To prevent downtime, the process to control the transition of traffic between environments should be fully automated.
+
+- Phase the transition of traffic between the blue/green environments to minimize client and user exposure whilst confidence is established in the new environment.
+
+- Allow for a short ramp-up period when transitioning traffic between blue/green environments in order to catch faults which may not come to light immediately.
+
+#### Zero-Downtime Deployment Reference
+
+Achieving zero-downtime deployments is a fundamental goal of an AlwaysOn application, but it is ultimately a complex problem which requires significant engineering investment and greatly influences the overall design. It is therefore critical to invest effort up-front to define and plan deployment processes, to drive key design decisions such as whether to treat resources as ephemeral.
+
+The [foundational-online](https://github.com/Azure/AlwaysOn-Foundational-Online) and [foundational-connected](https://github.com/Azure/AlwaysOn-Foundational-Connected) reference implementations serve as practical examples for these concepts and recommendations, establishing an optimized zero-downtime deployment approach as represented in the illustration below.
+
+[![Zero-Downtime DevOps Pipeline Reference](/docs/media/alwayson-zero-downtime-pipeline.png "Zero-Downtime DevOps Pipeline Reference")](./Deployment-Testing.md)
+
+## Infrastructure-As-Code Deployments
+
+The recommended infrastructure-level blue/green deployment approach is underpinned by the principle of Infrastructure-as-Code (IaC), with fully automated and consistent infrastructure deployments.
+
+### Design Considerations
+
+- The principle of Infrastructure-as-Code (IaC) treats infrastructure definitions as source code that is version controlled alongside other application artifacts.
+  - Utilizing IaC ensures code consistency across environments and reduces the risk of human error during automated deployments, as well as providing traceability and rollback.
+
+- Typically, an AlwaysOn IaC repository contains two types of resource definition:
+  - Global Resources: those that are deployed once within the solution, such as Azure Front Door and Azure Cosmos DB.
+  - Regional (*Stamp*) Resources: those that are deployed as part of each regional deployment stamp, such as the AKS Cluster and Event Hub.
+
+### Design Recommendations
+
+- Apply the concept of 'Infrastructure-as-Code' (IaC) and ensure all Azure resources are defined in declarative templates and maintained in a source control repository, from where they can be deployed automatically using CI/CD pipelines.
+
+- Define infrastructure artifacts as declarative templates, and not as imperative scripts.
+
+- Ensure the deployment of both infrastructure and application components is fully automated.
+
+- Prohibit manual operations against production as well as lower environments. The only exception should be fully independent developer environments.
+
+## DevOps Tooling
+
+A myriad of different products and services can provide the necessary DevOps capabilities to effectively deploy and manage an AlwaysOn application; Microsoft provides two Azure-native toolsets through GitHub _Actions_ and Azure DevOps (ADO) _Pipelines_.
+
+The appropriate and effective use of deployment tooling is critical to ensure overall reliability for an AlwaysOn application, particularly since DevOps processes provide such a significant function within the overall application design. For example, failover and scale operations may depend on automation provided by DevOps tooling. Deployment tooling must therefore be implemented in a reliable and highly available manner, with engineering teams understanding the application impact if the deployment service, or parts of it, become unavailable.
+
+This section will therefore focus on the optimal use of GitHub Actions and Azure DevOps Pipelines, and on the decision factors influencing the selection of DevOps tooling.
+
+### Design Considerations
+
+- The capabilities of GitHub _Actions_ and Azure DevOps (ADO) _Pipelines_ are largely overlapping.
+
+- Different technologies can be used simultaneously to utilize the best features of different technologies in parallel.
+  - A common approach is to hold code repositories in GitHub.com or GitHub AE whilst using the deployment pipelines in ADO.
+  - It should be noted that the use of multiple technologies adds an element of complexity and impacts the risk landscape.
+
+**Azure DevOps Pipelines**
+
+- Azure DevOps Pipelines provides highly mature deployment pipelines, including features like gates and approvals.
+
+- ADO instances are hosted in a single Azure region which is chosen [at organization-level](https://docs.microsoft.com/azure/devops/organizations/accounts/change-organization-location?view=azure-devops).
+  - Data is replicated across regions, but only for Disaster Recovery purposes.
+  - Hosted build agents are utilized from the same region as the ADO instance.
+
+- In the context of the AlwaysOn aspiration for maximum reliability, the dependency on a single Azure region represents an operational risk.
+  - For example, consider a scenario where traffic is spread over West Europe and North Europe, with West Europe hosting the ADO instance. If West Europe experiences an outage, the ADO instance would also be affected. While North Europe would now automatically handle all application traffic, the ability to deploy additional scale-units to North Europe, in order to provide a consistent failover experience, would be prevented, which may result in a severely degraded application experience until the issue is resolved.
+
+**GitHub Actions**
+
+- GitHub.com is well known and adopted by developers and used for many open source projects.
+
+- GitHub.com instances are also hosted in a single Azure region.
+  - Data is replicated across regions, but only for Disaster Recovery purposes.
+
+- A private and dedicated [GitHub AE](https://docs.github.com/en/github-ae@latest/admin/overview/about-github-ae) offering is available in a limited public preview.
+
+- GitHub Actions is still a fairly new service, but is already well suited for build-related tasks (Continuous Integration).
+
+- GitHub Actions is less mature when it comes to deployment tasks (Continuous Deployment).
+  - Templating and reuse of pipeline steps is limited.
+  - Gates and approval options are limited.
+  - Options to control pipeline execution, such as the exclusion of specific stages, are missing.
+
+### Design Recommendations
+
+- Define an availability SLA for deployment tooling and ensure alignment with broader application reliability requirements.
+
+- In a multi-region scenario with an active-passive or active-active application deployment configuration, ensure that failover orchestration and scaling operations can continue to function even if the primary region hosting deployment toolsets becomes unavailable.
+
+## Branching Strategy
+
+Branching strategies are a fundamental aspect of application source control, and while there are many valid approaches to branching, there are several key aspects that should be considered in the context of an AlwaysOn application scenario to ensure maximum reliability for mission-critical workloads.
+
+### Design Considerations
+
+- Developers will carry out their work in _feature/*_ and _fix/*_ branches, and these are the entry points for changes.
+
+- Restrictions can be applied to branches as part of the branching strategy, such as only allowing administrators to create release branches, or enforcing naming conventions for branches.
+
+- There might be rare occasions where a hotfix is urgently required, applied to an existing release branch, and subsequently deployed to the production environment. Examples of hotfixes include critical security updates or the remediation of issues breaking the user experience. Typically, these hotfixes are created on a _fix/*_ branch and merged into the release branch. It is essential that the change is also brought into _main_ as soon as practical, so that it is part of all future releases and any reoccurrence of the issue is avoided. This process must only be used for small changes addressing urgent issues, and with restraint.
+
+### Design Recommendations
+
+- Prioritize the use of [GitHub for source control](https://docs.github.com/en/code-security/supply-chain-security/managing-vulnerabilities-in-your-projects-dependencies/about-managing-vulnerable-dependencies).
+
+- Create a branching strategy that details _feature_ work and _releases_ as a minimum, using branch policies and permissions to ensure the strategy is appropriately enforced.
+
+- When feature branch changes are pushed to _origin_, trigger an automated testing process to validate the legitimacy of code contributions before any Pull Request (PR) can be completed.
+  - Ensure any PR requires the review of at least one other team member before merging.
+
+- Treat the _main_ branch as a continuously forward-moving and stable branch, primarily used for integration testing.
+  - Ensure changes are only made to _main_ via PRs, using a branch policy to prohibit direct commits.
+  - Every time a PR is merged into _main_, it should automatically kick off a deployment against an integration environment.
+  - _main_ should be considered stable and safe to create a release from at any given time.
+
+- Consider the use of dedicated _release/*_ branches, created from the _main_ branch and used to deploy to production environments.
+ - _release/*_ branches should remain in the repository and can be used to patch a release. + +- Define and document a hotfix process and apply it only when needed. + - Create hotfixes in a _fix/*_ branch for subsequent merging into the release branch and deployment to production. + - Ensure any changes are brought into _main_ as soon as practical so that they are reflected in all future releases to avoid reoccurrence of the issue. + - A hotfix process should only be used with restraint for small changes addressing urgent issues; almost all operational issues should follow the standard operating procedure and CI/CD DevOps processes. + +## Container Registry + +Container registries are a key aspect for any containerized application, providing hosting for container images deployed to container runtime environments, such as AKS. There is a wide variety of container registry technologies available that predominantly rely on the Docker-provided format and standards for both push and pull operations. + +This section will therefore examine the optimal configuration of container registries, focusing on the native Azure Container Registry service, while also exploring the trade-offs associated with centralized and federated deployment models. + +### Design Considerations + +- Since most container registry solutions rely on the Docker-provided format and standards for both push and pull operations, they are are broadly compatible and mostly interchangeable. + +- Container registries can sometimes be deployed either as a centralized service that is shared and consumed by numerous applications within an organization, or a separate application component dedicated to a specific application workload. + +- Some application scenarios will require public container images be replicated within a private container registry to limit egress traffic, increase availability, or avoid potential throttling. + +**Public Registries - Docker Hub** + +- Container images stored on Docker Hub, or other public registries, exist outside of Azure and a given virtual network. This is not necessarily a problem, but in certain scenarios can lead to a variety of potential issues where service unavailability, throttling and data exfiltration are concerned. + +**Azure Container Registry (ACR)** + +- [Azure Container Registry (ACR)](https://azure.microsoft.com/services/container-registry/) provides an Azure-native service with a range of features including geo-replication, Azure AD authentication, automated container building, and patching using ACR tasks. + +- ACR supports High Availability through [Geo-replication](https://docs.microsoft.com/azure/container-registry/container-registry-geo-replication#considerations-for-high-availability) to multiple configured regions, providing resiliency against regional outage. If a region becomes unavailable, the other regions will continue to serve image requests, and when the region returns to health the ACR will recover and replicate changes to it. + - This capability also provides registry colocation within each configured region, reducing network latency and cross-region data transfer costs. + +- Within Azure regions which provide Availability Zone support, the [Premium ACR tier supports Zone Redundancy](https://docs.microsoft.com/azure/container-registry/zone-redundancy) to protect against zonal failure. 
+ +- [Tagged ACR images are mutable by default](https://docs.microsoft.com/azure/container-registry/container-registry-image-lock#scenarios), meaning that the same tag can be used on multiple images pushed to the registry. + - In production scenarios, this may lead to unpredictable behavior which could impact application uptime. + +- ACR supports [locking an image version or a repository](https://docs.microsoft.com/azure/container-registry/container-registry-image-lock) to prevent changes or deletes. + - Image Locking mitigates multiple failure scenarios and also protects against a previously-deployed image *version* being changed in-place, which would introduce the risk that same-version deployments may have different results (before and after such a change). + - Locking Container Images does not protect against the ACR instance being deleted, but [Azure Resource Locks](https://docs.microsoft.com/azure/azure-resource-manager/management/lock-resources) can be used to achieve this. + +- ACR in Premium tier also offers support to restrict a container registry to a given set of virtual networks and subnets through [Private Endpoints](https://docs.microsoft.com/azure/container-registry/container-registry-private-link). + +### Design Recommendations + +- For AlwaysOn application scenarios, employ container registry instances that are dedicated to the application workload. + - Avoid taking a dependency on a centralized service unless availability and reliability requirements are in full alignment with the application. + +- When using container registries outside Azure, ensure that the provided SLA is aligned with the reliability and security targets. + - Take special note of throttling limits, e.g. when relying on Docker Hub. + +- Leverage Azure Container Registry to host container images. + +**Azure Container Registry (ACR)** + +- Treat container registries as 'global resources' with a sustained lifecycle ('long-living'). + - Consider a single global container registry per environment, such as the use of a global production registry. + +- Configure geo-replication to all considered deployment regions in order to remove regional dependencies and optimize latency. + - Images should be hosted geographically as close as possible to the consuming compute resources, within the same Azure regions. + - Prioritize regions with Availability Zone support to take advantage of zonal redundancy capabilities. + +- Use Azure AD integrated authentication to push and pull images instead of relying on access keys. + - For optimal security, fully disable the use of the admin access key. + +## Secret management + +Secret management is a key technical domain in the context of both security and reliability, since the secret management solution for an AlwaysOn application must provide requisite security and also offer an appropriate level of availability to align with maximum reliability aspirations. + +### Design considerations + +- There are a multitude of key and secret management solutions available that can be leveraged on Azure. + +- Azure Key Vault provides a fully-managed Azure-native PaaS solution. + - Provides native integration with Azure services out-of-the-box. + - Supports Availability Zone deployments and multi-region redundancy. + - Offers direct integration with Azure AD for authentication and authorization. + +- Many Azure services already support Azure AD authentication instead of relying on connection strings / keys. Doing so greatly reduces the need to managed secrets in the first place. 
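+
+As a minimal sketch of the Azure AD authentication approach noted above, the following uses `DefaultAzureCredential` from the Azure SDK for Python, so that a managed identity is used when running on Azure with a fallback to developer credentials locally. The vault URL and secret name are placeholders; where the downstream service itself supports Azure AD, the credential can be passed to that service's client directly and the secret avoided altogether.
+
+```python
+# Sketch: authenticate with Azure AD instead of storing connection strings or access keys.
+from azure.identity import DefaultAzureCredential
+from azure.keyvault.secrets import SecretClient
+
+# Uses managed identity on Azure, and environment or developer credentials elsewhere.
+credential = DefaultAzureCredential()
+
+secret_client = SecretClient(
+    vault_url="https://contoso-alwayson-kv.vault.azure.net",  # placeholder vault URL
+    credential=credential,
+)
+
+# Retrieve a secret that cannot be replaced by Azure AD auth (e.g. a third-party API key).
+external_api_key = secret_client.get_secret("external-api-key").value
+```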
+
+There are three common approaches for defining at what point secrets are read from the selected secret store and injected into the application:
+
+**Deployment-Time Retrieval**
+
+- Retrieving secrets at deployment time provides the advantage that the secret management solution only needs to be available at deployment time, since there are no direct dependencies after this point.
+  - For example, injecting secrets as environment variables into a Kubernetes deployment or into a Kubernetes secret.
+
+- Only the deployment service principal needs to be able to access secrets, which simplifies RBAC permissions within the secret management system.
+  - It does, however, introduce additional RBAC considerations within the DevOps tooling, around controlling service principal access, and within the application, in terms of protecting retrieved secrets.
+
+- This method introduces a trade-off: the security benefits of the secret management solution are not utilized, since this design approach relies solely on access control within the application platform to keep secrets safe.
+
+- Secret updates or rotation will require a full redeployment in order to take effect.
+
+**Application start-up retrieval**
+
+- Retrieving and injecting secrets at application start-up provides the benefit that secrets can more easily be updated or rotated.
+  - A restart of the application is required to fetch the latest value.
+
+- This method ensures that secrets do not need to be stored on the application platform but can be held in memory only.
+  - For AKS, available implementations for this approach include the [CSI SecretStore driver for KeyVault](https://azure.github.io/secrets-store-csi-driver-provider-azure/) and [akv2k8s](https://akv2k8s.io/).
+  - A native Azure solution, [Azure Key Vault referenced App Settings](https://docs.microsoft.com/azure/app-service/app-service-key-vault-references), is also available.
+
+- The disadvantage of this approach is that it creates a runtime dependency on the secret management solution.
+  - If the secret management solution experiences an outage, application components already running **may** be able to continue serving requests; however, any restart or scale-out operations will likely result in failure.
+
+**Runtime retrieval**
+
+- Retrieving secrets at runtime from within the application itself is the most secure approach, since even the application platform never has access to secrets.
+
+- Application components require a direct dependency and a connection to the secret management system.
+  - This makes it harder to test components individually and usually requires the use of an SDK.
+
+- The application itself needs to be able to authenticate to the secret management system.
+  - For AKS, the latter can be achieved using [Pod-managed Identities](https://docs.microsoft.com/azure/aks/use-azure-ad-pod-identity), but that is currently (as of August 2021) still in preview.
+
+### Design recommendations
+
+- Where possible, use Azure AD authentication to connect to other services instead of using connection strings or keys.
+  - Use this in conjunction with Azure Managed Identities to remove the need for any secrets to be stored on the application platform.
+
+- Use Azure Key Vault to store all application secrets.
+
+- Azure Key Vault instances should be deployed as part of each regional stamp, so that the impact of a failure is limited to a single deployment stamp.
+  - AlwaysOn 'global' resources, such as Front Door (for certificate storage, if required), should leverage a separate Azure Key Vault instance dedicated to global resources, rather than using one of the regional Key Vault instances.
+
+- Use managed identities instead of service principals to access Key Vault whenever possible.
+
+- Secrets should be retrieved at application start-up, not during deployment time or at runtime.
+
+- Implement coding patterns so that when an authorization failure occurs at runtime, secrets are re-retrieved.
+
+- Apply a fully automated key-rotation process that runs periodically within the solution.
+  - Use [key near-expiry notification](https://docs.microsoft.com/azure/key-vault/keys/how-to-configure-key-rotation#configure-key-near-expiry-notification) to get alerted about upcoming expirations.
+
+## Testing
+
+As previously stated, testing is a fundamental activity for any AlwaysOn solution, to fully validate the health of both the application code and infrastructure. More specifically, to satisfy desired standards for reliability, performance, availability, security, quality, and scale, testing must be well defined and applied as a core component of the application design and DevOps methodologies.
+
+Testing is ultimately a key concern for both the local developer experience ("[Inner Loop](https://docs.microsoft.com/dotnet/architecture/containerized-lifecycle/design-develop-containerized-apps/docker-apps-inner-loop-workflow)") and the complete DevOps lifecycle ("[Outer Loop](https://docs.microsoft.com/dotnet/architecture/containerized-lifecycle/docker-devops-workflow/docker-application-outer-loop-devops-workflow)"), which captures when developed code begins release pipeline processes on its journey to a production environment.
+
+The scope of this section focuses on testing conducted within the outer loop for a product release, considering a variety of test scenarios, such as unit, build, static, security, integration, regression, UX, performance, capacity and failure injection (chaos) testing. The order of conducted tests is also a critical consideration due to various dependencies, such as the need to have a running application environment.
+
+### Design considerations
+
+- With high degrees of deployment automation, automated testing is essential to validate application or infrastructure changes in a timely and repeatable manner.
+
+- The purpose of testing is ultimately to detect errors and issues before they reach production environments, and there are a variety of methods which are required to holistically achieve this goal.
+
+**Unit testing**
+
+- Unit testing is intended to confirm that application business logic works as expected.
+  - It also improves confidence in the overall effect of code changes.
+
+- Unit testing is typically considered as part of the Inner Loop and as such is not a primary focus for this section.
+
+**Smoke testing**
+
+- Smoke testing is used to identify whether infrastructure and application components are available and act as expected.
+  - A smoke test focuses on functionality rather than performance under load.
+  - Typically only a single virtual user session is tested.
+  - Common smoke testing scenarios include: interrogating the HTTPS endpoint of a web application, querying a database, and simulating a user flow in the application.
+  - The outcome of a smoke test should be that the system responds with expected values and behavior.
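+
+A minimal smoke test sketch is shown below, written in Python with the `requests` library and pytest-style test functions (an assumed tooling choice); the base URL and API routes are placeholders for real application endpoints, and the checks deliberately stay at the level of availability and basic behavior rather than performance.
+
+```python
+# smoke_test.py - verify the deployed endpoint responds and a simple read-only flow works.
+import requests
+
+BASE_URL = "https://staging.contoso.example"  # placeholder environment endpoint
+
+
+def test_https_endpoint_is_reachable():
+    response = requests.get(BASE_URL, timeout=10)
+    assert response.status_code == 200
+
+
+def test_catalog_read_flow():
+    # Simulate a single virtual user: list items, then fetch the first item's details.
+    items = requests.get(f"{BASE_URL}/api/catalog/items", timeout=10)
+    assert items.status_code == 200
+    first_item = items.json()[0]
+    detail = requests.get(f"{BASE_URL}/api/catalog/items/{first_item['id']}", timeout=10)
+    assert detail.status_code == 200
+```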
+
+**UI testing**
+
+- UI testing validates that application user interfaces are deployed and functioning as expected.
+  - UI testing is similar to smoke testing, but it is focused on user interface interactions.
+
+- UI automation tools can and should be used to drive automation.
+  - During a UI test, a script will mimic a realistic user scenario and follow a series of steps to execute actions and achieve an intended outcome.
+
+**Load testing**
+
+- Load testing is designed to validate scalability and application operation under load, through a rapid and/or gradual increase in application test load until a threshold/limit is reached.
+  - Load tests are typically designed around a particular user flow or scenario, in order to verify that application requirements are satisfied under a defined load.
+
+- Azure services have different soft and hard limits associated with scalability, and load testing can reveal if a system faces a risk of exceeding them during the expected production load.
+
+- Load testing can be used to fine-tune auto-scaling capabilities for services that provide automated scalability (e.g. to set appropriate measured thresholds).
+  - For services that do not provide native auto-scaling, established automated operational procedures can also be fine-tuned through load testing.
+
+**Stress testing**
+
+- Stress testing is a type of negative testing which applies activities aimed at overloading existing resources, in order to understand where solution limits exist and to ensure the system's ability to recover gracefully.
+
+- During a stress test, it is essential to monitor all components of the system in order to identify potential bottlenecks.
+
+- Any component of the system that is unable to scale appropriately can turn into a limitation, such as active/passive network components or databases.
+  - It is important to understand their limits so that effort can be applied to mitigate potential impact.
+
+- Unlike load testing, stress tests do not adhere to a realistic usage pattern, but aim to identify performance and scale limits.
+
+- An alternative approach is to limit (or scale down) the computing resources of the system and monitor how it behaves under load and whether it is able to recover.
+
+**Performance testing**
+
+- Performance testing combines aspects of *load* and *stress testing* to validate performance under load, and establish benchmark behaviors for application operation.
+
+**Failure injection (chaos) testing**
+
+- Chaos testing introduces artificial failures to the system to validate how the system reacts and the effectiveness of resiliency measures, operational procedures, and mitigations.
+
+- An AlwaysOn application should be resilient to infrastructure and application failures, so introducing faults in the application and underlying infrastructure and observing how the application behaves is essential to achieve confidence in the solution's redundancy mechanisms and to validate that it can indeed operate as an 'always on' application.
+  - Shutting down infrastructure components, purposely degrading performance, or introducing application faults are examples of test scenarios which can be used to verify that the application will react as expected when such situations occur for real.
+
+- [Azure Chaos Studio](https://azure.microsoft.com/services/chaos-studio/) provides an Azure-native suite of chaos experimentation tools to easily conduct chaos experiments and inject faults within Azure services and application components.
+  - Provides built-in chaos experiments for common fault scenarios, providing a growing set of 'behind the curtain' experiments for underlying and abstracted components of Azure services.
+  - Supports custom experiments targeting infrastructure and application components.
+
+**Security (penetration) testing**
+
+- Penetration testing is used to ensure that an application and its environment satisfy an expected security posture.
+
+- Penetration tests will probe the application and its environment for security vulnerabilities.
+
+- Security testing can encompass the end-to-end software supply chain and package dependencies, with scanning and monitoring for known Common Vulnerabilities and Exposures (CVE).
+
+### Design recommendations
+
+- All testing of both infrastructure and application components should be fully automated to ensure consistency.
+
+- All test artifacts should be treated as code, maintained within the source control system, and version controlled along with other application code artifacts.
+
+- The results of the tests should be captured and analyzed both as individual test results and in aggregate, to assess trends over time.
+  - Test results should be continually evaluated for accuracy and coverage.
+
+- The availability of test infrastructure should be aligned with the SLA for deployment and testing cycles.
+
+- Use PaaS CI/CD orchestration platforms, such as Azure DevOps or GitHub Actions, to orchestrate and execute tests where possible.
+
+- Execute smoke tests as part of every deployment.
+
+- Run extensive load tests, along with stress and chaos testing, as part of every deployment to validate that application performance and operability are maintained.
+  - Use load profiles that are reflective of real peak usage patterns.
+  - Run chaos experiments and failure injection tests at the same time as load tests.
+
+- If database interactions are required for load or smoke tests (e.g. to create new records), use test accounts with reduced privileges and make test data separable from real user content.
+
+- Tests with shorter execution times should generally run earlier in the cycle where possible, to increase testing efficiency.
+
+- Scan and monitor the end-to-end software supply chain and package dependencies for known CVEs.
+  - Use [Dependabot](https://docs.github.com/en/code-security/supply-chain-security/keeping-your-dependencies-updated-automatically/about-dependabot-version-updates) for GitHub repositories to ensure the repository automatically keeps up-to-date with the latest releases of packages and applications it depends on.
+
+## AI for DevOps
+
+AIOps methodologies can be applied within CI/CD pipelines to supplement traditional testing approaches, providing capabilities to detect likely regressions or degradations, and allowing deployments to be pre-emptively stopped to prevent potential negative impact.
+
+### Design Considerations
+
+- CI/CD pipelines and DevOps processes will expose a wide variety of telemetry for machine learning models, from test results and deployment outcomes to operational data of test components from composite deployment stages.
+  - CI/CD pipelines will include various types of automated testing, such as unit, smoke, performance, load, and chaos tests.
+
+- Changes in a deployment will need to be stored in a manner suitable for automated analysis and correlation to deployment outcomes.
+ +- Traditional data processing approaches such as Extract, Transform, and Load (ETL) may not be able to scale throughput to keep up with growth of deployment telemetry and application observability data. + - Modern analytics approaches which do not require ETL and data movement, such as data virtualization, can be used to enable ongoing analysis by AIOps models. + +### Design Recommendations + +- Define what DevOps process data will be collected and how it will be analyzed. + - Expose application observability data from staged test environments and the production environment for analysis and correlation within AIOps models. + - Gather deployment telemetry from DevOps processes, such as test execution metrics and time series data of changes within each deployment. + +- Adopt the [MLOps Workflow](https://azure.microsoft.com/services/machine-learning/mlops/). + +- Develop analytical models that are context-aware and dependency-aware to provide predictions along with automated feature engineering to address schema and behavior changes. + +- Operationalize models by registering and deploying the best trained models within deployment pipelines. + +--- + +|Previous Page|Next Page| +|:--|:--| +[Health Modeling and Observability](./Health-Modeling.md)|[Networking and Connectivity](./Networking.md) + +--- + +|Design Methodology| +|--| +|[How to use the AlwaysOn Design Methodology](./README.md) +|[AlwaysOn Design Principles](./Principles.md) +|[AlwaysOn Design Areas](./Design-Areas.md) +|[Application Design](./App-Design.md) +|[Application Platform](./App-Platform.md) +|[Data Platform](./Data-Platform.md) +|[Health Modeling and Observability](./Health-Modeling.md) +|[Deployment and Testing](./Deployment-Testing.md) +|[Networking and Connectivity](./Networking.md) +|[Security](./Security.md) +|[Operational Procedures](./Operational-Procedures.md) + +--- + +[AlwaysOn | Documentation Inventory](/docs/README.md) diff --git a/docs/design-methodology/Design-Areas.md b/docs/design-methodology/Design-Areas.md new file mode 100644 index 00000000..701a8cb4 --- /dev/null +++ b/docs/design-methodology/Design-Areas.md @@ -0,0 +1,102 @@ +# Critical design areas + +The 8 design areas below represent the architecturally significant topics which must be discussed and designed for when defining a target AlwaysOn application architecture. In this regard, this section of the repository is intended to provide prescriptive and opinionated guidance to support readers in designing an AlwaysOn solution. + +- [Application Design](./App-Design.md) +- [Application Platform](./App-Platform.md) +- [Data Platform](./Data-Platform.md) +- [Health Modeling](./Health-Modeling.md) +- [Deployment and Testing](./Deployment-Testing.md) +- [Networking and Connectivity](./Networking.md) +- [Security](./Security.md) +- [Operational Procedures](./Operational-Procedures.md) + +These eight critical design areas will be explored at length within ensuing pages, for which critical review considerations and design recommendations are provided along with their broader design impact across other areas. Ultimately, the design areas are interrelated and decisions made within one area can impact or influence decisions across the entire design, so readers are encouraged to use the provided design guidance to navigate the key design decisions. 
+ +[![AlwaysOn Design Areas](/docs/media/alwayson-design-areas.png "AlwaysOn Design Areas")](./Design-Areas.md) + +## Reference Architecture + +An AlwaysOn application architecture is defined by the various design decisions required to ensure both functional and non-functional business requirements are fully satisfied. The target AlwaysOn architecture is therefore greatly influenced by the relevant business requirements, and as a result may vary between different application contexts. + +The image below represents a target technical state recommended for mission-critical applications on Azure. It leverages a reference set of business requirements to achieve an optimized architecture for different target reliability tiers. + +[![AlwaysOn Online Foundational Reference Architecture](/docs/media/alwayson-architecture-foundational-online.png "AlwaysOn Online Foundational Reference Architecture")](./Design-Areas.md) + +> The [foundational-online](https://github.com/Azure/AlwaysOn-Foundational-Online) and [foundational-connected](https://github.com/Azure/AlwaysOn-Foundational-Connected) reference implementations provide solution-oriented showcases for the AlwaysOn design methodology, demonstrating how this architecture pattern can be implemented alongside the operational wrappers required to maximize reliability and operational effectiveness. + +## Cross-Cutting Concerns + +There are several critical cross-cutting themes which traverse the 8 design areas and are contextualized below for subsequent consideration within each design area. + +### Scale limits + +Various [limits and quotas within the Azure platform](https://docs.microsoft.com/azure/azure-resource-manager/management/azure-subscription-service-limits) may have a significant bearing on large AlwaysOn application scenarios and must be appropriately considered by the target architecture. + +> Limits and quotas may change as Azure seeks to further enhance the platform and user experience. + +- Leverage subscriptions as scale units, scaling out resources and subscriptions as required +- Employ a scale unit approach for resource composition, deployment, and management +- Ensure scale limits are considered as part of capacity planning +- If available, use data gathered about existing application environments to explore which limits might be encountered + +### Automation + +Maximize reliability and operability through the holistic automation of all deployment and management activities. + +- Automate CI/CD deployments for all application components +- Automate application management activities, such as patching and monitoring +- Use declarative management semantics over imperative +- Prioritize templating over scripting; only use scripting when it is not possible to use templates + +### Azure roadmap alignment and regional service availability + +Align the target architecture with the Azure platform roadmap to inform the application trajectory, and ensure that required services and features are available within the chosen deployment regions.
+ +- Align with Azure engineering roadmaps and regional rollout plans +- Unblock with preview services or by taking dependencies on the Azure platform roadmap +- Only take a dependency on committed services and features; validate roadmap dependencies with Microsoft engineering product groups + +### Azure Landing Zone integration + +[Azure Landing Zones](https://github.com/azure/cloud-adoption-framework/ready/landing-zone/) provide prescriptive architectural guidance to define a reliable and scalable shared-service platform for enterprise Azure deployments with requisite centralized governance. + +AlwaysOn can integrate seamlessly within an Azure Landing Zone, and is deployable within both the *Online* and *Corp. Connected* Landing Zone formats, as demonstrated in the image below. + +[![AlwaysOn and Landing Zone Integration](/docs/media/alwayson-landing-zones.gif "AlwaysOn Landing Zone Integration")](./Design-Areas.md) + +Since Azure Landing Zones support different landing zone archetypes, it is crucial to identify which connectivity scenario an AlwaysOn application requires. + +- In the context of an *Online* Landing Zone archetype, AlwaysOn operates as a completely independent solution, without any direct corporate network connectivity to the rest of the Enterprise-Scale architecture. The application will, however, be further safeguarded through the [*policy-driven management*](https://github.com/Azure/Enterprise-Scale/wiki/How-Enterprise-Scale-Works#enterprise-scale-design-principles) approach which is foundational to Enterprise-Scale, and will automatically integrate with centralized platform logging through policy. +  - An *Online* deployment can only consider a public AlwaysOn application deployment since there is no private corporate connectivity provided. + +- When deployed in a *Corp. Connected* Landing Zone context, the AlwaysOn application takes a dependency on the Enterprise-Scale platform to provide connectivity resources which allow for integration with other applications and shared services existing on the platform. This necessitates some transformation on top of the *Online* integration approach, since some foundational resources are expected to exist up-front as part of the shared-service platform. More specifically, the AlwaysOn regional deployment stamp should no longer encompass an ephemeral Virtual Network or Azure Private DNS Zone since these will exist within the Enterprise-Scale *connectivity* subscription. +  - A *Corp. Connected* deployment can consider either a public or private AlwaysOn application deployment. + +> The AlwaysOn reference implementations are fully aligned with the Azure Landing Zones architectural approach and are immediately deployable within an *Online* or *Connected* Landing Zone subscription.
+ +--- + +|Previous Page|Next Page| +|--|--| +|[AlwaysOn Design Principles](./Principles.md)|[Application Design](./App-Design.md) + +--- + +|Design Methodology| +|--| +|[How to use the AlwaysOn Design Methodology](./README.md) +|[AlwaysOn Design Principles](./Principles.md) +|[AlwaysOn Design Areas](./Design-Areas.md) +|[Application Design](./App-Design.md) +|[Application Platform](./App-Platform.md) +|[Data Platform](./Data-Platform.md) +|[Health Modeling and Observability](./Health-Modeling.md) +|[Deployment and Testing](./Deployment-Testing.md) +|[Networking and Connectivity](./Networking.md) +|[Security](./Security.md) +|[Operational Procedures](./Operational-Procedures.md) + +--- + +[AlwaysOn | Documentation Inventory](/docs/README.md) diff --git a/docs/design-methodology/Health-Modeling.md b/docs/design-methodology/Health-Modeling.md new file mode 100644 index 00000000..b0835375 --- /dev/null +++ b/docs/design-methodology/Health-Modeling.md @@ -0,0 +1,454 @@ +# Health modeling and observability + +Health modeling and observability are essential concepts to maximize reliability, which focus on robust and contextualized instrumentation and monitoring to gain critical insight into application health, promoting the swift identification and resolution of issues. + +Most business-critical applications are significant in terms of both scale and complexity and therefore generate high volumes of operational data which makes it extremely challenging to evaluate and determine optimal operational action. Health modeling ultimately strives to maximize observability by augmenting raw monitoring logs and metrics with key business requirements to quantify application health, driving automated evaluation of health states to achieve consistent and expedited operations. + +This design area will therefore focus on the process to define a robust health model, mapping quantified application health states through observability and operational constructs to achieve operational maturity. + +- [Layered Application Health](#layered-application-health) +- [Unified Data Sink for Correlated Analysis](#unified-data-sink-for-correlated-analysis) +- [Dashboarding](#dashboarding) +- [Automated Incident Response](#automated-incident-response) +- [Predictive Action and AIOps](#predictive-action-and-aiops) + +> There are ultimately three overarching levels of operational maturity which should be used as a reference when striving to maximize reliability. +> 1) *Detect* and respond to issues as they happen. +> 1) *Diagnose* issues that are occurring or have already occurred. +> 1) *Predict* and prevent issues before they take place. + +## Layered Application Health + +In order to build a health model it is first necessary to define what application health means in the context of key business requirements, quantifying ‘healthy’ and ‘unhealthy’ states in a layered and measurable format. More specifically, health definitions for each distinct application component should be captured in the context of a steady running state and aggregated according to application user flows in conjunction with key non-functional business requirements for performance and availability. The health states for each individual user flow can then subsequently be aggregated to form a meaningful representation of overall application health. Once established, these layered health definitions should be used to inform critical monitoring metrics across all system components and validate operational sub-system composition. 
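+
+To make the layered aggregation concrete, the following is a minimal sketch only: it assumes component-level health scores have already been calculated into a hypothetical `ComponentHealthScore` table or saved function (columns `UserFlow`, `ComponentName`, `HealthScore`), and uses hypothetical user flow names and business-criticality weights. A fuller, contextualized example is provided in the reference layered health model later in this section.
+
+```kql
+// Illustrative sketch only: ComponentHealthScore is a hypothetical table or saved function with
+// columns UserFlow, ComponentName and HealthScore (1 = healthy, 0.5 = degraded, 0 = unhealthy).
+ComponentHealthScore
+| summarize FlowHealthScore = min(HealthScore) by UserFlow // a flow is only as healthy as its weakest dependency
+| extend FlowWeight = case(
+    UserFlow == "Checkout", 3.0, // hypothetical business-criticality weights
+    UserFlow == "AddToCart", 2.0,
+    1.0)
+| summarize ApplicationHealthScore = sum(FlowHealthScore * FlowWeight) / sum(FlowWeight)
+```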
+ +> When defining what 'unhealthy' states represent for all levels of the application, it is important to distinguish between transient and non-transient failure states to qualify service degradation relative to unavailability. + +### Design Considerations + +- The process of modeling health is a top-down design activity that starts with an architectural exercise to define all user flows and map dependencies between functional/logical components, thereby implicitly mapping dependencies between Azure resources. + +- A health model is entirely dependent on the context of the solution it represents, and therefore cannot be solved 'out-of-the-box' since 'one size does not fit all'. +  - Applications will differ in composition and dependencies. +  - Metrics and metric thresholds for resources must also be finely tuned in terms of what values represent healthy and unhealthy states, which is heavily influenced by encompassed application functionality and target non-functional requirements. + +- A layered health model enables application health to be traced back to lower-level dependencies, which helps to quickly root cause service degradation. + +- To capture health states for an individual component, that component's distinct operational characteristics must be understood under a steady state that is reflective of production load. Performance testing is therefore a requisite capability to define and continually evaluate application health. + +- Failures within a cloud solution may not happen in isolation. An outage in a single component may lead to several capabilities or additional components becoming unavailable. +  - Such errors may not be immediately observable. + +### Design Recommendations + +- Define a measurable health model as a priority to ensure a clear operational understanding of the entire application. +  - The health model should be layered and reflective of the application structure. +  - The foundational layer should consider individual application components (i.e. Azure resources). +  - Foundational components should be aggregated alongside key non-functional requirements to build a business-contextualized lens into the health of system flows. +  - System flows should be aggregated with appropriate weights based on business criticality to build a meaningful definition of overall application health. +  - Financially significant or customer-facing user flows should be prioritized. +  - Each layer of the health model should capture what ‘healthy’ and ‘unhealthy’ states represent. +  - Ensure the health model can distinguish between transient and non-transient unhealthy states to isolate service degradation from unavailability. + +- Represent health states using a granular health score for every distinct application component and every user flow by aggregating health scores for mapped dependent components, considering key non-functional requirements as coefficients. +  - The health score for a user flow should be represented by the lowest score across all mapped components, factoring in relative attainment against non-functional requirements for the user flow. +  - The model used to calculate health scores must consistently reflect operating health, and if this is not the case, the model should be adjusted and redeployed to reflect new learnings. +  - Define health score thresholds to reflect health status.
+ +- The health score must be calculated automatically based on underlying metrics, which can be visualized through observability patterns and acted on through automated operational procedures. +  - The health score should become core to the monitoring solution, so that operating teams no longer have to interpret and map operational data to application health. + +- Leverage the health model to calculate availability SLO attainment instead of raw availability, ensuring the demarcation between service degradation and unavailability is reflected as separate SLOs. + +- Leverage the health model within CI/CD pipelines and test cycles to validate application health is maintained after code and configuration updates. +  - The health model should be used to observe and validate health during load testing and chaos testing as part of CI/CD processes. + +- Building and maintaining a health model is an iterative process and engineering investment should be aligned to drive continuous improvements. +  - Define a process to continually evaluate and fine-tune the accuracy of the model, and consider investing in machine learning models to further train the model. + +### Reference layered health model + +> Please note that this section provides a simplified representation of a layered application health model to assist readers with the underlying concept. For a more comprehensive and contextualized health model reference please refer to the [foundational-online](https://github.com/azure/alwayson-foundational-online) and [foundational-connected](https://github.com/azure/alwayson-foundational-connected) reference implementations. + +When implementing a health model it is critical to first define the health of individual components through the aggregation and interpretation of key resource-level metrics. An example of how resource metrics can be used is shown in the image below: + +[![AlwaysOn Example Health Definitions](/docs/media/alwayson-example-health-definitions.png "AlwaysOn Example Health Definitions")](./Health-Modeling.md) + +This definition of health can subsequently be represented by a KQL query, as demonstrated by the example AKS query below that aggregates InsightsMetrics (AKS Container insights) and AzureMetrics (Azure diagnostics) and compares (inner join) against modelled health thresholds.
+ +``` kql +// ClusterHealthStatus +let Thresholds=datatable(MetricName: string, YellowThreshold: double, RedThreshold: double) [ + // Disk Usage: + "used_percent", 50, 80, + // Network errors in: + "err_in", 0, 0, + // Network errors out: + "err_out", 0, 0, + // Average node cpu usage %: + "node_cpu_usage_percentage", 60, 90, + // Average node disk usage %: + "node_disk_usage_percentage", 60, 80, + // Average node memory usage %: + "node_memory_rss_percentage", 60, 80 + ]; +InsightsMetrics +| summarize arg_max(TimeGenerated, *) by Computer, Name +| project TimeGenerated,Computer, Namespace, MetricName = Name, Value=Val +| extend NodeName = extract("([a-z0-9-]*)(-)([a-z0-9]*)$", 3, Computer) +| union ( + AzureMetrics + | extend ResourceType = extract("(PROVIDERS/MICROSOFT.)([A-Z]*/[A-Z]*)", 2, ResourceId) + | where ResourceType == "CONTAINERSERVICE/MANAGEDCLUSTERS" + | summarize arg_max(TimeGenerated, *) by MetricName + | project TimeGenerated, MetricName, Namespace = "AzureMetrics", Value=Average + ) +| lookup kind=inner Thresholds on MetricName +| extend IsYellow = iff(Value > YellowThreshold and Value < RedThreshold, 1, 0) +| extend IsRed = iff(Value > RedThreshold, 1, 0) +| project NodeName, MetricName, Value, YellowThreshold, IsYellow, RedThreshold, IsRed +``` + +The resulting table output can subsequently be transformed into a health score for easier aggregation at higher levels of the health model. + +```kql +// ClusterHealthScore +ClusterHealthStatus +| summarize YellowScore = max(IsYellow), RedScore = max(IsRed) +| extend HealthScore = 1-(YellowScore*0.25)-(RedScore*0.5) +``` + +These aggregated scores can subsequently be represented as a dependency chart using visualization tools like Grafana to illustrate the health model. The image below depicts an example layered health model from the [foundational-online](https://github.com/azure/alwayson-foundational-online) reference implementation, and demonstrates how a change in health state for a foundational component can have a cascading impact on user flows and overall application health (the example values correspond to the table in the previous image). + +[![AlwaysOn Example Health Model Visualization](/docs/media/alwayson-example-fault-states.png "AlwaysOn Example Health Model Visualization")](./Health-Modeling.md) + +## Unified data sink for correlated analysis + +Numerous operational datasets must be gathered from all system components to accurately represent a defined health model, considering logs and metrics from both application components and underlying Azure resources. This vast amount of data ultimately needs to be stored in a format that allows for near-real time interpretation to facilitate swift operational action. Moreover, correlation across all encompassed data sets is required to ensure effective analysis is unbounded, allowing for the layered representation of health. + +A unified data sink is therefore required to ensure all operational data is swiftly stored and made available for correlated analysis to build a 'single pane' representation of application health. Azure provides several different operational technologies under the umbrella of [Azure Monitor](https://docs.microsoft.com/azure/azure-monitor/overview#overview), and Azure Monitor Log Analytics serves as the core Azure-native data sink to store and analyze operational data.
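+
+As a simple illustration of the correlated analysis a unified workspace enables, the hedged sketch below joins application request failures (from workspace-based Application Insights) with AKS node CPU metrics over five-minute windows; the time range and metric name are assumptions that would need to be aligned to the specific workload.
+
+```kql
+// Illustrative sketch only: correlates application request failures (AppRequests, workspace-based
+// Application Insights) with AKS node CPU (AzureMetrics) to support root-cause analysis.
+AppRequests
+| where TimeGenerated > ago(1h)
+| summarize FailedRequests = countif(Success == false) by bin(TimeGenerated, 5m)
+| join kind=inner (
+    AzureMetrics
+    | where TimeGenerated > ago(1h) and MetricName == "node_cpu_usage_percentage"
+    | summarize AvgNodeCpuPercent = avg(Average) by bin(TimeGenerated, 5m)
+) on TimeGenerated
+| project TimeGenerated, FailedRequests, AvgNodeCpuPercent
+```
+
+The same pattern can be extended across workspaces with the `workspace()` function where cross-workspace correlation is required, as discussed in the design recommendations below.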
+ +[![AlwaysOn Health Data Collection](/docs/media/alwayson-health-data-collection.png "AlwaysOn Health Data Collection")](./Health-Modeling.md) + +### Design considerations + +**Azure Monitor** + +- Azure Monitor is enabled by default for all Azure subscriptions, but Azure Monitor for Logs (Log Analytics) and Azure Application Insights resources must be deployed and configured to incorporate data collection and querying capabilities. + +- Azure Monitor supports three types of observability data: Logs, Metrics, and Distributed Traces. +  - Logs are stored in Azure Monitor Logs (previously called Log Analytics) workspaces based on [Azure Data Explorer](https://docs.microsoft.com/azure/data-explorer/). +  - Log queries are stored in query packs that can be shared across subscriptions, and are used to drive observability components such as dashboards, workbooks, or other reporting and visualization tools. +  - Metrics are stored in an internal time-series diagnostic service database, which for most Azure resources is [retained](https://docs.microsoft.com/azure/azure-monitor/essentials/data-platform-metrics#retention-of-metrics) for 93 days. +  - Metric collection is configured through resource Diagnostic settings. + +- All Azure resources expose logs and metrics, but resources must be appropriately configured to route diagnostic data to your desired data sink. + +> Azure provides a variety of [Built-In Policies](https://docs.microsoft.com/azure/azure-monitor/policy-reference) which can be applied to ensure deployed resources are configured to send logs and metrics to an Azure Monitor instance. + +- It is not uncommon for regulatory controls to require that operational data remain within originating geographies or countries. + +- Regulatory requirements may stipulate the retention of critical data types for an extended period of time. +  - For example, in regulated banking, audit data must be retained for at least 7 years. + +- Different operational data types may require different retention periods. +  - For example, security logs may need to be retained for a long period, while performance data is unlikely to require long-term retention outside the context of AIOps. + +- Data can be [exported](https://docs.microsoft.com/azure/azure-monitor/logs/logs-data-export?tabs=portal) from Log Analytics Workspaces for long-term retention and/or auditing purposes. + +- [Azure Monitor Logs Dedicated Clusters](https://docs.microsoft.com/azure/azure-monitor/logs/logs-dedicated-clusters) provides a deployment option which enables Availability Zones for protection from zonal failures in supported Azure regions. +  - Dedicated Clusters require a minimum daily data ingest commitment. + +- Azure Monitor for Logs resources, including underlying log and metrics storage, are deployed into a specified Azure region. + +- To protect against loss of data from unavailability of an Azure Monitor for Logs workspace, resources can be configured with multiple Diagnostics configurations. +  - Each Diagnostic configuration can target metrics and logs at a separate Azure Monitor for Logs workspace. +  - Each additional Azure Monitor for Logs workspace will incur extra costs. +  - The redundant Azure Monitor for Logs workspaces can be deployed into the same Azure region, or into separate Azure regions for additional regional redundancy. +  - Sending logs and metrics from an Azure resource to an Azure Monitor for Logs workspace in a different region will incur inter-region data egress costs.
+  - Some Azure resources require an Azure Monitor for Logs workspace within the same region as the resource itself. + +- Azure Monitor Logs workspace data [can be exported to Azure Storage or Azure Event Hubs on a continuous, scheduled, or one-time basis](https://docs.microsoft.com/azure/azure-monitor/logs/logs-data-export). +  - Data export allows for long-term data archiving and protects against possible operational data loss due to unavailability. +  - Available export destinations are Azure Storage or Azure Event Hub. +  - Azure Storage can be configured for different [redundancy levels](https://docs.microsoft.com/azure/storage/common/storage-redundancy) including zonal or regional. +  - Data export to Azure Storage stores the data within .json files. +  - Data export destinations must be within the same Azure region as the Azure Monitor Logs workspace. +  - An Event Hub data export destination must be within the same region as the Azure Monitor Logs workspace. +  - Azure Event Hubs geo-disaster recovery is not applicable for this scenario. +  - There are several [data export limitations](https://docs.microsoft.com/azure/azure-monitor/logs/logs-data-export?tabs=portal#limitations). + +> Only specific Azure Monitor Logs [tables are supported](https://docs.microsoft.com/azure/azure-monitor/logs/logs-data-export#supported-tables) for data export. + +- Azure Monitor Logs has [user query throttling limits](https://docs.microsoft.com/azure/azure-resource-manager/management/azure-subscription-service-limits#user-query-throttling) which may appear as reduced availability to clients, such as observability dashboards. +  - Five concurrent queries per user: if five queries are already running, additional queries are placed in a per-user concurrency queue until a running query ends. +  - Time in concurrency queue: if a query sits in the concurrency queue for over three minutes, it will be terminated and a 429 error code returned. +  - Concurrency queue depth limit: the concurrency queue is limited to 200 queries, and additional queries will be rejected with a 429 error code. +  - Query rate limit: there is a per-user limit of 200 queries per 30 seconds across all workspaces. + +- [Query Packs](https://docs.microsoft.com/azure/azure-monitor/logs/query-packs) are Azure Resource Manager resources which can be used to protect and recover Azure Monitor Logs queries in the event of Azure Monitor Logs workspace unavailability. +  - Query Packs contain queries as JSON and can be stored external to Azure similar to other infrastructure-as-code assets. +  - Deployable through the Microsoft.Insights REST API. +  - If an Azure Monitor for Logs workspace must be re-created, the Query Pack can be re-deployed from an externally stored definition. + +- Application Insights can be deployed in a workspace-based deployment model, underpinned by a Log Analytics Workspace where all the data is stored. + +- Sampling can be enabled within Application Insights to reduce the amount of telemetry sent and optimize data ingest costs. + +- Log Analytics and Application Insights [charge based on the volume of data ingested and the duration that data is retained for](https://azure.microsoft.com/pricing/details/monitor/). +  - Data ingested into a Log Analytics Workspace can be retained at no additional charge up to the first 31 days (90 days if Sentinel is enabled). +  - Data ingested into a workspace-based Application Insights resource is retained for the first 90 days at no extra charge.
+ +- The Log Analytics Commitment Tier pricing model provides a predictable approach to data ingest charges. +  - Any usage above the reservation level is billed at the same price as the current tier. + +- Azure Monitor Log Analytics, Application Insights, and Azure Data Explorer use the Kusto Query Language (KQL). + +- Log Analytics queries are saved as *functions* within Log Analytics (`savedSearches`). + +### Design recommendations + +- Use Azure Monitor for Logs (Log Analytics) as a unified data sink to provide a 'single pane' across all operational data sets. +  - Decentralize Log Analytics Workspaces across all leveraged deployment regions. Each Azure region with an application deployment should consider a Log Analytics Workspace to gather all operational data originating from that region. All global resources should leverage a separate dedicated Log Analytics Workspace which should be deployed within a primary deployment region. +  - Sending all operational data to a single Log Analytics Workspace would create a single point of failure. +  - Requirements for data residency might prohibit data leaving the originating region, and federated workspaces solve for this requirement by default. +  - There is a substantial egress cost associated with transferring logs and metrics across regions. +  - All deployment stamps within the same region can leverage the same regional Log Analytics Workspace. + +- Consider configuring resources with multiple diagnostic configurations pointing to different Azure Monitor for Logs workspaces to protect against Azure Monitor unavailability for applications with fewer regional deployment stamps. + +- Use Application Insights as a consistent Application Performance Monitoring (APM) tool across all application components to collect application logs, metrics, and traces. +  - Deploy Application Insights in a workspace-based configuration to ensure each regional Log Analytics Workspace contains logs and metrics from both application components and underlying Azure resources. + +- Leverage [Cross-Workspace queries](https://docs.microsoft.com/azure/azure-monitor/logs/cross-workspace-query) to maintain a unified 'single pane' across the different workspaces. + +- Leverage [Query Packs](https://docs.microsoft.com/azure/azure-monitor/logs/query-packs) to protect Azure Monitor Logs queries in the event of workspace unavailability. +  - Store query packs within the application git repository as infrastructure-as-code assets. + +- All Log Analytics Workspaces should be treated as long-running resources with a different life-cycle to application resources within a regional deployment stamp. + +- Export critical operational data from Log Analytics for long-term retention and analytics to facilitate AIOps and advanced analytics to refine the underlying health model and inform predictive action. + +- Carefully evaluate which data store should be used for long-term retention; not all data has to be stored in a hot and queryable data store. +  - It is strongly recommended to use Azure Storage in a GRS configuration for long-term operational data storage. +  - Use the Log Analytics Export capability to export all available data sources to Azure Storage. + +- Select appropriate retention periods for operational data types within Log Analytics, configuring longer retention periods within the workspace where 'hot' observability requirements exist. + +- Use Azure Policy to ensure all regional resources route operational data to the correct Log Analytics Workspace.
+ +> In an Enterprise Scale environment, if there is a requirement for centralized storage of operational data, either a) [fork](https://docs.microsoft.com/azure/azure-monitor/logs/logs-data-export?tabs=portal) data at instantiation so it is ingested into both centralized tooling and Log Analytics Workspaces dedicated to the application, or b) expose access to application Log Analytics workspaces so that central teams can query application data. It is ultimately critical that operational data originating from the solution is available within Log Analytics Workspaces dedicated to the application. + +> If SIEM integration is required, do not send raw log entries, but instead send critical alerts. + +- Only configure sampling within Application Insights if it is required to optimize performance, or if the volume of telemetry collected without sampling becomes cost prohibitive. +  - Excessive sampling can lead to missed or inaccurate operational signals. + +- Use correlation IDs for all trace events and log messages to tie them to a given request. +  - Return correlation IDs to the caller for all calls, not just failed requests. + +- Ensure application code incorporates proper instrumentation and logging to inform the health model and facilitate subsequent troubleshooting or root cause analysis when required. +  - Application code should leverage Application Insights to facilitate [Distributed Tracing](https://docs.microsoft.com/dotnet/core/diagnostics/distributed-tracing-concepts), by providing the caller with a comprehensive error message that includes a correlation ID when a failure occurs. + +- Use [structured logging](https://stackify.com/what-is-structured-logging-and-why-developers-need-it/) for all log messages. + +- Add meaningful health probes to all application components. +  - When using AKS, configure the health endpoints for each deployment (pod) so that Kubernetes can correctly determine when a pod is healthy or unhealthy. +  - When using Azure App Service, configure the [Health Checks](https://docs.microsoft.com/azure/app-service/monitor-instances-health-check) so that scale-out operations will not cause errors by sending traffic to instances which are not yet ready, and making sure unhealthy instances are recycled quickly. + +> If the application is subscribed to Microsoft Mission-Critical Support, consider exposing key health probes to Microsoft Support, so application health can be modelled more accurately by Microsoft Support. + +- Log successful health check requests, unless increased data volumes cannot be tolerated in the context of application performance, since they provide additional insights for analytical modelling. + +- Do not configure production Log Analytics Workspaces to apply a daily cap, which limits the daily ingestion of operational data, since this can lead to the loss of critical operational data. +  - In lower environments, such as Development and Test, it can be considered as an optional cost saving mechanism. + +- Provided operational data ingest volumes meet the minimum tier threshold, configure Log Analytics Workspaces to use Commitment Tier based pricing to drive cost efficiencies relative to the 'pay-as-you-go' pricing model. + +- It is strongly recommended to store Log Analytics queries using source control and use CI/CD automation to deploy them to relevant Log Analytics instances. + +## Dashboarding + +Visually representing the health model alongside critical operational data is essential to achieve effective operations and maximize reliability.
Dashboards should ultimately be utilized to provide near-real time insights into application health for DevOps teams, facilitating the swift diagnosis of deviations from steady state. + +Microsoft provides several data visualization technologies, including Azure Dashboards, Power BI, and Azure Managed Grafana (currently in preview). Azure Dashboards is positioned to provide a tightly integrated out-of-the-box visualization solution for operational data within Azure Monitor. It therefore has a fundamental role to play in the visual representation of operational data and application health for an AlwaysOn solution. However, there are several limitations in terms of the positioning of Azure Dashboards as a holistic observability platform, and as a result consideration should be given to the supplemental use of market-leading observability solutions, such as Grafana, which is also provided as a managed solution within Azure. + +This section will therefore focus on the use of Azure Dashboards and Grafana to build a robust dashboarding experience capable of providing technical and business lenses into application health, enabling DevOps teams to operate effectively. + +>Robust dashboarding is essential to diagnose issues that have already occurred, and support operational teams in detecting and responding to issues as they happen. + +### Design considerations + +- When visualizing the health model using Log Analytics queries, note that there are [Log Analytics limits on concurrent and queued queries, as well as the overall query rate](https://docs.microsoft.com/azure/azure-monitor/service-limits#user-query-throttling), with subsequent queries queued and throttled. + +- Queries to retrieve operational data used to calculate and represent health scores can be written and executed in either Azure Monitor Log Analytics or Azure Data Explorer. +  - Sample queries are available [here](https://docs.microsoft.com/azure/azure-monitor/logs/examples). + +- Log Analytics imposes several [query limits](https://docs.microsoft.com/azure/azure-resource-manager/management/azure-subscription-service-limits#action-groups) which must be considered when designing operational dashboards. + +- The visualization of raw resource metrics, such as CPU utilization or network throughput, requires manual evaluation by operations teams to determine health status impacts, and this can be challenging during an active incident. + +- If multiple users leverage dashboards within a tool like Grafana, the number of queries sent to Log Analytics multiplies quickly. +  - Reaching the concurrent query limit on Log Analytics will queue subsequent queries, making the dashboard experience feel 'slow'. + +### Design recommendations + +- Collect and present queried outputs from all regional Log Analytics Workspaces and the global Log Analytics Workspace to build a unified view of application health. + +> When deploying into an Enterprise-Scale architecture, consideration should be given to also query the [central platform Log Analytics Workspace](https://docs.microsoft.com/azure/cloud-adoption-framework/ready/enterprise-scale/management-and-monitoring#plan-platform-management-and-monitoring) if key dependencies on platform resources exist, such as Express Route for scenarios involving on-premises communication. + +- A ‘traffic light’ model should be used to visually represent 'healthy' and 'unhealthy' states, with green used to illustrate when key non-functional requirements are fully satisfied and resources are optimally utilized.
+  - Use "Green", "Amber", and "Red" to represent "Healthy", "Degraded", and "Unavailable" states. + +- Leverage Azure Dashboards to create operational lenses for global resources and regional deployment stamps, representing key metrics such as request count for Azure Front Door, server-side latency for Cosmos DB, incoming/outgoing messages for Event Hub, and CPU utilization or deployment statuses for AKS. +  - Dashboards should be tailored to drive operational effectiveness, infusing learnings from failure scenarios to ensure DevOps teams have direct visibility into key metrics. + +- If Azure Dashboards cannot be used to accurately represent the health model and requisite business requirements, then it is strongly recommended to consider Grafana as an alternative visualization solution, providing market-leading capabilities and an extensive open-source plugin ecosystem. +  - Evaluate the managed Grafana preview offering to avoid the operational complexities of managing Grafana infrastructure. + +- When deploying self-hosted Grafana, employ a highly-available and geo-distributed design to ensure critical operational dashboards can be resilient to regional platform failures and cascading error scenarios. +  - Separate configuration state into an external datastore, such as Azure Database for Postgres or MySQL, to ensure Grafana application nodes remain stateless. +  - Configure database replication across deployment regions. +  - Deploy Grafana nodes to App Services in a highly-available configuration across Availability Zones within a region, using container-based deployments. +  - Deploy App Service instances across considered deployment regions. + +  > App Services provides a low-friction container platform which is ideal for low-scale scenarios such as operational dashboards, and isolating Grafana from AKS provides a clear separation of concern between the primary application platform and operational representations for that platform. Please refer to the Application Platform design area for further configuration recommendations. + +  - Use Azure Storage in a GRS configuration to host and manage custom visuals and plugins. +  - Deploy app service and database read-replica Grafana components to a minimum of 2 deployment regions, and consider employing a model where Grafana is deployed to all considered deployment regions. + +> For scenarios targeting a >= 99.99% SLO, Grafana should be deployed within a minimum of 3 deployment regions to maximize overall reliability for key operational dashboards. + +- Mitigate Log Analytics query limits by aggregating queries into a single or small number of queries, such as by using the KQL 'union' operator, and set an appropriate refresh rate on the dashboard. +  - An appropriate maximum refresh rate will depend on the number and complexity of dashboard queries; analysis of implemented queries is required. + +- If the concurrent query limit of Log Analytics is being reached, consider optimizing the retrieval pattern by (temporarily) storing the data required for the dashboard in a high-performance datastore such as Azure SQL. + +## Automated incident response + +While the visual representation of application health provides invaluable operational and business insights to support issue detection and diagnosis, it relies on the readiness and interpretations of operational teams, as well as the effectiveness of subsequent human-triggered responses.
Therefore, to maximize reliability it is necessary to implement extensive alerting to proactively detect and respond to issues in near real-time. + +[Azure Monitor](https://docs.microsoft.com/azure/azure-monitor/alerts/alerts-overview) provides an extensive alerting framework to detect, categorize, and respond to operational signals through [Action Groups](https://docs.microsoft.com/azure/azure-monitor/alerts/action-groups). This section will therefore focus on the use of Azure Monitor alerts to drive automated actions in response to current or potential deviations from a healthy application state. + +>Alerting and automated action is critical to effectively detect and swiftly respond to issues as they happen, before greater negative impact can occur. Alerting also provides a mechanism to interpret incoming signals and respond to prevent issues before they occur. + +### Design considerations + +- Alert rules are defined to fire when conditional criteria are satisfied for incoming signals, which can include a variety of [data sources](https://docs.microsoft.com/azure/azure-monitor/agents/data-sources), such as metrics, log search queries, or availability tests. + +- Alerts can be defined within Log Analytics or Azure Monitor on the specific resource. + +- Some metrics are only interrogatable within Azure Monitor, since not all diagnostic data points are made available within Log Analytics. + +- The Azure Monitor Alerts API can be leveraged to retrieve active and historic alerts. + +- There are subscription limits related to alerting and action groups which must be designed for: +  - [Limits](https://docs.microsoft.com/azure/azure-resource-manager/management/azure-subscription-service-limits#alerts) exist for the number of configurable alert rules. +  - The Alerts API has [throttling limits](https://docs.microsoft.com/azure/azure-resource-manager/management/azure-subscription-service-limits#alerts-api) which should be considered for extreme usage scenarios. +  - Action Groups have [several hard limits](https://docs.microsoft.com/azure/azure-resource-manager/management/azure-subscription-service-limits#action-groups) for the number of configurable responses which must be designed for. +  - Each response type has a limit of 10 actions, apart from email which has a limit of 1,000 actions. + +- Alerts can be integrated within a layered health model by creating an Alert Rule for a saved log search query from the model's 'root' scoring function. +  - For example, using 'WebsiteHealthScore' and alerting on a numeric value that represents an 'Unhealthy' state. + +### Design recommendations + +- For resource-centric alerting, create alert rules within Azure Monitor to ensure all diagnostic data is available for the alert rule criteria. + +- Consolidate automated actions within a minimal number of Action Groups, aligned with service teams to support a DevOps approach. + +- Respond to excessive resource utilization signals through automated scale operations, leveraging Azure-native auto-scale capabilities where possible. Where built-in auto-scale functionality is not applicable, use the component health score to model signals and determine when to respond with automated scale operations. +  - Ensure automated scale operations are defined according to a capacity model which quantifies scale relationships between components, so that scale responses encompass components which need to be scaled in relation to other components.
+ +- Model actions to accommodate a prioritized ordering which should be determined by business impact. + +- Leverage the Azure Monitor Alerts API to gather historic alerts to incorporate within 'cold' operational storage for advanced analytics. + +- For critical failure scenarios which cannot be met with an automated response, ensure operational 'runbook automation' is in place to drive swift and consistent action once manual interpretation and sign-off is provided. +  - Leverage alert notifications to drive swift identification of issues requiring manual interpretation. + +- Create allowances within engineering sprints to drive incremental improvements in alerting to ensure new failure scenarios which have not previously been considered can be fully accommodated within new automated actions. + +- Conduct operational readiness tests as part of CI/CD processes to validate key alert rules for deployment updates. + +## Predictive action and AIOps + +Machine learning models can be applied to correlate and prioritize operational data, helping to gather critical insights related to filtering excessive alert 'noise' and predicting issues before they cause impact, as well as accelerating incident response when they do. + +More specifically, an AIOps methodology can be applied to distill critical insights about the behavior of the system, users, and DevOps processes. These insights can include identifying a problem happening now (*detect*), quantifying why the problem is happening (*diagnose*), or signaling what will happen in the future (*predict*). Such insights can be used to drive actions which adjust and optimize the application to mitigate active or potential issues, leveraging key business metrics, system quality metrics, and DevOps productivity metrics, to prioritize according to business impact. Conducted actions can themselves be infused into the system through a feedback loop which further trains the underlying model to drive additional efficiencies. + +[![AlwaysOn AIOps Methodologies](/docs/media/alwayson-aiops-methodology.png "AlwaysOn AIOps Methodologies")](./Health-Modeling.md) + +There are multiple analytical technologies within Azure, such as Azure Synapse and Azure Databricks, which can be leveraged to build and train analytical models for AIOps. This section will therefore focus on how these technologies can be positioned within an AlwaysOn application design to accommodate AIOps and drive predictive action, focusing on Azure Synapse which reduces friction by bringing together the best of Azure's data services along with powerful new features. + +>AIOps is used to drive predictive action, interpreting and correlating complex operational signals observed over a sustained period in order to better respond to and prevent issues before they occur. + +### Design considerations + +- Azure Synapse Analytics offers multiple Machine Learning (ML) capabilities. +  - ML models can be trained and run on Synapse Spark Pools with libraries including MLLib, SparkML and MMLSpark, as well as popular open-source libraries, such as [Scikit Learn](https://scikit-learn.org/stable/). +  - ML models can be trained with common data science tools like PySpark/Python, Scala, or .NET. + +- Synapse Analytics is integrated with Azure ML through Azure Synapse Notebooks, which enables ML models to be trained in an Azure ML Workspace using [Automated ML](https://docs.microsoft.com/azure/machine-learning/concept-automated-ml).
+ +- Synapse Analytics also enables ML capabilities using [Azure Cognitive Services](https://docs.microsoft.com/azure/cognitive-services/what-are-cognitive-services) to solve general problems in various domains, such as [Anomaly Detection](https://docs.microsoft.com/azure/cognitive-services/anomaly-detector/). Cognitive Services can be used in Azure Synapse, Azure Databricks, and via SDKs and REST APIs in client applications. + +- Azure Synapse natively integrates with [Azure Data Factory](https://docs.microsoft.com/azure/data-factory/introduction) tools to extract, transform, and load (ETL) or ingest data within orchestration pipelines. + +- Azure Synapse enables external dataset registration to data stored in Azure Blob storage or Azure Data Lake Storage. +  - Registered datasets can be used in Synapse Spark pool data analytics tasks. + +- Azure Databricks can be integrated into Azure Synapse Analytics pipelines for additional Spark capabilities. +  - Synapse orchestrates reading data and sending it to a Databricks cluster, where it can be transformed and prepared for ML model training. + +- Source data typically needs to be prepared for analytics and ML. +  - Synapse offers various tools to assist with data preparation, including Apache Spark, Synapse Notebooks, and serverless SQL pools with T-SQL and built-in visualizations. + +- ML models that have been trained, operationalized, and deployed can be used for _batch_ scoring in Synapse. +  - AIOps scenarios, such as running regression or degradation predictions in CI/CD pipelines, may require _real-time_ scoring. + +- There are subscription limits for [Azure Synapse](https://docs.microsoft.com/azure/azure-resource-manager/management/azure-subscription-service-limits#azure-synapse-analytics-limits) which should be fully understood in the context of an AIOps methodology. + +- To fully incorporate AIOps it is necessary to feed near real-time observability data into real-time ML inference models on an ongoing basis. +  - Capabilities such as anomaly detection should be evaluated within the observability data stream. + +### Design recommendations + +- Ensure all Azure resources and application components are fully instrumented so that a complete operational dataset is available for AIOps model training. + +- Ingest Log Analytics operational data from the global and regional Azure Storage Accounts into Azure Synapse for analysis. + +- Use the Azure Monitor Alerts API to retrieve historic alerts and store them within cold operational data storage for subsequent use within ML models. If Log Analytics data export is used, store historic alerts data in the same Azure Storage accounts as the exported Log Analytics data. + +- After ingested data is prepared for ML training, write it back out to Azure Storage so that it is available for ML model training without requiring Synapse data preparation compute resources to be running. + +- Ensure ML model operationalization supports both batch and real-time scoring. + +- As AIOps models are created, implement MLOps and apply DevOps practices to [automate the ML lifecycle](https://docs.microsoft.com/azure/machine-learning/concept-model-management-and-deployment#automate-the-ml-lifecycle) for training, operationalization, scoring, and continuous improvement. +  - Create an iterative CI/CD process for AIOps ML models.
+ +- Evaluate [Azure Cognitive Services](https://docs.microsoft.com/azure/cognitive-services/what-are-cognitive-services) for specific predictive scenarios due to their low administrative and integration overhead. +  - Consider [Anomaly Detection](https://docs.microsoft.com/azure/cognitive-services/anomaly-detector/) to quickly flag unexpected variances in observability data streams. + +--- + +|Previous Page|Next Page| |:--|:--| |[Data Platform](./Data-Platform.md) |[Deployment and Testing](./Deployment-Testing.md) | + +--- + +|Design Methodology| |--| |[How to use the AlwaysOn Design Methodology](./README.md) |[AlwaysOn Design Principles](./Principles.md) |[AlwaysOn Design Areas](./Design-Areas.md) |[Application Design](./App-Design.md) |[Application Platform](./App-Platform.md) |[Data Platform](./Data-Platform.md) |[Health Modeling and Observability](./Health-Modeling.md) |[Deployment and Testing](./Deployment-Testing.md) |[Networking and Connectivity](./Networking.md) |[Security](./Security.md) |[Operational Procedures](./Operational-Procedures.md) + +--- + +[AlwaysOn | Documentation Inventory](/docs/README.md) diff --git a/docs/design-methodology/Networking.md b/docs/design-methodology/Networking.md new file mode 100644 index 00000000..c0250578 --- /dev/null +++ b/docs/design-methodology/Networking.md @@ -0,0 +1,552 @@ +# Networking and connectivity + +Networking is a fundamental design topic for an AlwaysOn application, particularly given the recommended globally distributed active-active design approach. + +This section will therefore explore various network topology topics at an application level, considering requisite connectivity and redundant traffic management. More specifically, it will highlight critical considerations and recommendations intended to inform the design of a secure and scalable global network topology for an AlwaysOn application. + +- [Global Traffic Routing](#global-traffic-routing) +- [Application Delivery Services](#application-delivery-services) +- [Virtual Network Integration](#virtual-network-integration) +- [Internet Egress](#internet-egress) +- [Inter-Zone and Inter-Region Connectivity](#inter-zone-and-inter-region-connectivity) +- [Micro-Segmentation and Kubernetes Network Policies](#micro-segmentation-and-kubernetes-network-policies) + +## Global traffic routing + +The use of multiple active regional deployment stamps necessitates a global routing service to distribute traffic to each active stamp. + +Within Azure, [Azure Front Door](https://azure.microsoft.com/services/frontdoor/), [Azure Traffic Manager](https://azure.microsoft.com/services/traffic-manager/), and [Azure Standard Load Balancer](https://docs.microsoft.com/azure/load-balancer/cross-region-overview) provide requisite routing capabilities to manage global traffic across a multi-region application. + +> There are also a variety of third-party global routing technologies, such as those offered by CDN providers, which can almost seamlessly be swapped in to replace or extend the use of Azure-native global routing services. + +This section will therefore explore key differences between Azure Front Door, Azure Traffic Manager, and Azure Standard Load Balancer to define how each can be leveraged to optimize different scenarios. + +### Design considerations + +- A routing service bound to a single region represents a single-point-of-failure and a significant risk with regards to regional outages.
+ +- If the application workload scenario encompasses client control, such as with mobile or desktop client applications, it is possible to provide service redundancy within client routing logic. +  - Multiple global routing technologies, such as Azure Front Door and Azure Traffic Manager, can be considered in parallel for redundancy, with clients configured to fail over to an alternative technology when certain failure conditions are met. +  - The introduction of multiple global routing services introduces significant complexities around edge caching and Web Application Firewall capabilities, as well as certificate management for SSL offload and application validation for ingress paths. +  - Third-party technologies can also be considered, providing global routing resiliency to all levels of Azure platform failures. + +- Capability disparity between Azure Front Door and Traffic Manager means that if the two technologies are positioned alongside one another for redundancy, a different ingress path or design changes would be required to ensure a consistent and acceptable level of service is maintained. + +- Azure Front Door and Azure Traffic Manager are globally distributed services with built-in multi-region redundancy and availability. +  - Hypothetical failure scenarios of a scale large enough to threaten the global availability of these resilient routing services present a broader risk to the application in terms of cascading and correlated failures. +  - Failure scenarios of this scale are only feasibly caused by shared foundational services, such as Azure DNS or Azure AD, which serve as global platform dependencies for almost all Azure services. +  - If a redundant Azure technology is applied it is likely that the secondary service will also be experiencing unavailability or a degraded service. +  - Global routing service failure scenarios are highly likely to significantly impact many other services used for key application components through interservice dependencies. +  - Even if a third-party technology is used, the application will likely be in an unhealthy state due to the broader impact of the underlying issue, meaning that routing to application endpoints on Azure will provide little value anyway. + +- Global routing service redundancy provides mitigation for an extremely small number of hypothetical failure scenarios, where the impact of a global outage is constrained to the routing service itself. +  - To provide broader redundancy to global outage scenarios, a multi-cloud active-active deployment approach can be considered. +  - A multi-cloud active-active deployment approach introduces significant operational complexities which pose significant resiliency risks, likely far outweighing the hypothetical risks of a global outage. + +- For scenarios where client control is not possible, a dependency must be taken on a single global routing service to provide a unified entry point for all active deployment regions. +  - When used in isolation, this service represents a single-point-of-failure at a service level due to global dependencies, even though built-in multi-region redundancy and availability is provided. +  - The SLA provided by the selected global routing service represents the maximum attainable composite SLA, regardless of how many deployment regions are considered. + +- When client control is not possible, operational mitigations can be considered to define a process for migrating to a secondary global routing service in the event that a global outage disables the primary service.
+  - Migrating from one global routing service to another is typically a lengthy process lasting several hours, particularly where DNS propagation is considered.
+
+- Some third-party global routing services provide a 100% SLA. However, the historic and attainable SLA provided by these services is typically lower than 100%.
+  - While these services provide financial reparations for unavailability, this is of little significance when the impact of unavailability is significant, such as with safety-critical scenarios where human life is ultimately at stake.
+  - Technology redundancy or sufficient operational mitigations should therefore still be considered even when the advertised legal SLA is 100%.
+
+**Azure Front Door**
+
+- Azure Front Door provides global HTTP/S load balancing and optimized connectivity using the Anycast protocol with split TCP to take advantage of the Microsoft global backbone network.
+  - A number of connections are maintained for each of the backend endpoints.
+  - Incoming client requests are first terminated at the edge node closest to the originating client.
+  - After any required traffic inspection, requests are either forwarded over the Microsoft backbone to the appropriate backend using existing connections, or served from the internal cache of an edge node.
+  - This approach is very efficient in spreading high traffic volumes over the backend connections.
+
+- Provides a built-in cache that serves static content from edge nodes.
+  - In many use cases this can also eliminate the need for a dedicated Content Delivery Network (CDN).
+
+- Azure Web Application Firewall (WAF) can be used on Azure Front Door, and since it is deployed to Azure network edge locations around the globe, every incoming request delivered by Front Door is inspected at the network edge.
+
+- Azure Front Door protects application endpoints against DDoS attacks using [Azure DDoS protection Basic](https://docs.microsoft.com/azure/frontdoor/front-door-ddos#integration-with-azure-ddos-protection-basic).
+  - Azure DDoS Standard provides additional and more advanced protection and detection capabilities and can be added as an additional layer to Azure Front Door.
+
+- Azure Front Door offers a fully managed certificate service.
+  - Enables TLS connection security for endpoints without having to manage the certificate lifecycle.
+
+- Azure Front Door Premium supports private endpoints, enabling traffic to flow from the internet directly onto Azure virtual networks.
+  - This eliminates the need to use public IPs on the VNet to make the backends accessible via Azure Front Door Premium.
+
+- Azure Front Door relies on health probes and backend health endpoints (URLs) which are called on an interval basis to return an HTTP status code reflecting whether the backend is operating normally, with an HTTP 200 (OK) response reflecting a healthy status.
+  - As soon as a backend reflects an unhealthy status, from the perspective of a certain edge node, that edge node will stop sending requests there. Unhealthy backends are therefore transparently removed from traffic circulation without any delay.
+
+- Supports HTTP/S protocols only.
+
+- The Azure Front Door WAF and Application Gateway WAF provide slightly different feature sets, though both support built-in and custom rules and can be set to operate in either detection mode or prevention mode.
+
+- The Front Door backend IP space may change, but Microsoft will ensure integration with [Azure IP Ranges and Service Tags](https://www.microsoft.com/download/details.aspx?id=56519).
+  - It is possible to subscribe to Azure IP Ranges and Service Tags to receive notifications about any changes or updates.
+
+- Azure Front Door supports various [load distribution configurations](https://docs.microsoft.com/azure/frontdoor/front-door-routing-methods):
+  - Latency-based: the default setting, which routes traffic to the "closest" backend from the client based on request latency.
+  - Priority-based: useful for active-passive setups, where traffic must always be sent to a primary backend unless it is not available.
+  - Weighted: applicable for canary deployments in which a certain percentage of traffic is sent to a specific backend.
+    - If multiple backends have the same weights assigned, latency-based routing is used.
+
+- By default Azure Front Door uses latency-based routing, which can lead to situations where some backends get a lot more incoming traffic than others, depending on where clients originate from.
+
+- If a series of client requests must be handled by the same backend, [Session Affinity](https://docs.microsoft.com/azure/frontdoor/front-door-routing-methods#session-affinity) can be configured on the frontend.
+  - It uses a client-side cookie to send subsequent requests to the same backend as the first request, provided the backend is still available.
+
+**Azure Traffic Manager**
+
+- Azure Traffic Manager is a DNS redirection service.
+  - The actual request payload is not processed, but instead Traffic Manager returns the DNS name of one of the backends in the pool, based on configured rules for the selected traffic routing method.
+  - The backend DNS name is then resolved to its final IP address which is subsequently directly called by the client.
+
+- The DNS response is cached and re-used by the client for a specified Time-To-Live (TTL) period, and requests made during this period will go directly to the backend endpoint without Traffic Manager interaction.
+  - This eliminates the extra connectivity step, which provides cost benefits compared to Front Door.
+
+- Since the request is made directly from the client to the backend service, any protocol supported by the backend can be leveraged.
+
+- Similar to Azure Front Door, Azure Traffic Manager also relies on health probes to understand if a backend is healthy and operating normally. If a response other than the expected healthy status code is returned, or no response is received at all, the routing service recognizes ongoing issues and will stop routing requests to that specific backend.
+  - However, unlike with Azure Front Door this removal of unhealthy backends is not instantaneous, since clients will continue to create connections to the unhealthy backend until the DNS TTL expires and a new backend endpoint is requested from the Traffic Manager service.
+  - In addition, even when the TTL expires, there is no guarantee that public DNS servers will honor this value, so DNS propagation can actually take much longer to occur.
+  - This means that traffic may continue to be sent to the unhealthy endpoint for a sustained period of time.
+
+**Azure Standard Load Balancer**
+
+> As of December 2021, the Cross-Region Standard Load Balancer is available in preview with [technical limitations](https://docs.microsoft.com/azure/load-balancer/cross-region-overview#limitations) that prevent consideration in an AlwaysOn context.
+
+> - Integration with AKS is not available: loss of connectivity will occur when deploying a cross-region load balancer with an AKS cluster as a backend.
+> - Frontend IP configurations have to be public: internal frontend endpoints are not supported.
+
+- Azure Standard Load Balancer provides a cross-region load distribution capability, which enables geo-redundant high-availability scenarios.
+  - Offers instant global failover, load distribution across regions to the closest Azure region, and the ability to scale up/down behind a single endpoint.
+
+- Cross-region load balancer is configured with a global frontend public IP address and uses health probes to monitor availability of regional load balancers.
+  - If there is a regional failure, the affected load balancer is taken out of rotation until it becomes available again.
+
+### Design recommendations
+
+- Use Azure Front Door as the primary global traffic routing service for HTTP/S scenarios.
+  - Azure Front Door is strongly advocated for HTTP/S workloads as it provides optimized traffic routing, transparent failover, private backend endpoints (with the Premium SKU), edge caching and integration with Web Application Firewall (WAF).
+
+- For application scenarios where client control is possible, apply client-side routing logic to handle failover scenarios where the primary global routing technology fails (a minimal sketch of this logic is provided after the diagram below).
+  - Two or more global routing technologies should be positioned in parallel for added redundancy, if a single service SLA isn't sufficient. Client logic is required to route to the redundant technology in the event of a global service failure.
+  - Two distinct URLs should be used, with one applied to each of the different global routing services to simplify the overall certificate management experience and routing logic in the event of a failover.
+  - Prioritize the use of third-party routing technologies as the secondary failover service, since this will mitigate the largest number of global failure scenarios and the capabilities offered by industry-leading CDN providers will allow for a consistent design approach.
+  - Consideration should also be given to directly routing to a single regional stamp rather than a separate routing service. While this will result in a degraded level of service, it represents a far simpler design approach.
+
+The following diagram illustrates a redundant global load balancer configuration with client failover, using Azure Front Door as the primary global load balancer.
+
+[![AlwaysOn Global Load Balancer Configuration](/docs/media/alwayson-global-routing.gif "AlwaysOn Global Load Balancer Configuration")](./Networking.md)
+
+> To truly mitigate the risk of global failures within the Azure platform, a multi-cloud active-active deployment approach should be considered, with active deployment stamps hosted across two or more cloud providers and redundant third-party routing technologies used for global routing.
+> However, it is strongly recommended not to apply a multi-cloud approach since it introduces significant operational complexity, with different deployment stamp definitions and representations of operational health across the different cloud platforms. This complexity in turn introduces numerous resiliency risks within the normal operation of the application, which far outweigh the hypothetical risks of a global platform outage.
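+
+The client-side failover logic recommended above can be reduced to a small routine. The following Python sketch, using the `requests` library, is illustrative only: the two entry-point URLs are hypothetical placeholders (one per global routing service), and the timeout and failure conditions that trigger failover would need to be aligned with the specific workload scenario.
+
+```python
+import requests
+
+# Hypothetical entry points, one per global routing service (for example Azure Front Door
+# as the primary and a third-party CDN or Azure Traffic Manager profile as the secondary).
+ROUTING_ENDPOINTS = [
+    "https://primary.example.com",    # primary global routing service
+    "https://secondary.example.com",  # redundant routing technology
+]
+
+
+def call_api(path: str, timeout_seconds: float = 3.0) -> requests.Response:
+    """Attempt the request via the primary entry point, failing over to the
+    secondary entry point when the defined failure conditions are met."""
+    last_error = None
+    for endpoint in ROUTING_ENDPOINTS:
+        try:
+            response = requests.get(f"{endpoint}{path}", timeout=timeout_seconds)
+            # Treat server-side errors as a failure condition and try the next entry point.
+            if response.status_code < 500:
+                return response
+            last_error = RuntimeError(f"{endpoint} returned {response.status_code}")
+        except requests.RequestException as error:  # timeouts, DNS or TLS failures
+            last_error = error
+    raise ConnectionError("All global routing entry points failed") from last_error
+
+
+# Example usage:
+# response = call_api("/api/v1/catalog")
+```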
+
+- Whilst it is not recommended, for HTTP(S) workloads that use Azure Traffic Manager as global routing redundancy to Azure Front Door, consider whether WAF execution can be offloaded to Application Gateway, including for 'happy path' traffic flowing through Azure Front Door.
+  - This will introduce an additional failure point to the standard ingress path, an additional critical-path component to manage and scale, and will also incur additional costs to ensure global high-availability. It will, however, greatly simplify the failure scenario by providing consistency between the 'happy' and 'un-happy' ingress paths through Azure Front Door and Azure Traffic Manager, both in terms of WAF execution and private application endpoints.
+  - The loss of edge caching in a failure scenario will impact overall performance, and this must be aligned with an acceptable level of service or mitigating design approach.
+  - To ensure a consistent level of service, consider offloading edge caching to a third-party CDN provider for both 'happy' and 'un-happy' paths.
+
+> Whilst it deviates from the Azure-native design principle, it is recommended to consider a third-party global routing service in place of two Azure global routing services, since this provides the maximum level of fault mitigation and a simpler design approach, given that most industry-leading CDN providers offer edge capabilities largely consistent with those offered by Azure Front Door.
+
+**Azure Front Door**
+
+- Use the Azure Front Door managed certificate service to enable TLS connections, and remove the need to manage certificate lifecycles.
+
+- Use the Azure Front Door Web Application Firewall (WAF) to provide protection at the edge from common web exploits and vulnerabilities, such as SQL injection.
+
+- Use the Azure Front Door built-in cache to serve static content from edge nodes.
+  - In most cases this will also eliminate the need for a dedicated Content Delivery Network (CDN).
+
+- Configure the application platform ingress points to [validate incoming requests through header-based filtering](https://docs.microsoft.com/azure/frontdoor/front-door-faq#how-do-i-lock-down-the-access-to-my-backend-to-only-azure-front-door-) using the *X-Azure-FDID* header to ensure all traffic is flowing through the configured Front Door instance.
+  - Consider also configuring IP ACLing using Front Door Service Tags to validate traffic originates from the Azure Front Door backend IP address space and Azure infrastructure services. This will ensure traffic flows through Azure Front Door at a service level, but header-based filtering will still be required to ensure the use of a configured Front Door instance.
+
+- Define a custom TCP health endpoint to validate critical downstream dependencies within a regional deployment stamp, including data platform replicas, such as Cosmos DB in the example provided by the foundational reference implementation (a sketch of such an endpoint is shown under Application delivery services below).
+  - If one or more dependencies become unhealthy, the health probe should reflect this in the response returned so that the entire regional stamp can be taken out of circulation.
+
+- Ensure health probe responses are logged and ingest all operational data exposed by Azure Front Door into the global Log Analytics workspace to facilitate a unified data sink and single operational view across the entire application.
+
+- Unless the workload is extremely latency-sensitive, spread traffic evenly across all considered regional stamps to most effectively use deployed resources.
+  - To achieve this, set the ["Latency Sensitivity (Additional Latency)"](https://docs.microsoft.com/azure/frontdoor/front-door-backend-pool#load-balancing-settings) parameter to a value that is high enough to cater for latency differences between the different regions of the backends.
+  - Ensure a tolerance that is acceptable to the application workload regarding overall client request latency.
+
+- Do not enable Session Affinity unless it is required by the application, since it can have a negative impact on the balance of traffic distribution.
+  - With a fully stateless application, if the recommended AlwaysOn application design approach is followed, any request could be handled by any of the regional deployments.
+
+**Azure Traffic Manager**
+
+- Use Traffic Manager for non-HTTP/S scenarios as a replacement for Azure Front Door.
+  - Capability differences will drive different design decisions for cache and WAF capabilities, as well as TLS certificate management.
+
+- WAF capabilities should be considered within each region for the Traffic Manager ingress path, using Azure Application Gateway.
+
+- Configure a suitably low TTL value to minimize the time required to remove a backend endpoint from circulation in the event that it becomes unhealthy.
+
+- As with Azure Front Door, a custom TCP health endpoint should be defined to validate critical downstream dependencies within a regional deployment stamp, which should be reflected in the response provided by health endpoints.
+  - However, in the case of Traffic Manager, additional consideration should be given to service-level regional failover (i.e. 'dog legging') to mitigate the potential delay associated with the removal of an unhealthy backend due to dependency failures, particularly if it is not possible to set a low TTL for DNS records.
+
+- Consideration should be given to third-party CDN providers in order to achieve edge caching when using Azure Traffic Manager as a primary global routing service.
+  - Where edge WAF capabilities are also offered by the third-party service, consideration should be given to simplifying the ingress path and potentially removing the need for Application Gateway.
+
+## Application delivery services
+
+In addition to redundant global traffic routing, the network ingress path for an AlwaysOn application must also consider requisite application delivery services to ensure secure, reliable, and scalable ingress traffic.
+
+This section will therefore build on recommendations from the previous global routing section by exploring key application delivery capabilities, considering relevant services such as Azure Standard Load Balancer, Azure Application Gateway, and Azure API Management.
+
+### Design considerations
+
+- TLS encryption is critical to ensure the integrity of inbound user traffic to an AlwaysOn application, with **TLS Offloading** applied only at the point of a stamp's ingress to decrypt incoming traffic.
+  - TLS offloading requires the private key of the TLS certificate to decrypt traffic.
+
+- A **Web Application Firewall** provides protection against common web exploits and vulnerabilities, such as SQL injection or cross-site scripting, and is essential to achieve the maximum reliability aspirations of an AlwaysOn application.
+
+- Azure WAF provides out-of-the-box protection against the top 10 OWASP vulnerabilities using managed rule sets.
+  - Custom rules can also be added to extend the managed rule set.
+  - Azure WAF can be enabled within either Azure Front Door, Azure Application Gateway, or Azure CDN (currently in public preview).
+  - The features offered on each of the services differ slightly. For example, the Azure Front Door WAF provides rate limiting, geo-filtering and bot protection, which are not yet offered within the Application Gateway WAF. However, they all support both built-in and custom rules and can be set to operate in detection mode or prevention mode.
+  - The roadmap for Azure WAF will ensure a consistent WAF feature set is provided across all service integrations.
+
+- Third-party WAF technologies such as NVAs and advanced ingress controllers within Kubernetes can also be considered to provide requisite vulnerability protection.
+
+- Optimal WAF configuration typically requires fine-tuning, regardless of the technology used.
+
+**Azure Front Door**
+
+- Azure Front Door only accepts HTTP and HTTPS traffic, and will only process requests with a known `Host` header.
+  - This protocol blocking helps to mitigate volumetric attacks spread across protocols and ports, as well as DNS amplification and TCP poisoning attacks.
+
+- Azure Front Door is a global Azure resource, so configuration is deployed globally to all [edge locations](https://docs.microsoft.com/azure/frontdoor/edge-locations-by-region).
+  - Resource configuration can be distributed at a massive scale to handle hundreds of thousands of requests per second.
+  - Updates to configuration, including routes and backend pools, are seamless and will not cause any downtime during deployment.
+
+- Azure Front Door provides both a fully managed certificate service and a bring-your-own-certificate method for the client-facing SSL certificates.
+  - The fully managed certificate service provides a simplified operational approach and helps to reduce complexity in the overall design by performing certificate management within a single area of the solution.
+
+- Azure Front Door auto-rotates "Managed" certificates at least 60 days ahead of certificate expiration to protect against expired certificate risks.
+  - If self-managed certificates are used, updated certificates should be deployed no later than 24 hours prior to expiration of the existing certificate, otherwise clients may receive expired certificate errors.
+
+- Certificate updates will only result in downtime if Azure Front Door is switched between "Managed" and "Use Your Own Certificate".
+
+- Azure Front Door is protected by Azure DDoS Protection Basic, which is integrated into Front Door by default. This provides always-on traffic monitoring, real-time mitigation, and also defends against common Layer 7 DNS query floods or Layer 3/4 volumetric attacks.
+  - These protections help to maintain Azure Front Door availability even when faced with a DDoS attack.
+  - Distributed Denial of Service (DDoS) attacks can render a targeted resource unavailable by overwhelming it with illegitimate traffic.
+
+- Azure Front Door also provides WAF capabilities at a global traffic level, while Application Gateway WAF must be provided within each regional deployment stamp.
+  - Capabilities include firewall rulesets to protect against common attacks, geo-filtering, address blocking, rate limiting, and signature matching.
+
+**Azure Load Balancer**
+
+- The Azure Basic Load Balancer SKU is not backed by an SLA and has several capability constraints compared to the Standard SKU.
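+
+As noted in the global routing guidance above, both Azure Front Door and Azure Traffic Manager take a regional stamp out of rotation based on the response of a backend health endpoint. The sketch below is a minimal, illustrative example of such an endpoint written with Flask; the dependency-check functions are hypothetical placeholders and are not taken from the reference implementation.
+
+```python
+from flask import Flask, jsonify
+
+app = Flask(__name__)
+
+
+def check_database() -> bool:
+    # Placeholder: replace with a lightweight read against the regional
+    # data platform replica (for example Cosmos DB).
+    return True
+
+
+def check_message_broker() -> bool:
+    # Placeholder: replace with a connectivity check against the regional
+    # messaging component.
+    return True
+
+
+@app.route("/healthz")
+def health():
+    # Aggregate the critical downstream dependency checks for this regional stamp.
+    checks = {
+        "database": check_database(),
+        "message_broker": check_message_broker(),
+    }
+    healthy = all(checks.values())
+    # Log the probe result so it can be ingested into the Log Analytics workspace.
+    app.logger.info("health=%s checks=%s", healthy, checks)
+    # A non-200 response signals the global routing service to take the entire
+    # regional stamp out of circulation once its probes observe the failure.
+    return jsonify(status="healthy" if healthy else "unhealthy", checks=checks), (
+        200 if healthy else 503
+    )
+```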
+
+### Design Recommendations
+
+- Perform TLS Offloading in as few places as possible in order to maintain security whilst simplifying the certificate management lifecycle.
+
+- Use encrypted connections (e.g. HTTPS) from the point where TLS offloading occurs to the actual application backends.
+  - Application endpoints will not be visible to end users, so Azure-managed domains, such as `azurewebsites.net` or `cloudapp.net`, can be used with managed certificates.
+
+- For HTTP(S) traffic, ensure WAF capabilities are applied within the ingress path for all publicly exposed endpoints.
+
+- Enable WAF capabilities at a single service location, either globally with Azure Front Door or regionally with Azure Application Gateway, since this simplifies configuration fine-tuning and optimizes performance and cost.
+  - Configure WAF in Prevention mode to directly block attacks.
+  - Only use WAF in Detection mode (i.e. only logging but not blocking suspicious requests) when the performance penalty of Prevention mode is too high.
+  - The implied additional risk must be fully understood and aligned to the specific requirements of the workload scenario.
+
+- Prioritize the use of Azure Front Door WAF since it provides the richest Azure-native feature set and applies protections at the global edge, which simplifies the overall design and drives further efficiencies.
+
+- Use Azure API Management only when exposing a large number of APIs to external clients or different application teams.
+
+- Use the Azure Standard Load Balancer SKU for any internal traffic distribution scenario within micro-service workloads.
+  - Provides an SLA of 99.99% when deployed across Availability Zones.
+  - Provides critical capabilities such as diagnostics or outbound rules.
+
+- Use Azure DDoS Protection Standard plans to help protect public endpoints hosted within each application virtual network.
+
+## Virtual Network Integration
+
+An AlwaysOn application will typically encompass requirements for integration with other applications or dependent systems, which could be hosted on Azure, another public cloud, or on-premises data centers. This application integration can be accomplished using public-facing endpoints and the internet, or private networks through network-level integration. Ultimately, the method by which application integration is achieved will have a significant impact on the security, performance, and reliability of the solution, as well as strongly impacting design decisions within other AlwaysOn design areas.
+
+An AlwaysOn application can be deployed within one of three overarching network configurations, which determines how application integration can occur at a network level.
+
+1. **Public** application **without** corporate network connectivity.
+1. **Public** application **with** corporate network connectivity.
+1. **Private** application **with** corporate network connectivity.
+
+> Within an Enterprise-Scale context, configuration 1) should be deployed within an Online Landing Zone, while both 2) and 3) should be deployed within a Corp. Connected Landing Zone to facilitate network-level integration.
+
+This section will therefore explore these network integration scenarios, layering in the appropriate use of Azure Virtual Networks and surrounding Azure networking services to ensure integration requirements are optimally satisfied.
+
+### Design Considerations
+
+**No Virtual Networks**
+
+- The simplest design approach is to not deploy the application within a virtual network.
+  - Connectivity between all considered Azure services will be provided entirely through public endpoints and the Microsoft Azure backbone.
+  - Connectivity between public endpoints hosted on Azure will only traverse the Microsoft backbone and will not go over the public internet.
+  - Connectivity to any external systems outside Azure will be provided by the public internet.
+
+- This design approach adopts "identity as a security perimeter" to provide access control between the various service components and dependent solutions.
+  - While this may be an acceptable solution for scenarios which are less sensitive to security, having all application services and dependencies accessible via a public endpoint leaves them vulnerable to additional attack vectors orientated around gaining unauthorized access.
+
+- This design approach is also not applicable for all Azure services, since many services, such as AKS, have a hard requirement for an underlying virtual network.
+
+**Isolated Virtual Networks**
+
+- To mitigate the risks associated with unnecessary public endpoints, an AlwaysOn solution can be deployed within a standalone network that is not connected to other networks.
+
+- Incoming client requests will still require a public endpoint to be exposed to the internet; however, all subsequent communication can occur within the virtual network using private endpoints.
+  - When using Azure Front Door Premium, it is possible to route directly from edge nodes to private application endpoints.
+
+- While private connectivity between application components will occur over virtual networks, all connectivity with external dependencies will still rely on public endpoints.
+  - Connectivity to Azure platform services can be established via Private Endpoints if supported. If other external dependencies exist on Azure, such as another downstream application, connectivity will be provided through public endpoints and the Microsoft Azure backbone.
+  - Connectivity to any external systems outside Azure would be provided by the public internet.
+
+- For scenarios where there are no network integration requirements for external dependencies, deploying the solution within an isolated network environment provides maximum design flexibility.
+  - No addressing and routing constraints associated with broader network integration.
+
+- Azure Bastion is a fully platform-managed PaaS service that can be deployed within a virtual network and provides secure RDP/SSH connectivity to Azure VMs.
+  - When you connect via Azure Bastion, virtual machines do not need a public IP address.
+
+- The use of application virtual networks introduces significant deployment complexities within CI/CD pipelines, since both data plane and control plane access to resources hosted on private networks is required to facilitate application deployments.
+  - A secure private network path must be established to allow CI/CD tooling to perform requisite actions.
+  - Private build agents can be deployed within application virtual networks to proxy access to resources secured by the virtual network.
+
+**Connected Virtual Networks**
+
+- For scenarios with external network integration requirements, application virtual networks can be connected to other virtual networks within Azure, another cloud provider, or on-premises networks using a variety of connectivity options.
+  - For example, some application scenarios might consider application-level integration with other line-of-business applications hosted privately within an on-premises corporate network.
+
+- The application network design must align with the broader network architecture, particularly concerning topics such as addressing and routing.
+
+- Overlapping IP address spaces across Azure regions or on-premises networks will create major contention when network integration is considered.
+  - A virtual network resource can be updated to add address space; however, when the address space of a peered virtual network changes, a [sync on the peering link is required](https://azure.microsoft.com/blog/how-to-resize-azure-virtual-networks-that-are-peered-now-in-preview/), which will temporarily disable peering.
+  - Azure reserves five IP addresses within each subnet, which should be considered when determining appropriate sizes for application virtual networks and encompassed subnets.
+  - Some Azure services require dedicated subnets, such as Azure Bastion, Azure Firewall, or Azure Virtual Network Gateway.
+    - The size of these service subnets is very important, since they should be large enough to support all current instances of the service, considering future scale requirements, but not so large as to unnecessarily waste addresses.
+
+- When on-premises or cross-cloud network integration is required, Azure offers two different solutions to establish a secure connection.
+  - An ExpressRoute circuit can be sized to provide bandwidths up to 100Gbps.
+  - A Virtual Private Network (VPN) can be sized to provide aggregated bandwidth up to 10Gbps in hub-and-spoke networks, and up to 20Gbps in Azure Virtual WAN.
+
+> In an Enterprise-Scale environment, the foundational platform will provide requisite connectivity to on-premises networks using ExpressRoute as well as other virtual networks in Azure using either Virtual WAN or a hub-and-spoke network design.
+
+- The inclusion of additional network paths and resources introduces additional reliability and operational considerations for the AlwaysOn application to ensure health is maintained.
+
+### Design Recommendations
+
+- It is recommended that AlwaysOn solutions be deployed within Azure virtual networks where possible to remove unnecessary public endpoints, limiting the application attack surface to maximize security and reliability.
+  - Use Private Endpoints for connectivity to Azure platform services.
+  - Service Endpoints can be considered for services which do not support Private Link, provided data exfiltration risks are acceptable or mitigated through alternative controls.
+
+- For application scenarios which do not require corporate network connectivity, treat all virtual networks as ephemeral resources that are replaced when a new regional deployment is conducted.
+
+- When connecting to other Azure or on-premises networks, application virtual networks should not be treated as ephemeral, since doing so creates significant complications where virtual network peering and virtual network gateway resources are concerned.
+  - All relevant application resources within the virtual network should continue to be ephemeral, with parallel subnets used to facilitate blue-green deployments of updated regional deployment stamps.
+
+- In scenarios where corporate network connectivity is required to facilitate application integration over private networks, ensure that the IPv4 address space used for regional application virtual networks does not overlap with other connected networks and is properly sized to facilitate required scale without needing to update the virtual network resource and incur downtime.
+  - It is strongly recommended to only use IP addresses from the address allocation for private internets (RFC 1918).
+    - For environments with a limited availability of private IP addresses (RFC 1918), consider using IPv6.
+    - If the use of public IP addresses is required, ensure that only owned address blocks are used.
+  - Align with organization plans for IP addressing in Azure to ensure that application network IP address space does not overlap with other networks across on-premises locations or Azure regions.
+  - Do not create unnecessarily large application virtual networks to ensure that IP address space is not wasted.
+
+- Prioritize the use of Azure CNI for AKS network integration, since it [supports a richer feature set](https://docs.microsoft.com/azure/aks/concepts-network#compare-network-models).
+  - Consider Kubenet for scenarios with a limited range of available IP addresses to fit the application within a constrained address space.
+
+- For scenarios requiring on-premises network integration, prioritize the use of ExpressRoute to ensure secure and scalable connectivity.
+  - Ensure the reliability level applied to the ExpressRoute or VPN connectivity fully satisfies application requirements.
+  - Multiple network paths should be considered to provide additional redundancy when required, such as cross-connected ExpressRoute circuits or the use of VPN as a failover connectivity mechanism.
+
+- Ensure all components on critical network paths are in line with the reliability and availability requirements of associated user flows, regardless of whether the management of these paths and associated components is delivered by the application team or central IT teams.
+
+> When integrating with a broader organizational network topology, careful consideration should be given to the [Enterprise Scale Landing Zones Network Guidance](https://docs.microsoft.com/azure/cloud-adoption-framework/ready/enterprise-scale/network-topology-and-connectivity) to ensure the foundational network is aligned with Microsoft best-practices.
+
+- Use [Azure Bastion](https://docs.microsoft.com/azure/bastion/bastion-overview) or proxied private connections to access the data plane of Azure resources or perform management operations.
+
+## Internet Egress
+
+Internet egress is a foundational network requirement for an AlwaysOn application to facilitate external communication in the context of:
+
+1. Direct application user interaction.
+1. Application integration with external dependencies outside Azure.
+1. Access to external dependencies required by the Azure services leveraged by the application.
+
+This section will therefore explore how internet egress can be achieved while ensuring security, reliability, and sustainable performance are maintained, highlighting key egress requirements for services recommended in an AlwaysOn mission-critical context.
+
+### Design Considerations
+
+- Many Azure services require access to public endpoints for various management and control plane functions to operate as intended.
+
+- Azure provides different direct internet outbound [connectivity methods](https://docs.microsoft.com/azure/load-balancer/load-balancer-outbound-connections#scenarios), such as Azure NAT gateway or Azure Load Balancer, for virtual machines or compute instances on a virtual network.
+
+- When traffic from inside a virtual network travels out to the Internet, Network Address Translation (NAT) must take place.
+  - This is a compute operation that occurs within the networking stack and that can therefore impact system performance.
+
+- When NAT takes place at a small scale, the performance impact should be negligible; however, if there are a large number of outbound requests, network issues may occur.
+  - These issues typically come in the form of 'Source NAT (or SNAT) port exhaustion'.
+
+- In a multi-tenant environment, such as Azure App Service, there is a limited number of outbound ports available to each instance. If these ports run out, no new outbound connections can be initiated.
+  - This issue can be mitigated by reducing the number of private/public edge traversals or by using a more scalable NAT solution such as the [Azure NAT Gateway](https://docs.microsoft.com/azure/virtual-network/nat-gateway/nat-overview).
+
+- In addition to NAT limitations, outbound traffic may also be subject to requisite security inspections.
+  - Azure Firewall provides appropriate security capabilities to secure network egress.
+  - [Azure Firewall](https://docs.microsoft.com/azure/firewall/protect-azure-kubernetes-service) (or an equivalent NVA) can be used to secure Kubernetes egress requirements by providing granular control over outbound traffic flows.
+
+- Large volumes of internet egress will incur [data transfer charges](https://azure.microsoft.com/pricing/details/bandwidth/).
+
+**Azure NAT Gateway**
+
+- Azure NAT Gateway supports 64,000 connections for TCP and UDP per assigned outbound IP address.
+  - Up to 16 IP addresses can be assigned to a single NAT gateway.
+  - The default TCP idle timeout is 4 minutes.
+    - If the idle timeout is altered to a higher value, flows will be held for longer, which will increase pressure on the SNAT port inventory.
+
+- NAT gateway cannot provide zonal isolation out-of-the-box.
+  - To get zonal redundancy, a subnet containing zonal resources must be aligned with corresponding zonal NAT gateways.
+
+### Design Recommendations
+
+- Minimize the number of outgoing Internet connections as this will impact NAT performance.
+  - If large numbers of internet-bound connections are required, consider using [Azure NAT Gateway](https://docs.microsoft.com/azure/virtual-network/nat-gateway/nat-overview) to abstract outbound traffic flows.
+
+- Use Azure Firewall where requirements to control and inspect outbound internet traffic exist.
+  - Ensure Azure Firewall is not used to inspect traffic between Azure services.
+
+> In an Enterprise-Scale context, consider using the foundational platform Azure Firewall resource (or equivalent NVA).
+> - If a dependency is taken on a central platform resource for internet egress, then the reliability level of that resource and associated network path should be closely aligned with application requirements. Operational data from the resource should also be made available to the application in order to inform potential operational action in failure scenarios.
+
+> If there are high-scale requirements associated with outbound traffic, consideration should be given to a dedicated Azure Firewall resource for a mission-critical AlwaysOn application, to mitigate risks associated with using a centrally shared resource, such as noisy neighbor scenarios.
+> - When deployed within a Virtual WAN environment, consideration should be given to Firewall Manager to provide centralized management of dedicated application Azure Firewall instances to ensure organizational security postures are observed through global firewall policies.
+> - Ensure incremental firewall policies are delegated to application security teams via role-based access control to allow for application policy autonomy.
+
+## Inter-Zone and Inter-Region Connectivity
+
+While the AlwaysOn application design strongly advocates independent regional deployment stamps, many application scenarios may still require network integration between application components deployed within different zones or Azure regions, even if only under degraded service circumstances. The method by which inter-zone and inter-region communication is achieved has a significant bearing on overall performance and reliability, which will be explored through the considerations and recommendations within this section.
+
+### Design Considerations
+
+- The application design approach for an AlwaysOn application endorses the use of independent regional deployments with zonal redundancy applied at all component levels within a single region.
+
+- An [Availability Zone (AZ)](https://docs.microsoft.com/azure/availability-zones/az-overview#availability-zones) is a physically separate data center location within an Azure region, providing physical and logical fault isolation up to the level of a single data center.
+  - A round-trip latency of less than 2ms is guaranteed for inter-zone communication.
+  - Zones will have a small latency variance given varied distances and fiber paths between zones.
+
+- Availability Zone connectivity depends on regional characteristics, and therefore traffic entering a region via an edge location may need to be routed between zones to reach its destination.
+  - This will add a ~1ms-2ms latency given inter-zone routing and 'speed of light' constraints, but this should only have a bearing on hyper latency-sensitive workloads.
+
+- Availability Zones are treated as logical entities within the context of a single subscription, so different subscriptions might have a different zonal mapping for the same region.
+  - For example, zone 1 in Subscription A could correspond to the same physical data center as zone 2 in subscription B.
+
+- Communication between zones within a region incurs a [data transfer charge](https://azure.microsoft.com/pricing/details/bandwidth/) per GB of bandwidth.
+
+- For application scenarios that are extremely chatty between application components, spreading application tiers across zones can introduce significant latency and increased costs.
+  - It is possible to mitigate this within the design by constraining a deployment stamp to a single zone and deploying multiple stamps across the different zones.
+
+- Communication between different Azure regions incurs a larger [data transfer charge](https://azure.microsoft.com/pricing/details/bandwidth/) per GB of bandwidth.
+  - The applicable data transfer rate depends on the continent of the considered Azure regions.
+  - Data traversing continents is charged at a considerably higher rate.
+
+- ExpressRoute and VPN connectivity methods can also be used to directly connect different Azure regions, or even different cloud platforms, for certain scenarios.
+
+- For service-to-service communication, Private Link can be used to establish direct communication using private endpoints.
+
+- Traffic can be hair-pinned through ExpressRoute circuits used for on-premises connectivity in order to facilitate routing between virtual networks within an Azure region as well as across different Azure regions within the same geography.
+  - Hair-pinning traffic through ExpressRoute will bypass data transfer costs associated with virtual network peering, so can be used as a way to optimize costs.
+  - This approach necessitates additional network hops for application integration within Azure, which introduces latency and reliability risks.
+  - It also expands the role of ExpressRoute and associated gateway components from Azure/on-premises to also encompass Azure/Azure connectivity.
+
+- When sub-millisecond latency is required between services, [Proximity Placement Groups](https://docs.microsoft.com/azure/virtual-machines/co-location) can be used when supported by the services used.
+
+### Design Recommendations
+
+- Use virtual network peering to connect networks within a region and across different regions.
+  - It is strongly recommended to avoid hair-pinning traffic through ExpressRoute.
+
+- Use Private Link to establish communication directly between services in the same region or across regions (e.g. a service in Region A communicating with a service in Region B).
+
+- For application workloads which are extremely chatty between components, consider constraining a deployment stamp to a single zone and deploying multiple stamps across the different zones.
+  - This ensures zonal redundancy is maintained at the level of an encapsulated deployment stamp rather than a single application component.
+
+- Where possible, treat each deployment stamp as independent and disconnected from other stamps.
+  - Use data platform technologies to synchronize state across regions rather than achieving consistency at an application level with direct network paths.
+  - Avoid 'dog legging' traffic between different regions unless necessary, even in a failure scenario.
+  - Use global routing services and end-to-end health probes to take an entire stamp out of circulation in the event that a single critical component tier fails, rather than routing traffic at that faulty component level to another region.
+
+- For hyper latency-sensitive application scenarios, prioritize the use of zones with regional network gateways to optimize network latency for ingress paths.
+
+## Micro-Segmentation and Kubernetes Network Policies
+
+Micro-segmentation is a network security design pattern used to isolate and secure individual application workloads, with policies applied to limit network traffic between workloads based on a Zero Trust model. It is typically applied to reduce network attack surface, improve breach containment, and strengthen security through policy-driven application-level network controls.
+
+An AlwaysOn application can enforce application-level network security using Network Security Groups (NSG) at either a subnet or network interface level, service Access Control Lists (ACL), and network policies when using Azure Kubernetes Service (AKS). This section will therefore explore the optimal use of these capabilities, providing key considerations and recommendations to achieve application-level micro-segmentation.
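+
+To make the policy-driven controls discussed in the considerations and recommendations that follow more concrete, the sketch below applies a default-deny ingress NetworkPolicy using the Kubernetes Python client. The `workload` namespace and policy name are illustrative assumptions; granular allow policies would then be layered on top, as described later in this section.
+
+```python
+from kubernetes import client, config
+
+# Default-deny ingress policy: selects every pod in the namespace and defines no
+# ingress rules, so all ingress traffic is denied until specific allow policies are added.
+default_deny_ingress = {
+    "apiVersion": "networking.k8s.io/v1",
+    "kind": "NetworkPolicy",
+    "metadata": {"name": "default-deny-ingress", "namespace": "workload"},
+    "spec": {
+        "podSelector": {},           # an empty selector matches all pods in the namespace
+        "policyTypes": ["Ingress"],
+    },
+}
+
+
+def apply_policy() -> None:
+    # Use config.load_incluster_config() instead when running inside the cluster.
+    config.load_kube_config()
+    networking = client.NetworkingV1Api()
+    networking.create_namespaced_network_policy(
+        namespace="workload", body=default_deny_ingress
+    )
+
+
+if __name__ == "__main__":
+    apply_policy()
+```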
+
+### Design Considerations
+
+- AKS can be deployed in two different [networking models](https://docs.microsoft.com/azure/aks/concepts-network#azure-cni-advanced-networking).
+  - Kubenet networking: AKS nodes are integrated within an existing virtual network, but pods exist within a virtual overlay network on each node.
+  - Azure Container Networking Interface (CNI) networking: the AKS cluster is integrated within an existing virtual network and treated as a single large network, with each pod assigned its own IP address.
+
+- By default, pods are non-isolated: they accept traffic from any source and can send traffic to any destination, and a pod can communicate with every other pod in a given Kubernetes cluster. Kubernetes does not ensure any network-level isolation, and does not isolate namespaces at the cluster level.
+
+- Pods can be isolated using [Network Policies](https://kubernetes.io/docs/concepts/services-networking/network-policies/).
+  - AKS provides two ways to implement Network Policy, and both implementations use Linux IPTables to enforce specified policies.
+    - _Azure Network Policies_
+    - _Calico Network Policies_
+  - Network policies do not conflict since they are additive.
+  - For a network flow between two pods to be allowed, both the egress policy on the source pod and the ingress policy on the destination pod need to allow the traffic.
+  - The network policy feature can only be enabled at cluster instantiation time.
+    - It is not possible to enable network policy on an existing AKS cluster.
+
+- The delivery of network policies is consistent regardless of whether Azure or Calico is used.
+  - Calico provides a [richer feature set](https://docs.microsoft.com/azure/aks/use-network-policies#differences-between-azure-and-calico-policies-and-their-capabilities), including support for Windows nodes.
+  - Calico introduces another party within the support ecosystem; either Calico community or paid support.
+  - 1st-party Azure support is provided for Azure network policies.
+
+> Network Policy is a Kubernetes specification that defines access policies for communication between Pods. Using Network Policies, an ordered set of rules can be defined to control how traffic is sent/received, and applied to a collection of pods that match one or more label selectors.
+
+- AKS supports the creation of different node pools to separate different workloads using nodes with different hardware and software characteristics, such as nodes with and without GPU capabilities.
+  - Using node pools does not provide any network-level isolation.
+  - All node pools must reside within the same virtual network, and separating node pools into different subnets is [currently in preview](https://github.com/Azure/AKS/issues/1338).
+  - NSGs at the subnet level can be applied to implement micro-segmentation between node pools.
+
+### Design Recommendations
+
+- Configure an NSG on all considered subnets to provide an IP ACL to secure ingress paths and isolate application components based on a Zero Trust model.
+  - Use Front Door Service Tags within NSGs on all subnets containing application backends defined within Azure Front Door, since this will validate traffic originates from a legitimate Azure Front Door backend IP address space.
+    - This will ensure traffic flows through Azure Front Door at a service level, but header-based filtering will still be required to ensure the use of a particular Front Door instance and to also mitigate 'IP spoofing' security risks.
+  - Public internet traffic should be disabled on RDP and SSH ports across all applicable NSGs.
+
+- Enable [Network Policy](https://docs.microsoft.com/azure/aks/use-network-policies) for Azure Kubernetes Service at deployment time.
+  - The Network Policy feature in Kubernetes should be used to define rules for ingress and egress traffic between pods in a cluster.
+  - Define granular Network Policies to restrict and limit cross-pod communication.
+
+- Prioritize the use of _Calico Network Policies_ because Calico provides a richer feature set with broader community adoption and support.
+
+---
+
+|Previous Page|Next Page|
+|:--|:--|
+|[Deployment and Testing](./Deployment-Testing.md) |[Security](./Security.md) |
+
+---
+
+|Design Methodology|
+|--|
+|[How to use the AlwaysOn Design Methodology](./README.md)
+|[AlwaysOn Design Principles](./Principles.md)
+|[AlwaysOn Design Areas](./Design-Areas.md)
+|[Application Design](./App-Design.md)
+|[Application Platform](./App-Platform.md)
+|[Data Platform](./Data-Platform.md)
+|[Health Modeling and Observability](./Health-Modeling.md)
+|[Deployment and Testing](./Deployment-Testing.md)
+|[Networking and Connectivity](./Networking.md)
+|[Security](./Security.md)
+|[Operational Procedures](./Operational-Procedures.md)
+
+---
+
+[AlwaysOn | Documentation Inventory](/docs/README.md)
diff --git a/docs/design-methodology/Operational-Procedures.md b/docs/design-methodology/Operational-Procedures.md
new file mode 100644
index 00000000..442a14c5
--- /dev/null
+++ b/docs/design-methodology/Operational-Procedures.md
@@ -0,0 +1,175 @@
+# Operational procedures
+
+The AlwaysOn design methodology leans heavily on the principles *automation wherever possible* and *configuration as code* to drive reliable and effective operations through DevOps processes, with automated deployment pipelines used to execute versioned application and infrastructure code artifacts within a source repository. While this level of DevOps adoption requires substantial engineering investment to instantiate and discipline to maintain, it yields significant operational dividends, enabling consistent and accurate operational outcomes with minimal manual operational procedures.
+
+- [DevOps Processes](#devops-processes)
+- [Application Operations](#application-operations)
+
+## DevOps processes
+
+DevOps is a fundamental characteristic of the AlwaysOn design methodology, providing the engineering mindset, processes, and tooling to deliver application services in a fast, efficient, and reliable manner. More specifically, DevOps brings together development and operational processes as well as teams into a single engineering function that encompasses the entire application lifecycle, leveraging automation and DevOps tooling to conduct deployment operations swiftly and reliably. This section will therefore explore how the adoption of DevOps and related deployment methods is used to drive effective and consistent operational procedures.
+
+### Design considerations
+
+- DevOps processes support and sustain the concepts of continuous integration and continuous deployment (CI/CD), while fostering a culture of continuous improvement.
+
+- DevOps can be difficult to apply when there are hard dependencies on central IT functions since it prevents end-to-end operational action.
+
+- Key responsibilities of the DevOps team for an AlwaysOn application include:
+  - Creation and management of application and infrastructure resources through CI/CD automation.
+  - Application monitoring and observability.
+  - Azure RBAC and identity for application components.
+  - Security monitoring and audit of application resources.
+  - Network management for application components.
+  - Cost management for application resources.
+
+- DevSecOps expands the DevOps model by integrating security and quality assurance teams with development and operations throughout the application lifecycle.
+
+- A DevOps engineering team can consider a variety of granular Azure RBAC roles for different technical personas, such as AppDataOps for database management.
+  - A zero trust model can and should be applied across different application DevOps personas.
+
+### Design recommendations
+
+- Define configuration settings and updates for application components or underlying infrastructure as code.
+  - Manage any changes to code through a consistent release and update process, including tasks such as key or secret rotation and permission management.
+  - Prioritize pipeline-managed update processes, such as with scheduled pipeline runs, over built-in auto-update mechanisms.
+
+- Do not use central processes or provisioning pipelines for the instantiation or management of AlwaysOn application resources, since this introduces external application dependencies and additional risk vectors, such as those associated with 'noisy neighbor' scenarios.
+  - If centralized provisioning processes are mandated, ensure the availability requirements of leveraged dependencies are fully in-line with application requirements, and ensure operational transparency is provided to allow for holistic operationalization of the end-to-end application.
+
+- Embrace continuous improvement and allocate a proportion of engineering capacity within each sprint to optimize platform fundamentals.
+  - Consider allocating 20-40% of capacity within each sprint to drive fundamental platform improvements and bolster reliability.
+
+- To accelerate the development of new services, consider the creation of a common engineering criteria and reference architectures/libraries for service teams to leverage, ensuring consistent alignment with core AlwaysOn principles.
+  - Enforce consistent baseline configurations for reliability, security, and operations through a policy-driven approach using Azure Policy (a minimal sketch of a policy assignment follows at the end of this section).
+
+> This common engineering criteria and associated artifacts, such as Azure Policies and Terraform for common design patterns, can also be leveraged across other workloads within the broader application ecosystem for an organization.
+
+- Consider a DevSecOps model for security-sensitive and highly regulated scenarios, to ensure security is baked into the DNA of the engineering team throughout the development lifecycle rather than addressed at a specific release stage/gate.
+
+- Apply a zero trust model within critical application environments, leveraging capabilities such as Azure AD Privileged Identity Management (PIM) to ensure consistent operations only occur through CI/CD processes or automated operational procedures.
+  - No physical users should have standing write-access to any environment, with the possible exception of development environments for easier testing and debugging.
+
+- Define emergency processes for Just-in-time access to production environments.
+  - Ensure 'break glass' accounts exist in the event of serious issues with the authentication provider.
+
+- Consider AIOps as a method to continually improve operational procedures and triggers.
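+
+As a sketch of the policy-driven baseline approach recommended above, the snippet below assigns a policy definition at subscription scope using the Azure SDK for Python. The subscription ID, definition ID, and assignment name are placeholder assumptions and would be replaced by the definitions that encode the organization's common engineering criteria.
+
+```python
+from azure.identity import DefaultAzureCredential
+from azure.mgmt.resource.policy import PolicyClient
+
+subscription_id = "<subscription-id>"        # placeholder
+scope = f"/subscriptions/{subscription_id}"  # could also be a management group or resource group
+
+policy_client = PolicyClient(DefaultAzureCredential(), subscription_id)
+
+# Assign a placeholder policy definition that enforces part of the common
+# engineering criteria (for example, requiring zone-redundant deployments).
+assignment = policy_client.policy_assignments.create(
+    scope,
+    "alwayson-baseline-example",
+    {
+        "policy_definition_id": "/providers/Microsoft.Authorization/policyDefinitions/<definition-guid>",
+        "display_name": "AlwaysOn reliability baseline (example)",
+    },
+)
+print(assignment.name)
+```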
+
+## Application Operations
+
+The application design and platform recommendations provided within the AlwaysOn design methodology have a significant bearing on effective operations, and the extent to which these recommendations are adhered to will therefore greatly influence the effectiveness of operational procedures.
+
+Furthermore, there is a varied set of operational capabilities provided by different Azure services, particularly when it comes to high availability and recovery. It is therefore also important to understand and leverage the operational capabilities of the services used.
+
+This section will therefore highlight key operational aspects associated with AlwaysOn application design and recommended platform services.
+
+### Design considerations
+
+- Azure services provide a combination of built-in (enabled by default) and configurable platform capabilities, such as zonal redundancy or geo-replication. The service-level configuration of each service within an application must therefore be considered by operational procedures.
+  - Many configurable capabilities incur an additional cost, such as the multi-write deployment configuration for Cosmos DB.
+
+- AlwaysOn strongly endorses the principle of ephemeral stateless application resources, meaning that updates can typically be performed through a new deployment and standard delivery pipelines.
+
+- The vast majority of the required operations are exposed and accessible via the Azure ARM management APIs or through the Azure Portal.
+
+- Some more intensive operations, such as a restore from a periodic Cosmos DB backup or the recovery of a deleted resource, can only be performed by Azure Support Engineers via a Support Case.
+
+- For stateless resources and resources which can be entirely configured from deployment, such as Azure Front Door and associated backends/origins, re-deployment will generally result in an operational resource faster than a Support process to recover the deleted resource.
+
+- Azure Policy provides a framework to enforce and audit security and reliability baselines, ensuring continued compliance with a common engineering criteria for an AlwaysOn application. More specifically, Azure Policy forms a key part of the Azure Resource Manager (ARM) control plane, supplementing RBAC by restricting what actions authorized users can perform, and can be leveraged to enforce vital security and reliability conventions across utilized platform services.
+  - Azure Policy can be extended within Azure Kubernetes Service (AKS) via [Azure Policy for Kubernetes](https://docs.microsoft.com/azure/governance/policy/concepts/policy-for-kubernetes), which provides visibility of components deployed within clusters.
+
+- Azure [resources can be locked](https://docs.microsoft.com/azure/azure-resource-manager/management/lock-resources) to prevent them from being modified or deleted.
+  - Locks introduce management overhead within deployment pipelines, which must remove locks, perform deployment steps, and subsequently re-enable them.
+  - Generally, for most resources a robust RBAC process, with tight restrictions on who can perform write operations, should be preferred over locking resources.
+
+**Update management**
+
+- Key, secret, and certificate expiry are common causes of application outage.
+
+- The Kubernetes version within AKS needs to be updated on a regular basis, especially given support for older versions is not sustained.
+  - Components running on Kubernetes also need to be updated, such as cert-manager and the Key Vault CSI driver, and kept aligned with the Kubernetes version within AKS.
+
+- Terraform providers need to be updated on a regular basis. Newer provider versions frequently contain breaking changes which must be properly tested before being applied in production.
+
+- Other application dependencies, such as the runtime environment (.NET, Java, Python), should be monitored and kept up-to-date.
+
+- New versions of packages, components, and dependencies need to be properly tested before consideration in a production context.
+
+- Container registries will likely need regular housekeeping to delete old image versions that are no longer used.
+
+- Azure Policy provides native support for a wide variety of Azure resources.
+
+### Design recommendations
+
+- Use an active-active deployment model, leveraging a health model and automated scale operations to ensure no failover intervention is required.
+  - If using an active-passive or active-standby model, ensure failover procedures are automated or at least codified within pipelines so that no manual steps besides triggering are required during operational crises.
+
+- Prioritize the use of Azure-native auto-scale functionality for all available services.
+  - Establish automated operational processes to scale services which do not offer auto-scale.
+  - Leverage scale-units composed of multiple services to provide requisite scalability under relevant circumstances.
+
+- Identify operational procedures and tasks required by global (long-lived) application resources, such as Cosmos DB or ACR in the AlwaysOn reference implementation.
+  - For example, in the event that a Cosmos DB resource or encompassed data is incorrectly modified or deleted, the possible ways of recovery should be well understood and a recovery process should exist.
+  - Similarly, establish procedures to manage decommissioned container images in the registry.
+
+- It is strongly recommended to practice recovery operations in advance, on non-production resources and data, as part of standard business continuity preparations.
+
+- Use platform-native capabilities for backup and restore, ensuring they are aligned with RTO/RPO and data retention requirements.
+  - Avoid building custom solutions unless absolutely necessary.
+  - Define a strategy for long-term backup retention if required.
+
+- Use built-in capabilities for SSL certificate management and renewal, such as those offered by Azure Front Door.
+
+- Wherever possible, use Managed Identities in order to avoid dealing with Service Principal credentials or API keys.
+
+- When storing secrets, keys, or certificates in Azure Key Vault, make use of the expiry setting and have [alerting configured](https://docs.microsoft.com/azure/key-vault/general/event-grid-tutorial) for upcoming expirations.
+  - All key, secret, and certificate updates should be performed using the standard release process.
+
+- Identify critical operational alerts and define target audiences and systems, with clear channels to reach them.
+  - Avoid 'white-noise' by only sending actionable alerts, to prevent operational stakeholders from ignoring alerts and missing important information.
+  - Leverage continuous improvement to optimize alerting and remove observed 'white-noise'.
+
+- Apply the principle of policy-driven governance and Azure Policy to ensure the appropriate use of operational capabilities and reliable configuration baselines across all application services.
+
+- Apply a resource lock to prevent the deletion of long-lived global resources, such as Azure Cosmos DB.
+  - Avoid the use of resource locks on ephemeral regional resources, and instead rely on the appropriate use of Role Based Access Control (RBAC) and CI/CD pipelines to control operational updates.
+
+**Update management**
+
+- Update external libraries, SDKs, and runtimes frequently, treating these updates as any other change to the application. This will ensure the latest security fixes and performance optimizations are applied.
+  - Ensure all updates are validated prior to production release.
+  - Set up processes to monitor and automatically detect updates, such as [GitHub's Dependabot](https://github.com/dependabot).
+
+- All operational tasks, such as key and secret rotation, should be handled using either Azure-native platform capabilities or via the standard release process applied for code and configuration changes.
+  - Ensure key, secret, and certificate rotation is performed on a regular basis.
+
+- Manual operational changes to update components should be avoided and only considered by emergency exception.
+  - Ensure a process exists to reconcile any manual changes back into the source repository, avoiding drift and issue recurrence.
+  - Establish an automated housekeeping procedure to [remove old image versions from Azure Container Registry](https://docs.microsoft.com/azure/container-registry/container-registry-auto-purge).
+
+---
+
+|Previous Page|
+|:--|
+|[Security](./Security.md) |
+
+---
+
+|Design Methodology|
+|--|
+|[How to use the AlwaysOn Design Methodology](./README.md)
+|[AlwaysOn Design Principles](./Principles.md)
+|[AlwaysOn Design Areas](./Design-Areas.md)
+|[Application Design](./App-Design.md)
+|[Application Platform](./App-Platform.md)
+|[Data Platform](./Data-Platform.md)
+|[Health Modeling and Observability](./Health-Modeling.md)
+|[Deployment and Testing](./Deployment-Testing.md)
+|[Networking and Connectivity](./Networking.md)
+|[Security](./Security.md)
+|[Operational Procedures](./Operational-Procedures.md)
+
+---
+
+[AlwaysOn | Documentation Inventory](/docs/README.md)
diff --git a/docs/design-methodology/Principles.md b/docs/design-methodology/Principles.md
new file mode 100644
index 00000000..e91da93f
--- /dev/null
+++ b/docs/design-methodology/Principles.md
@@ -0,0 +1,103 @@
+# Design principles
+
+The AlwaysOn architectural framework presented within this repository is underpinned by 5 key design principles which serve as a compass for subsequent design decisions across technical domains and the critical design areas. Readers are strongly advised to familiarize themselves with these principles to better understand their impact and the trade-offs associated with non-adherence.
+
+1. **Maximum Reliability** - Fundamental pursuit of the most reliable solution, ensuring trade-offs are properly understood.
+1. **Sustainable Performance and Scalability** - Design for scalability across the end-to-end solution without performance bottlenecks.
+1. **Operations by Design** - Engineered to last with robust and assertive operational management.
+1. **Cloud-Native Design** - Focus on using native platform services to minimize operational burdens, while mitigating known gaps.
+1. **Always Secure** - Design for end-to-end security to maintain application stability and ensure availability.
+ +[![AlwaysOn Design Principles](/docs/media/alwayson-design-principles.png "AlwaysOn Design Principles")](./Principles.md) + +## Maximum reliability + +- **Design for failure** - Failure is impossible to avoid in a highly distributed multi-tenant cloud environment like Azure. By anticipating failures and cascading or correlated impact, from individual components to entire Azure regions, a solution can be designed and developed in a resilient manner. + +- **Observe application health** - Before issues impacting application reliability can be mitigated, they must first be detected. By monitoring the operation of an application relative to a known healthy state it becomes possible to detect or even predict reliability issues, allowing for swift remedial action to be taken. + +- **Drive automation** - One of the leading causes of application downtime is human error, whether that be due to the deployment of insufficiently tested software or misconfiguration. To minimize the possibility and impact of human errors, it is vital to strive for automation in all aspects of a cloud solution to improve reliability; automated testing, deployment, and management. + +- **Design for self-healing** - Self healing describes a system's ability to deal with failures automatically through pre-defined remediation protocols connected to failure modes within the solution. It is an advanced concept that requires a high level of system maturity with monitoring and automation, but should be an aspiration from inception to maximize reliability. + +## Sustainable performance and scalability + +- **Design for scale-out** - Scale-out is a concept that focuses on a system's ability to respond to demand through horizontal growth. This means that as traffic grows, more resource units are added in parallel instead of increasing the size of the existing resources. A systems ability to handle expected and unexpected traffic increases through scale-units is essential to overall performance and reliability by further reducing the impact of a single resource failure. + +- **Model capacity** - The system's expected performance under various load profiles should be modeled through load and performance tests. This capacity model enables planning of resource scale levels for a given load profile, and additionally exposes how system components perform in relation to each other, therefore enabling system-wide capacity allocation planning. + +- **Test and experiment often** - Testing should be performed for each major change as well as on a regular basis. Such testing should be performed in testing and staging environments, but it can also be beneficial to run a subset of tests against the production environment. Ongoing testing validates existing thresholds, targets and assumptions and will help to quickly identify risks to resiliency and availability. + +- **Baseline performance and identify bottlenecks** - Performance testing with detailed telemetry from every system component allows for the identification of bottlenecks within the system, including components which need to be scaled in relation to other components, and this information should be incorporated into the capacity model. + +- **Use containerized or serverless architecture** - Using managed compute services and containerized architectures significantly reduces the ongoing administrative and operational overhead of designing, operating, and scaling applications by shifting infrastructure deployment and maintenance to the managed service provider. 
+ +## Operations by design + +- **Loosely coupled components** - Loose coupling enables independent and on-demand testing, deployments, and updates to components of the application while minimizing inter-team dependencies for support, services, resources, or approvals. + +- **Optimize build and release process** - Fully automated build and release processes reduce the friction and increase the velocity of deploying updates, bringing repeatability and consistency across environments. Automation shortens the feedback loop from developers pushing changes to getting automated near instantaneous insights on code quality, test coverage, security, and performance, which increases developer productivity and team velocity. + +- **Understand operational health** - Full diagnostic instrumentation of all components and resources enables ongoing observability of logs, metrics and traces, and enables health modeling to quantify application health in the context to availability and performance requirements. + +- **Rehearse recovery and practice failure** - Business Continuity (BC) and Disaster Recovery (DR) planning and practice drills are essential and should be conducted periodically, since learnings from drills can iteratively improve plans and procedures to maximize resiliency in the event of unplanned downtime. + +- **Embrace continuous operational improvement** - Prioritize routine improvement of the system and user experience, leveraging a health model to understand and measure operational efficiency with feedback mechanisms to enable application teams to understand and address gaps in an iterative manner. + +## Cloud native design + +- **Azure-native managed services** - Azure-native managed services are prioritized due to their lower administrative and operational overhead as well as tight integration with consistent configuration and instrumentation across the application stack. + +- **Roadmap alignment** - Incorporate upcoming new and improved Azure service capabilities as they become Generally Available (GA) to stay close to the leading edge of Azure. + +- **Embrace preview capabilities and mitigate known gaps** - While Generally Available (GA) services are prioritized for supportability, Azure service previews are actively explored for rapid incorporation, providing technical and actionable feedback to Azure product groups to address gaps. + +- **Landing Zone alignment** - Deployable within an [Azure Landing Zone](https://docs.microsoft.com/azure/cloud-adoption-framework/ready/landing-zone/) and aligned to the Azure Landing Zone design methodology, but also fully functional and deployable in a bare environment outside of a Landing Zone. + +## Always secure + +- **Monitor the security of the entire solution and plan incident responses** - Correlate security and audit events to model application health and identify active threats. Establish automated and manual procedures to respond to incidents leveraging Security Information and Event Management (SIEM) tooling for tracking. + +- **Model and test against potential threats** - Ensure appropriate resource hardening and establish procedures to identify and mitigate known threats, using penetration testing to verify threat mitigation, as well as static code analysis and code scanning. + +- **Identify and protect endpoints** - Monitor and protect the network integrity of internal and external endpoints through security capabilities and appliances, such as firewalls or web application firewalls. 
Use industry standard approaches to protect against common attack vectors like Distributed Denial-Of-Service (DDoS) attacks such as SlowLoris. + +- **Protect against code level vulnerabilities** - Identify and mitigate code-level vulnerabilities, such as cross-site scripting or SQL injection, and incorporate security patching into operational lifecycles for all parts of the codebase, including dependencies. + +- **Automate and use least privilege** - Drive automation to minimize the need for human interaction and implement least privilege across both the application and control plane to protect against data exfiltration and malicious actor scenarios. + +- **Classify and encrypt data** - Classify data according to risk and apply industry standard encryption at rest and in transit, ensuring keys and certificates are stored securely and managed properly. + +# Additional project principles + +- **Production ready artifacts**: Every AlwaysOn technical artifact will be ready for use in production environments with all end-to-end operational aspects considered. + +- **Rooted in 'customer truth'** - All technical decisions will be guided by the experience customers have on the platform and the feedback they share. + +- **Azure roadmap alignment** - The AlwaysOn architecture will have its own roadmap that is aligned with Azure product roadmaps. + +--- + +|Previous Page|Next Page| +|--|--| +|[How to use the AlwaysOn Design Guidelines](./README.md)|[AlwaysOn Design Areas](./Design-Areas.md) + +--- + +|Design Methodology| +|--| +|[How to use the AlwaysOn Design Methodology](./README.md) +|[AlwaysOn Design Principles](./Principles.md) +|[AlwaysOn Design Areas](./Design-Areas.md) +|[Application Design](./App-Design.md) +|[Application Platform](./App-Platform.md) +|[Data Platform](./Data-Platform.md) +|[Health Modeling and Observability](./Health-Modeling.md) +|[Deployment and Testing](./Deployment-Testing.md) +|[Networking and Connectivity](./Networking.md) +|[Security](./Security.md) +|[Operational Procedures](./Operational-Procedures.md) + +--- + +[AlwaysOn | Documentation Inventory](/docs/README.md) diff --git a/docs/design-methodology/README.md b/docs/design-methodology/README.md new file mode 100644 index 00000000..ad4b6f42 --- /dev/null +++ b/docs/design-methodology/README.md @@ -0,0 +1,79 @@ +# How to Use the AlwaysOn design methodology + +The AlwaysOn design methodology is intended to define easy to follow guidance surrounding critical design decisions which must be made to produce a target AlwaysOn architecture. + +## Critical design path + +At the heart of an AlwaysOn target architecture definition lies a critical design path, comprised of 5 foundational [design principles](./Principles.md) and 8 [fundamental design areas](./Design-Areas.md) with heavily interrelated and dependent design decisions. + +Ultimately, the impact of decisions made within each area will reverberate across other design areas and design decisions. Readers are strongly advised to familiarize themselves with these 8 critical design areas, reviewing provided considerations and recommendations to better understand the consequences of encompassed decisions, which may later produce trade-offs or unforeseen consequences within related areas. For example, to define a target architecture it is critical to determine how best to monitor application health across key components. In this instance, the reader should review the Health Modelling design area, using the outlined recommendations to help drive decisions. 
+
+## Design for business requirements
+
+Not all business-critical applications have the same requirements, and as a result the review considerations and design recommendations provided by the AlwaysOn design methodology may yield different design decisions and trade-offs, which is to be expected.
+
+### Reliability tiers
+
+Reliability is a subjective concept and for an application to be appropriately reliable it must reflect the business requirements surrounding it. For example, a mission-critical application with a 99.999% availability Service Level Objective (SLO) requires a much higher level of reliability and operational rigor than another application with an SLO of 99.9%. However, there are obvious financial and opportunity cost implications for introducing greater reliability, and such trade-offs should be carefully considered.
+
+The AlwaysOn design methodology leverages several reliability tiers oriented around an availability SLO to introduce further specificity to design recommendations. These tiers are therefore intended as a reference to help readers better navigate requisite design decisions to achieve required levels of reliability; the underlying downtime calculation is sketched at the end of this section.
+
+|Reliability Tier (Availability SLO)|Permitted Downtime (Week)|Permitted Downtime (Month)|Permitted Downtime (Year)|
+|--|--|--|--|
+|99.9%|10 minutes, 4 seconds|43 minutes, 49 seconds|8 hours, 45 minutes, 56 seconds|
+|99.95%|5 minutes, 2 seconds|21 minutes, 54 seconds|4 hours, 22 minutes, 58 seconds|
+|99.99%|1 minute|4 minutes, 22 seconds|52 minutes, 35 seconds|
+|99.999%|6 seconds|26 seconds|5 minutes, 15 seconds|
+|99.9999%|<1 second|2 seconds|31 seconds|
+
+> It is important to note that AlwaysOn considers availability to be more than simple uptime, but rather a consistent level of application service relative to a known healthy application state which is captured by a codified health model.
+
+The pursuit of a particular reliability tier ultimately has a significant bearing on the critical design path and encompassed design decisions, resulting in a different target architecture.
+
+The image below demonstrates how the different reliability tiers and underlying business requirements influence the target architecture for the foundational reference implementation, particularly concerning the number of regional deployments and utilized global technologies.
+
+[![AlwaysOn Reliability Tiers](/docs/media/alwayson-reliability-tiers.png "AlwaysOn Reliability Tiers")](./README.md)
+
+[![AlwaysOn Availability Targets](/docs/media/alwayson-slo.gif "AlwaysOn Availability Targets")](./README.md)
+
+### Opportunity cost
+
+There is an opportunity cost associated with achieving an AlwaysOn application design since it requires significant engineering investment in fundamental reliability concepts, such as fully embracing Infrastructure as Code, deployment automation, and chaos engineering. This comes at a cost in terms of time/effort which could be invested elsewhere to deliver new application functionality and features.
+
+Furthermore, maximizing reliability with an AlwaysOn application design can also have a significant bearing on financial costs, primarily through the duplication of resources and the distribution of resources across regions to achieve high availability. To avoid excess costs, it is highly recommended that **AlwaysOn solutions not be over-engineered/over-optimized/over-provisioned beyond relevant business requirements**.
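+
+The permitted downtime figures in the reliability tiers table above follow from simple arithmetic on the availability target; as a minimal worked example for the 99.99% tier over one week (604,800 seconds):
+
+```latex
+\text{permitted downtime} = (1 - \text{SLO}) \times T
+  = (1 - 0.9999) \times 604{,}800\,\text{s} \approx 60\,\text{s} \approx 1\ \text{minute}
+```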
+ +## Synthetic application construction + +In parallel to design activities, it is highly recommended that a synthetic AlwaysOn application environment be established using the [foundational-online](https://github.com/Azure/AlwaysOn-Foundational-Online) and [foundational-connected](https://github.com/Azure/AlwaysOn-Foundational-Connected) reference implementations. + +This provides hands-on opportunities to validate design decisions by replicating the target architecture, allowing for design uncertainty to be swiftly assessed. If applied correctly with representative requirement coverage, most problematic issues likely to hinder progress can be uncovered and subsequently addressed. + +## Target architecture evolution + +Application architectures established using the AlwaysOn design methodology must continue to evolve in alignment with Azure platform roadmaps to support optimized sustainability. + +--- + +|Previous Page|Next Page| +|--|--| +|[Introduction to AlwaysOn](../introduction/README.md)|[AlwaysOn Design Principles](./Principles.md) + +--- + +|Design Methodology| +|--| +|[How to use the AlwaysOn Design Methodology](./README.md) +|[AlwaysOn Design Principles](./Principles.md) +|[AlwaysOn Design Areas](./Design-Areas.md) +|[Application Design](./App-Design.md) +|[Application Platform](./App-Platform.md) +|[Data Platform](./Data-Platform.md) +|[Health Modeling and Observability](./Health-Modeling.md) +|[Deployment and Testing](./Deployment-Testing.md) +|[Networking and Connectivity](./Networking.md) +|[Security](./Security.md) +|[Operational Procedures](./Operational-Procedures.md) + +--- + +[AlwaysOn | Documentation Inventory](/docs/README.md) diff --git a/docs/design-methodology/Security.md b/docs/design-methodology/Security.md new file mode 100644 index 00000000..0379057f --- /dev/null +++ b/docs/design-methodology/Security.md @@ -0,0 +1,366 @@ +# Security + +Security is paramount for any mission-critical application, particularly given the myriad of threat actors that exist in present-day society. Security must therefore be treated as a first-class concern within the AlwaysOn architectural process, to ensure that security services and practices are interwoven within the solution. + +Security is ultimately an extremely broad technical domain, encompassing a variety of threat vectors that collectively stretch across the entire application stack. However, given the primary aspiration of AlwaysOn is to maximize reliability for application scenarios which must remain performant and available, the security lens applied within this design area will focus on mitigating threats with the capacity to impact availability and hinder overall reliability. For example, how an application mitigates attack vectors such as DDoS and Slowloris will have a critical bearing on overall reliability, since successful DDoS attacks will have a catastrophic impact on availability and performance. Hence, an application must be fully protected against threats intended to directly or indirectly compromise application reliability to be truly 'always on'. + +It is also important to note that there are often significant trade-offs associated with a hardened security posture, particularly with respect to performance, operational agility, and in some cases reliability. 
For example, the inclusion of inline Network Virtual Appliances (NVA) for Next-Generation Firewall (NGFW) capabilities, such as deep packet inspection, will introduce a significant performance penalty, additional operational complexity, and a reliability risk if scalability and recovery operations are not closely aligned with that of the application. It is therefore essential that additional security components and practices intended to mitigate key threat vectors are also designed to support the reliability target of an AlwaysOn application, which will form a key aspect of the recommendations and considerations presented within this section. + +- [Zero Trust](#zero-trust) +- [Threat Modeling](#threat-modeling) +- [Network Intrusion Protection](#network-intrusion-protection) +- [Data Integrity Protection](#data-integrity-protection) +- [Policy Driven Governance](#policy-driven-governance) + +## Zero Trust + +The [Zero Trust](https://www.microsoft.com/security/business/zero-trust) security model provides a proactive and integrated approach to applying security across all layers of an application estate, to explicitly and continuously verify every transaction, assert least privilege, leverage intelligence and advanced detection to respond to threats in near real-time. It is ultimately centered on eliminating trust inside and outside of application perimeters, enforcing verification for anything attempting to connect to the system. + +- **Verify explicitly**: Always authenticate and authorize based on all available data points, including user identity, location, device health, service or workload, data classification, and anomalies. + +- **Use least privileged access**: Limit user access with just-in-time and just-enough-access (JIT/JEA), risk-based adaptive polices, and data protection to help secure both data and productivity. + +- **Assume breach**: Minimize blast radius and segment access. Verify end-to-end encryption and use analytics to get visibility, drive threat detection, and improve defenses. + +> The AlwaysOn design methodology and foundational reference implementation adopt a Zero Trust model to structure and guide the security design and implementation approach. + +### Design considerations + +- Continuous security testing to validate mitigations for key security vulnerabilities. + - *Is security testing performed as a part of automated CI/CD processes?* + - *If not, how often is specific security testing performed?* + - *Are test outcomes measured against a desired security posture and threat model?* + +- Security level across all lower-environments. + - *Do all environments within the development lifecycle have the same security posture as the production environment?* + +- Authentication and Authorization continuity in the event of a failure. + - *If authentication or authorization services are temporarily unavailable, will the application be able to continue to operate?* + +- Azure provides [Azure AD](https://docs.microsoft.com/azure/active-directory/fundamentals/active-directory-whatis) and [Azure AD B2C](https://docs.microsoft.com/azure/active-directory-b2c/overview) services for application user authorization. + +- Automated security compliance and remediation. + - *Can changes to key security settings be detected*? + - *Are responses to remediate non-compliant changes automated?* + +- Secret management and the risk associated with leakage. + - Secret scanning to detect secrets before code is committed to prevent any secret leaks through source code repositories. 
+  - *Is authentication to services possible without having credentials as a part of code?*
+
+- Securing the software supply chain
+  - *Is it possible to track Common Vulnerabilities and Exposures (CVEs) within utilized package dependencies?*
+  - *Is there an automated process for updating package dependencies?*
+
+- Data protection key lifecycles
+  - *Can service-managed keys be used for data integrity protection?*
+  - If customer-managed keys are required, a secure and reliable key lifecycle must be managed, which opens the application up to a variety of additional risks.
+
+- CI/CD tooling will require Azure AD service principals with sufficient subscription-level access to facilitate control plane access for Azure resource deployments to all considered environment subscriptions.
+  - When application resources are locked down within private networks, a private data-plane connectivity path is required so that CI/CD tooling can perform application-level deployments and maintenance.
+  - This introduces additional complexity and requires the deployment process to be sequenced through requisite private build agents.
+
+### Design recommendations
+
+- Use Azure Policy to enforce security and reliability configurations for all services, ensuring that any deviation is either remediated or prohibited by the control plane at configuration time, helping to mitigate threats associated with 'malicious admin' scenarios.
+
+- Use Azure AD Privileged Identity Management (PIM) within production subscriptions to revoke sustained control plane access to AlwaysOn production environments, significantly reducing the risk posed by 'malicious admin' scenarios through additional 'checks and balances'.
+
+- Use [Azure Managed Identities](https://docs.microsoft.com/azure/active-directory/managed-identities-azure-resources/overview) for all services that support the capability, since this facilitates the removal of credentials from application code and removes the operational burden of identity management for service-to-service communication.
+
+- Use Azure AD Role Based Access Control (RBAC) for data plane authorization with all services that support the capability.
+
+- Use first-party [Microsoft identity platform authentication libraries](https://docs.microsoft.com/azure/active-directory/develop/reference-v2-libraries) within application code to integrate with Azure AD.
+
+- Consider secure token caching since this will allow for a degraded but available experience in the event that Azure AD, or the chosen identity platform, is not available or is only partially available for application authorization.
+  - The impact depends on which capabilities of the identity service are unavailable.
+  - If the identity provider is down completely, no authentication or authorization can be performed, although this is rare.
+  - If the provider is unable to issue new access tokens, but still validates existing ones, the application and dependent services can operate without issues until their tokens expire.
+  - Token caching is typically handled automatically by authentication libraries ([such as MSAL](https://docs.microsoft.com/azure/active-directory/fundamentals/resilience-client-app?tabs=csharp)).
+
+- Use the principle of IaC and automated CI/CD pipelines to drive updates to all application components, including under failure circumstances.
+  - Ensure CI/CD tooling service connections are safeguarded as critical sensitive information, and are not directly available to any service team.
+  - Apply granular RBAC to production CD pipelines to mitigate 'malicious admin' risks.
+  - Consider the use of manual approval gates within production deployment pipelines to further mitigate 'malicious admin' risks and provide additional technical assurance for all production changes.
+  - Additional security gates may come at a trade-off in terms of agility and should be carefully evaluated, with consideration given to how agility can be maintained even with manual gates.
+
+- Define an appropriate security posture for all lower environments to ensure key vulnerabilities are mitigated.
+  - Do not apply the same security posture as production, particularly with regards to data exfiltration, unless regulatory requirements stipulate the need to do so, since this will significantly compromise developer agility.
+
+- Enable Microsoft Defender for Cloud (formerly known as Azure Security Center) for all AlwaysOn subscriptions.
+  - Use Azure Policy to enable Azure Security Center compliance monitoring.
+  - Enable Azure Defender for all services that support the capability in the AlwaysOn subscriptions.
+
+- Embrace [DevSecOps](https://docs.microsoft.com/azure/devops/devsecops/) and implement security testing within CI/CD pipelines.
+  - Test results should be measured against a compliant security posture to inform release approvals, be they automated or manual.
+  - Apply security testing as part of the CD production process for each release.
+  - If security testing each release jeopardizes operational agility, ensure a suitable security testing cadence is applied.
+
+- Limit public network access to the absolute minimum required for the application to fulfil its business purpose to reduce the external attack surface.
+  - Use [Azure Private Link](https://docs.microsoft.com/azure/private-link/private-endpoint-overview#private-link-resource) to establish [private endpoints](https://docs.microsoft.com/azure/private-link/private-endpoint-overview) for Azure resources which require secure network integration.
+  - Use a secure network path and [hosted private build agents](https://docs.microsoft.com/azure/devops/pipelines/agents/agents#install) for CI/CD tooling to deploy and configure Azure resources protected by Azure Private Link.
+  - [Microsoft-hosted agents](https://docs.microsoft.com/azure/devops/pipelines/agents/agents#microsoft-hosted-agents) will not be able to directly connect to network-integrated resources.
+
+- Enable [secret scanning](https://github.blog/2020-08-27-secure-at-every-step-putting-devsecops-into-practice-with-code-scanning/) and dependency scanning within the source code repository.
+
+## Threat modeling
+
+Threat modeling provides a risk-based approach to security design, using identified potential threats to develop appropriate security mitigations. There is ultimately a myriad of possible threats with varying probabilities of occurrence, and in many cases threats can chain in unexpected, unpredictable, and even chaotic ways. This complexity and uncertainty is precisely why traditional technology requirement-based security approaches are largely unsuitable for mission-critical cloud applications, and unfortunately means that the process of threat modeling for an AlwaysOn application is complex and unyielding.
+
+To help navigate these challenges, a layered defense-in-depth approach should be applied to define and implement compensating mitigations for modeled threats, considering the following defensive layers.
+
+1. The Azure platform with foundational security capabilities and controls.
+1. The AlwaysOn application architecture and security design.
+1. Security features (built-in, enabled, and deployable) applied to secure Azure resources.
+1. Application code and security logic.
+1. Operational processes and DevSecOps.
+
+> In an Enterprise-Scale context, the foundational platform provides an additional threat mitigation layer through the provision of centralized security capabilities within the Enterprise-Scale architecture.
+
+### Design considerations
+
+- [STRIDE](https://en.wikipedia.org/wiki/STRIDE_(security)) provides a lightweight risk framework for evaluating security threats across key threat vectors.
+  - Spoofed Identity: Impersonation of individuals with authority. For example, an attacker impersonating another user by leveraging their credentials.
+    - Identity
+    - Authentication
+  - Tampering with Input: Modification of input sent to the application, or the breach of trust boundaries to modify application code. For example, an attacker using SQL Injection to delete data in a database table.
+    - Data integrity
+    - Validation
+    - Blocklisting/allowlisting
+  - Repudiation of Action: Ability to refute actions already taken, and the ability of the application to gather evidence and drive accountability. For example, the deletion of critical data without the ability to trace it to a malicious admin.
+    - Audit/logging
+    - Signing
+  - Information Disclosure: Gaining access to restricted information. An example would be an attacker gaining access to a restricted file.
+    - Encryption
+    - Data exfiltration
+    - Man-in-the-middle attacks
+  - Denial of Service: Malicious application disruption to degrade user experience. For example, a DDoS botnet attack such as Slowloris.
+    - DDoS
+    - Botnets
+    - CDN and WAF capabilities
+  - Elevation of Privilege: Gaining privileged application access through authorization exploits. For example, an attacker manipulating a URL string to gain access to sensitive information.
+    - Remote code execution
+    - Authorization
+    - Isolation
+
+### Design recommendations
+
+- Allocate engineering budget within each sprint to evaluate potential new threats and implement mitigations.
+
+- Conscious effort should be applied to ensure security mitigations are captured within a common engineering criteria to drive consistency across all application service teams.
+
+- Start with service-by-service threat modeling and unify the model by consolidating the individual threat models at the application level.
+
+## Network intrusion protection
+
+Preventing unauthorized access to an AlwaysOn application and the encompassed data is vital to maintain availability and safeguard data integrity. This section will therefore explore the platform capabilities required to secure network access to an AlwaysOn application.
+
+### Design considerations
+
+- The zero trust model assumes a breached state and verifies each request as though it originates from an uncontrolled network.
+  - An advanced zero-trust network implementation employs micro-segmentation and distributed ingress/egress micro-perimeters.
+
+- Azure PaaS services such as AKS or Cosmos DB are typically accessed over public endpoints. However, the Azure platform provides capabilities to secure public endpoints or even make them entirely private.
+  - Azure Private Link/Private Endpoints provide dedicated access to an Azure PaaS resource using private IP addresses and private network connectivity.
+  - Virtual Network Service Endpoints provide service-level access from selected subnets to selected PaaS services.
+  - Virtual Network Injection provides dedicated private deployments for supported services, such as App Service through an App Service Environment.
+    - Management plane traffic still flows through public IP addresses.
+
+- For supported services, Azure Private Link using Azure Private Endpoints addresses [data exfiltration risks associated with Service Endpoints](https://docs.microsoft.com/azure/private-link/private-link-faq#what-is-the-difference-between-service-endpoints-and-private-endpoints-), such as a malicious admin writing data to an external resource.
+
+- When restricting network access to Azure PaaS services using Private Endpoints or Service Endpoints, a secure network channel will be required for deployment pipelines to access both the Azure control plane and data plane of Azure resources in order to deploy and manage the application.
+  - [Private self-hosted build agents](https://docs.microsoft.com/azure/devops/pipelines/agents/agents?view=azure-devops&tabs=browser#install) deployed onto the same private network as the Azure resources can be used as a proxy to execute CI/CD functions over a private connection. A separate virtual network should be used for build agents.
+    - Connectivity to the private build agents from CI/CD tooling is required.
+  - An alternative approach is to modify the firewall rules for the resource on-the-fly within the pipeline to allow a connection from an Azure DevOps agent public IP address, with the firewall rule subsequently removed after the task is completed.
+    - However, this approach is only applicable for a subset of Azure services. For example, this is not feasible for private AKS clusters.
+  - Jump servers can be used to perform developer and administrative tasks on the application services.
+
+- The completion of administration and maintenance tasks is a further scenario requiring connectivity to the data plane of Azure resources.
+
+- Service Connections with corresponding Azure AD service principals can be leveraged within Azure DevOps to apply RBAC through Azure AD.
+
+- Service Tags can be applied to Network Security Groups to facilitate connectivity with Azure PaaS services.
+
+- Application Security Groups do not span multiple virtual networks.
+
+- Packet capture in Azure Network Watcher is limited to a maximum period of five hours.
+
+### Design recommendations
+
+- To maximize network security, limit network access to the absolute minimum required for the application to fulfil its purpose.
+  - Use internal network paths, such as Azure Bastion, or automatically managed temporary firewall rules to enable direct data-plane infrastructure access.
+
+- When dealing with private build agents, never open an RDP or SSH port directly to the internet.
+  - Use [Azure Bastion](https://docs.microsoft.com/azure/bastion/bastion-overview) to provide secure access to Azure Virtual Machines and to perform administrative tasks on Azure PaaS over the Internet.
+
+- Use a DDoS standard protection plan to secure all public IP addresses within the application.
+
+- Use Azure Front Door with WAF policies to deliver and help protect global HTTP/S AlwaysOn applications that span multiple Azure regions.
+  - Use header ID validation to lock down public application endpoints so they only accept traffic originating from the Azure Front Door instance.
+
+- If additional in-line network security requirements, such as deep packet inspection or TLS inspection, mandate the use of Azure Firewall Premium or a Network Virtual Appliance (NVA), ensure it is configured for maximum availability and redundancy.
+
+- If packet capture requirements exist, use Network Watcher packet capture, despite the limited capture window.
+
+- Use Network Security Groups and Application Security Groups to micro-segment application traffic.
+  - Avoid using a security appliance to filter intra-application traffic flows.
+  - Consider the use of Azure Policy to ensure specific NSG rules are always associated with application subnets.
+
+- Enable NSG flow logs and feed them into Traffic Analytics to gain insights into internal and external traffic flows.
+
+- Use Azure Private Link/Private Endpoints, where available, to secure access to Azure PaaS services within the AlwaysOn application design, such as AKS, Cosmos DB, Azure Key Vault, Azure Container Registry, and Azure Storage.
+
+- If Private Endpoints are not available and data exfiltration risks are acceptable, use Virtual Network Service Endpoints to secure access to Azure PaaS services from within a virtual network.
+  - Don't enable virtual network service endpoints by default on all subnets as this will introduce significant data exfiltration channels.
+
+- For hybrid application scenarios, access Azure PaaS services from on-premises via ExpressRoute with private peering.
+
+> In an Enterprise-Scale context, the foundational platform will provide network connectivity to on-premises data centers using ExpressRoute configured with private peering.
+
+## Data integrity protection
+
+Encryption is a vital step toward ensuring data integrity and is ultimately one of the most important security capabilities which can be applied to mitigate a wide array of threats. This section will therefore provide key considerations and recommendations related to encryption and key management in order to safeguard data without compromising application reliability.
+
+### Design considerations
+
+- Azure Key Vault has transaction limits for keys and secrets, with throttling applied per vault within a certain period.
+
+- Azure Key Vault provides a security boundary since access permissions for keys, secrets, and certificates are applied at a vault level.
+  - Key Vault access policy assignments grant permissions separately to keys, secrets, or certificates.
+  - Granular [object-level permissions](https://docs.microsoft.com/azure/key-vault/general/rbac-guide?tabs=azure-cli#best-practices-for-individual-keys-secrets-and-certificates) to a specific key, secret, or certificate are now possible.
+
+- Role assignments incur latency, taking up to 10 minutes (600 seconds) for a role to be applied after a role assignment is changed.
+  - There is a limit of 2,000 Azure role assignments per subscription.
+
+- The hardware security modules (HSMs) underlying Azure Key Vault are FIPS 140-2 Level 2 compliant.
+  - A dedicated [Azure Key Vault managed HSM](https://docs.microsoft.com/azure/key-vault/managed-hsm/overview) is available for scenarios requiring FIPS 140-2 Level 3 compliance.
+
+- Azure Key Vault provides high availability and redundancy to help maintain availability and prevent data loss.
+
+- In the event of a region failover, it may take a few minutes for the Key Vault service to fail over.
+  - During a failover, Key Vault will be in a read-only mode, so it will not be possible to change key vault properties such as firewall configurations and settings.
+
+- If Private Link is used to connect to Azure Key Vault, it may take up to 20 minutes for the connection to be re-established in the event of a regional failover.
+
+- A backup creates a [point-in-time snapshot](https://docs.microsoft.com/azure/key-vault/general/backup?tabs=azure-cli#overview) of a secret, key, or certificate, as an encrypted blob which cannot be decrypted outside of Azure. To get usable data from the blob, it must be restored into a Key Vault within the same Azure subscription and Azure geography.
+  - Secrets may renew during a backup, causing a mismatch.
+
+- With service-managed keys, Azure will perform key management functions, such as rotation, thereby reducing the scope of application operations.
+
+- Regulatory controls may stipulate the use of customer-managed keys for service encryption functionality.
+
+- When traffic moves between Azure data centers, MACsec data-link layer encryption is used on the underlying network hardware to secure data in transit whenever it travels outside of physical boundaries not controlled by Microsoft or on behalf of Microsoft.
+
+### Design recommendations
+
+- Use Azure Key Vault to store all application secrets and certificates.
+  - Deploy a separate co-located Azure Key Vault with every regional deployment stamp.
+
+- Use service-managed keys for data protection where possible, removing the need to manage encryption keys and handle operational tasks such as key rotation.
+  - Only use customer-managed keys when there is a clear regulatory requirement to do so.
+
+- Use [Azure Key Vault](https://docs.microsoft.com/azure/key-vault/general/overview) as a secure repository for all secrets, certificates, and keys if additional encryption mechanisms or customer-managed keys need to be considered.
+  - Provision Azure Key Vault with the soft delete and purge policies enabled to allow retention protection for deleted objects.
+  - Use the HSM-backed Azure Key Vault SKU for application production environments.
+
+- Deploy a separate Azure Key Vault instance within each regional deployment stamp, providing fault isolation and performance benefits through localization, as well as navigating the scale limits imposed by a single key vault instance.
+  - Use a dedicated Azure Key Vault instance for AlwaysOn global resources.
+
+- Follow a least privilege model by limiting authorization to permanently delete secrets, keys, and certificates to specialized custom Azure AD roles.
+
+- Ensure secrets, keys, and certificates stored within Key Vault are backed up, providing an offline copy in the unlikely event Key Vault becomes unavailable.
+
+- Use Key Vault certificates to [manage certificate procurement and signing](https://docs.microsoft.com/azure/key-vault/certificates/certificate-scenarios#creating-a-certificate-with-a-ca-partnered-with-key-vault).
+
+- Establish an automated process for key and certificate rotation.
+  - Automate the certificate management and renewal process with public certificate authorities to ease administration.
+  - Set alerting and notifications to supplement automated certificate renewals.
+
+- Monitor key, certificate, and secret usage.
+  - Define [alerts](https://docs.microsoft.com/azure/key-vault/general/alert) for unexpected usage within Azure Monitor.
+
+- Enable the firewall and Private Endpoints on all application Azure Key Vault instances to control access; a sketch covering these recommendations follows below.
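+
+The Key Vault recommendations above can be codified as infrastructure-as-code. The following Terraform sketch shows a per-stamp Key Vault with the HSM-backed SKU, soft delete and purge protection, deny-by-default network access behind a Private Endpoint, and a secret carrying an explicit expiry for alerting. It is a minimal sketch assuming the `azurerm` provider (3.x); resource names, the resource group, and the subnet are illustrative placeholders.
+
+```hcl
+variable "resource_group_name"        { type = string }
+variable "location"                   { type = string }
+variable "private_endpoint_subnet_id" { type = string }
+
+data "azurerm_client_config" "current" {}
+
+# Per-stamp Key Vault: HSM-backed SKU, soft delete + purge protection,
+# Azure AD RBAC for data-plane authorization, and public access denied.
+resource "azurerm_key_vault" "stamp" {
+  name                       = "kv-alwayson-stamp-example" # placeholder
+  location                   = var.location
+  resource_group_name        = var.resource_group_name
+  tenant_id                  = data.azurerm_client_config.current.tenant_id
+  sku_name                   = "premium"
+  soft_delete_retention_days = 90
+  purge_protection_enabled   = true
+  enable_rbac_authorization  = true
+
+  network_acls {
+    default_action = "Deny"
+    bypass         = "AzureServices"
+  }
+}
+
+# Secrets carry an explicit expiry so Event Grid/Azure Monitor alerting can
+# flag upcoming expirations before they cause an outage.
+resource "azurerm_key_vault_secret" "example" {
+  name            = "example-secret"               # placeholder
+  value           = "rotated-via-release-pipeline" # placeholder
+  key_vault_id    = azurerm_key_vault.stamp.id
+  expiration_date = "2025-12-31T00:00:00Z"
+}
+
+# Private Endpoint so the vault is only reachable over the stamp's virtual network.
+resource "azurerm_private_endpoint" "kv" {
+  name                = "pe-kv-alwayson-stamp-example" # placeholder
+  location            = var.location
+  resource_group_name = var.resource_group_name
+  subnet_id           = var.private_endpoint_subnet_id
+
+  private_service_connection {
+    name                           = "kv-private-link"
+    private_connection_resource_id = azurerm_key_vault.stamp.id
+    is_manual_connection           = false
+    subresource_names              = ["vault"]
+  }
+}
+```
+
+Alerting on the configured expiration dates can then be wired up via the Key Vault Event Grid integration referenced above, keeping rotation within the standard release process.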
+ +## Policy driven governance + +Security conventions are ultimately only effective if consistently and holistically enforced across all application services and teams. Azure Policy provides a framework to enforce security and reliability baselines, ensuring continued compliance with a common engineering criteria for an AlwaysOn application. More specifically, Azure Policy forms a key part of the Azure Resource Manager (ARM) control plane, supplementing RBAC by restricting what actions authorized users can perform, and can be leveraged to enforce vital security and reliability conventions across utilized platform services. + +This section will therefore explore key considerations and recommendations surrounding the use of Azure Policy driven governance for an AlwaysOn application, ensuring security and reliability conventions are continuously enforced. + +### Design considerations + +- Azure Policy provides a mechanism to drive compliance by enforcing security and reliability conventions, such as the use of Private Endpoints or the use of Availability Zones. + +> In the context of an Enterprise Scale environment, the enforcement of centralized baseline policy assignments will likely be applied for Landing Zone management groups and subscriptions. + +- Azure Policy can be used to drive automated management activities, such as provisioning and configuration. + - Resource Provider registration. + - Validation and approval of individual Azure resource configurations. + +- Azure Policy assignment scope dictates coverage and the location of Azure Policy definitions informs the reusability of custom policies. + +- Azure Policy has [several limits](https://docs.microsoft.com/azure/azure-resource-manager/management/azure-subscription-service-limits#azure-policy-limits), such as the number of definitions at any particular scope. + +- It can take several minutes for the execution of Deploy If Not Exist (DINE) policies to occur. + +- Azure Policy provides a critical input for compliance reporting and security auditing. + +### Design recommendations + +- Map regulatory and compliance requirements to Azure Policy definitions. + - For example, if there are data residency requirements, a policy should be applied to restrict available deployment regions. + +- Define a common engineering criteria to capture secure and reliable configuration definitions for all utilized Azure services, ensuring this criteria is mapped to Azure Policy assignments to enforce compliance. + - For example, apply a Azure Policy to enforce the use of Availability Zones for all relevant services, ensuring reliable intra-region deployment configurations. + +> The AlwaysOn Foundational Reference Implementation contains a wide array of [security and reliability centric policies](https://github.com/Azure/AlwaysOn/blob/main/docs/reference-implementation/Policy-Driven-Governance.md) to define and enforce a sample common engineering criteria. + +- Monitor service configuration drift, relative to the common engineering criteria, using Azure Policy. + +> For AlwaysOn scenarios with multiple production subscriptions under a dedicated management group, prioritize assignments at the management group scope. + +- Use built-in policies where possible to minimize operational overhead of maintaining custom policy definitions. 
+
+- Where custom policy definitions are required, ensure definitions are deployed at a suitable management group scope to allow reuse across encompassed AlwaysOn environment subscriptions, enabling policy reuse across production and lower environments.
+  - When aligning the application roadmap with Azure roadmaps, leverage available Microsoft resources to explore whether critical custom definitions could be incorporated as built-in definitions.
+
+> When deployed within an Enterprise Scale context, consider deploying custom Azure Policy Definitions within the intermediate company root management group scope to enable reuse across all applications within the broader Azure estate.
+
+> When deployed in an Enterprise Scale environment, certain centralized security policies will be applied by default within higher management group scopes to enforce security compliance across the entire Azure estate. For example, Azure policies should be applied to automatically deploy software configurations through VM extensions and enforce a compliant baseline VM configuration as part of the Enterprise-Scale foundation.
+
+- Use Azure Policy to enforce a consistent tagging schema across the application.
+  - Identify required Azure tags and leverage the append policy mode to enforce usage.
+
+> If the AlwaysOn application is subscribed to Microsoft Mission-Critical Support, ensure that the applied tagging schema provides meaningful context to enrich the support experience with deep application understanding.
+
+- Export Azure AD activity logs to the AlwaysOn global Log Analytics Workspace.
+  - Ensure Azure activity logs are archived within the global Storage Account along with operational data for long-term retention.
+
+> In an Enterprise-Scale context, Azure AD activity logs will also be ingested into the centralized platform Log Analytics workspace. In this case, evaluate whether Azure AD logs are still required in the AlwaysOn global Log Analytics workspace.
+
+- Integrate security information and event management with Microsoft Defender for Cloud (formerly known as Azure Security Center).
+
+---
+
+|Previous Page|Next Page|
+|:--|:--|
+|[Networking and Connectivity](./Networking.md) | [Operational Procedures](./Operational-Procedures.md) |
+
+---
+
+|Design Methodology|
+|--|
+|[How to use the AlwaysOn Design Methodology](./README.md)
+|[AlwaysOn Design Principles](./Principles.md)
+|[AlwaysOn Design Areas](./Design-Areas.md)
+|[Application Design](./App-Design.md)
+|[Application Platform](./App-Platform.md)
+|[Data Platform](./Data-Platform.md)
+|[Health Modeling and Observability](./Health-Modeling.md)
+|[Deployment and Testing](./Deployment-Testing.md)
+|[Networking and Connectivity](./Networking.md)
+|[Security](./Security.md)
+|[Operational Procedures](./Operational-Procedures.md)
+
+---
+
+[AlwaysOn | Documentation Inventory](/docs/README.md)
diff --git a/docs/introduction/README.md b/docs/introduction/README.md
new file mode 100644
index 00000000..37b5a3b6
--- /dev/null
+++ b/docs/introduction/README.md
@@ -0,0 +1,47 @@
+# AlwaysOn Introduction
+
+Getting started on Microsoft Azure is now easier than ever; however, building mission-critical solutions that are highly reliable on the platform remains a challenge for three main reasons:
+
+- Designing a reliable application at scale is complex and requires extensive platform knowledge to select the right technologies and optimally configure them as an end-to-end solution.
+ +- Failure is inevitable in any complex distributed system, and the solution must therefore be architected to handle failures and correlated or cascading impact, which is a change in mindset for many developers and architects entering the cloud from an on-premises environment; reliability engineering is no longer an infrastructure topic, but should be a first-class concern within the application development process. + +- Operationalizing mission-critical applications requires a high degree of engineering rigor and maturity throughout the end-to-end engineering lifecycle as well as the ability to learn from failure. + +AlwaysOn strives to address the challenge of building mission-critical applications on Azure, leveraging lessons from numerous customer applications and first-party solutions, such as Xbox Live, to provide actionable and authoritative guidance that applies [Well-Architected](https://docs.microsoft.com/azure/architecture/framework/) best practices as the technical foundation for building and operating a highly reliable solution on Azure at-scale. + +More specifically, AlwaysOn provides a design methodology to guide readers through the design process of building a highly reliable cloud-native application on Azure, explaining key design considerations and requisite design decisions along with associated trade-offs. Additionally, AlwaysOn provides a gallery of fully functional production-ready reference implementations aligned to common industry scenarios, which can serve as a basis for further solution development. + +## What is AlwaysOn? + +AlwaysOn is an open source architectural approach to building highly-reliable cloud-native applications on Microsoft Azure for mission-critical applications. + +The 'AlwaysOn' project name refers to the highly-reliable and mission-critical nature of the architectural pattern it represents, where for given set of business requirements, an application should always be operational and available. + +Because of the focus on reliability, the AlwaysOn design methodology presented within this section of Azure Architecture Center adopts a globally distributed and highly scalable approach to building applications on Azure. However, this globally distributed approach to achieve high reliability comes at a development cost which may not be justifiable for every workload scenario. It is therefore strongly advocated that design decisions are driven by business requirements but informed by the opinionated guidance provided within this section. + +## What Problem Does AlwaysOn Solve? + +Building mission-critical applications on any hyper-scale cloud platform requires significant technical expertise and engineering investment to appropriately select and piece together services and features. This complexity often leads to a sub-optimal solution, particularly given the typical prioritization of business needs over platform fundamentals and the struggle of aligning with evolving best practices. + +The AlwaysOn project strives to address this complex consumption experience for Microsoft Azure, by applying [Well-Architected](https://docs.microsoft.com/azure/architecture/framework/) best practices to mission-critical application scenarios, providing prescriptive and opinionated technical guidance alongside streamlined consumption mechanisms for common industry patterns through reference implementations; turn-key solutions that are implicitly aligned with Well-Architected best practices. + +## What Does AlwaysOn Provide? + +1. 
+1. **Architectural Guidelines**: a cloud-native design methodology to guide readers through the architectural process of building a mature mission-critical application on Microsoft Azure, articulating key design considerations and requisite design decisions along with associated trade-offs.
+
+2. **Fully Functional Reference Implementations**: end-to-end reference implementations intended to provide a solution-oriented basis to showcase mission-critical application development on Microsoft Azure, leveraging Azure-native platform capabilities to maximize reliability and operational effectiveness.
+   - Design and implementation guidance to help readers understand and use the AlwaysOn design methodology in the context of a particular scenario.
+   - Production-ready technical artifacts including Infrastructure-as-Code (IaC) resources and Continuous-Integration/Continuous-Deployment (CI/CD) pipelines (GitHub and Azure DevOps) to deploy an AlwaysOn application with mature end-to-end operational wrappers.
+
+*Important Note: AlwaysOn will continue to develop additional reference implementations for common industry scenarios, with several implementations currently under development.*
+
+---
+
+|Previous Page|Next Page|
+|--|--|
+|[Home](/README.md)|[How to use the AlwaysOn Design Methodology](../design-methodology/README.md)
+
+---
+
+[AlwaysOn | Documentation Inventory](/docs/README.md)
diff --git a/docs/media/AlwaysOn-SLO.gif b/docs/media/AlwaysOn-SLO.gif
new file mode 100644
index 00000000..a141a105
Binary files /dev/null and b/docs/media/AlwaysOn-SLO.gif differ
diff --git a/docs/media/alwayson-aiops-methodology.png b/docs/media/alwayson-aiops-methodology.png
new file mode 100644
index 00000000..a200bfee
Binary files /dev/null and b/docs/media/alwayson-aiops-methodology.png differ
diff --git a/docs/media/alwayson-architecture-foundational-online.png b/docs/media/alwayson-architecture-foundational-online.png
new file mode 100644
index 00000000..333fa7ac
Binary files /dev/null and b/docs/media/alwayson-architecture-foundational-online.png differ
diff --git a/docs/media/alwayson-asynchronous-communication.png b/docs/media/alwayson-asynchronous-communication.png
new file mode 100644
index 00000000..1d54b1f5
Binary files /dev/null and b/docs/media/alwayson-asynchronous-communication.png differ
diff --git a/docs/media/alwayson-design-areas.png b/docs/media/alwayson-design-areas.png
new file mode 100644
index 00000000..7560bada
Binary files /dev/null and b/docs/media/alwayson-design-areas.png differ
diff --git a/docs/media/alwayson-design-principles-alt.png b/docs/media/alwayson-design-principles-alt.png
new file mode 100644
index 00000000..4252a510
Binary files /dev/null and b/docs/media/alwayson-design-principles-alt.png differ
diff --git a/docs/media/alwayson-design-principles.png b/docs/media/alwayson-design-principles.png
new file mode 100644
index 00000000..0147d626
Binary files /dev/null and b/docs/media/alwayson-design-principles.png differ
diff --git a/docs/media/alwayson-example-fault-states.png b/docs/media/alwayson-example-fault-states.png
new file mode 100644
index 00000000..e38fde9c
Binary files /dev/null and b/docs/media/alwayson-example-fault-states.png differ
diff --git a/docs/media/alwayson-example-health-definitions.png b/docs/media/alwayson-example-health-definitions.png
new file mode 100644
index 00000000..42cac99c
Binary files /dev/null and b/docs/media/alwayson-example-health-definitions.png differ
diff --git a/docs/media/alwayson-global-distribution.gif b/docs/media/alwayson-global-distribution.gif
new file mode 100644
index 00000000..5f761260
Binary files /dev/null and b/docs/media/alwayson-global-distribution.gif differ
diff --git a/docs/media/alwayson-global-routing.gif b/docs/media/alwayson-global-routing.gif
new file mode 100644
index 00000000..72242c7a
Binary files /dev/null and b/docs/media/alwayson-global-routing.gif differ
diff --git a/docs/media/alwayson-health-data-collection.png b/docs/media/alwayson-health-data-collection.png
new file mode 100644
index 00000000..86fb6bc3
Binary files /dev/null and b/docs/media/alwayson-health-data-collection.png differ
diff --git a/docs/media/alwayson-high-level-architecture.png b/docs/media/alwayson-high-level-architecture.png
new file mode 100644
index 00000000..40607603
Binary files /dev/null and b/docs/media/alwayson-high-level-architecture.png differ
diff --git a/docs/media/alwayson-landing-zones.gif b/docs/media/alwayson-landing-zones.gif
new file mode 100644
index 00000000..878195cf
Binary files /dev/null and b/docs/media/alwayson-landing-zones.gif differ
diff --git a/docs/media/alwayson-reliability-tiers.png b/docs/media/alwayson-reliability-tiers.png
new file mode 100644
index 00000000..7f7f0880
Binary files /dev/null and b/docs/media/alwayson-reliability-tiers.png differ
diff --git a/docs/media/alwayson-repo-structure.png b/docs/media/alwayson-repo-structure.png
new file mode 100644
index 00000000..345596f5
Binary files /dev/null and b/docs/media/alwayson-repo-structure.png differ
diff --git a/docs/media/alwayson-scale-units.png b/docs/media/alwayson-scale-units.png
new file mode 100644
index 00000000..e8398a51
Binary files /dev/null and b/docs/media/alwayson-scale-units.png differ
diff --git a/docs/media/alwayson-subscription-organization.png b/docs/media/alwayson-subscription-organization.png
new file mode 100644
index 00000000..dfc0cee8
Binary files /dev/null and b/docs/media/alwayson-subscription-organization.png differ
diff --git a/docs/media/alwayson-subscription-scale.gif b/docs/media/alwayson-subscription-scale.gif
new file mode 100644
index 00000000..84a61c69
Binary files /dev/null and b/docs/media/alwayson-subscription-scale.gif differ
diff --git a/docs/media/alwayson-zero-downtime-pipeline.png b/docs/media/alwayson-zero-downtime-pipeline.png
new file mode 100644
index 00000000..7354f666
Binary files /dev/null and b/docs/media/alwayson-zero-downtime-pipeline.png differ
diff --git a/icon.png b/icon.png
new file mode 100644
index 00000000..4ef29fa9
Binary files /dev/null and b/icon.png differ