Skip to content
This repository has been archived by the owner on Oct 12, 2023. It is now read-only.

Commit

Permalink
Feature: Better configuration (#34)
Browse files Browse the repository at this point in the history
  • Loading branch information
timotheeguerin authored Mar 6, 2019
1 parent 3c5a89c commit bc81529
Show file tree
Hide file tree
Showing 24 changed files with 533 additions and 125 deletions.
17 changes: 14 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -40,13 +40,21 @@ For example:
Add this to your start task

```bash
# For version 1.x of batch insights
/bin/bash -c 'wget -O - https://raw.githubusercontent.com/Azure/batch-insights/master/scripts/1.x/run-linux.sh | bash'

# For latest version of batch insights
/bin/bash -c 'wget -O - https://raw.githubusercontent.com/Azure/batch-insights/master/scripts/run-linux.sh | bash'
```

### Windows

Add this to your start task
```powershell
# For version 1.x of batch insights
cmd /c @"%SystemRoot%\System32\WindowsPowerShell\v1.0\powershell.exe" -NoProfile -InputFormat None -ExecutionPolicy Bypass -Command "iex ((New-Object System.Net.WebClient).DownloadString('https://raw.githubusercontent.com/Azure/batch-insights/master/scripts/1.x/run-windows.ps1'))"
# For latest version of batch insights
cmd /c @"%SystemRoot%\System32\WindowsPowerShell\v1.0\powershell.exe" -NoProfile -InputFormat None -ExecutionPolicy Bypass -Command "iex ((New-Object System.Net.WebClient).DownloadString('https://raw.githubusercontent.com/Azure/batch-insights/master/scripts/run-windows.ps1'))"
```
Expand Down Expand Up @@ -81,12 +89,15 @@ You can add this to your main script:
pip install psutil python-dateutil applicationinsights==0.11.3
wget --no-cache https://raw.githubusercontent.com/Azure/batch-insights/master/nodestats.py
python --version
python nodestats.py > node-stats.log 2>&1 &
python nodestats.py > batch-insights.log 2>&1 &
```

## Monitoring processes
## Configuration

[See available configuration options](./docs/configuration.md)

If you want to emit data for processes, you can set the `AZ_BATCH_MONITOR_PROCESSES` environment variable. It can be set to a comma-separated list of process names that should be monitored (note that on Windows the .exe suffix must be included: `notepad.exe,explorer.exe`).
You can set the `AZ_BATCH_INSIGHTS_ARGS` environemnt variable to pass parameters to the tool.
e.g. `AZ_BATCH_INSIGHTS_ARGS` > `--disable networkIO --aggregation 5`

## View data

Expand Down
2 changes: 1 addition & 1 deletion centos.sh
Original file line number Diff line number Diff line change
Expand Up @@ -9,4 +9,4 @@ pip --version
pip install psutil python-dateutil applicationinsights==0.11.3

wget --no-cache https://raw.githubusercontent.com/Azure/batch-insights/master/nodestats.py
python nodestats.py > node-stats.log 2>&1 &
python nodestats.py > batch-insights.log 2>&1 &
30 changes: 30 additions & 0 deletions docs/configuration.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
# BatchInsights Configuration

Batch Insights provides various configuration option(Version `1.2.0` and above).


#### `--poolID <value>`
Pool ID. Override pool ID provided by the `AZ_BATCH_POOL_ID` environment variable
#### `--nodeID <value>`
Node ID. Override node ID provided by the `AZ_BATCH_NODE_ID` environment variable
#### `--instKey <value>`
Instrumentation key. Application Insights instrumentation key to emit the metrics
#### `--disable <value>`
Comma separated list of metrics to disable. e.g. `--disable networkIO,diskUsage`

Available metrics names:
- diskIO
- diskUsage
- networkIO
- memory
- CPU
- GPU

* `--aggregation <value>` Number in minutes to aggregate the data locally. Defaults to 1 minute

Example: `--agregation 5` to aggregate for 5 minutes

#### `--processes <value>`
Comma separated list of processes to monitor.

Example: `--processes notepad.exe,explorer.exe`
3 changes: 2 additions & 1 deletion go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -6,12 +6,13 @@ require (
github.com/StackExchange/wmi v0.0.0-20180725035823-b12b22c5341f
github.com/dustin/go-humanize v0.0.0-20180713052910-9f541cc9db5d
github.com/go-ole/go-ole v1.2.1 // indirect
github.com/mattn/go-colorable v0.1.1
github.com/mindprince/gonvml v0.0.0-20180514031326-b364b296c732
github.com/mxpv/nvml-go v0.0.0-20180227003457-e07f8c26812d
github.com/pkg/errors v0.8.0
github.com/satori/go.uuid v1.2.0 // indirect
github.com/shirou/gopsutil v2.18.12+incompatible
github.com/shirou/w32 v0.0.0-20160930032740-bb4de0191aa4 // indirect
github.com/sirupsen/logrus v1.3.0
github.com/stretchr/testify v1.3.0
golang.org/x/sys v0.0.0-20180907202204-917fdcba135d // indirect
)
16 changes: 16 additions & 0 deletions go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -9,10 +9,18 @@ github.com/StackExchange/wmi v0.0.0-20180725035823-b12b22c5341f h1:5ZfJxyXo8KyX8
github.com/StackExchange/wmi v0.0.0-20180725035823-b12b22c5341f/go.mod h1:3eOhrUMpNV+6aFIbp5/iudMxNCF27Vw2OZgy4xEx0Fg=
github.com/davecgh/go-spew v1.1.0 h1:ZDRjVQ15GmhC3fiQ8ni8+OwkZQO4DARzQgrnXU1Liz8=
github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c=
github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
github.com/dustin/go-humanize v0.0.0-20180713052910-9f541cc9db5d h1:lDrio3iIdNb0Gw9CgH7cQF+iuB5mOOjdJ9ERNJCBgb4=
github.com/dustin/go-humanize v0.0.0-20180713052910-9f541cc9db5d/go.mod h1:HtrtbFcZ19U5GC7JDqmcUSB87Iq5E25KnS6fMYU6eOk=
github.com/go-ole/go-ole v1.2.1 h1:2lOsA72HgjxAuMlKpFiCbHTvu44PIVkZ5hqm3RSdI/E=
github.com/go-ole/go-ole v1.2.1/go.mod h1:7FAglXiTm7HKlQRDeOQ6ZNUHidzCWXuZWq/1dTyBNF8=
github.com/konsorten/go-windows-terminal-sequences v1.0.1 h1:mweAR1A6xJ3oS2pRaGiHgQ4OO8tzTaLawm8vnODuwDk=
github.com/konsorten/go-windows-terminal-sequences v1.0.1/go.mod h1:T0+1ngSBFLxvqU3pZ+m/2kptfBszLMUkC4ZK/EgS/cQ=
github.com/mattn/go-colorable v0.1.1 h1:G1f5SKeVxmagw/IyvzvtZE4Gybcc4Tr1tf7I8z0XgOg=
github.com/mattn/go-colorable v0.1.1/go.mod h1:FuOcm+DKB9mbwrcAfNl7/TZVBZ6rcnceauSikq3lYCQ=
github.com/mattn/go-isatty v0.0.5 h1:tHXDdz1cpzGaovsTB+TVB8q90WEokoVmfMqoVcrLUgw=
github.com/mattn/go-isatty v0.0.5/go.mod h1:Iq45c/XA43vh69/j3iqttzPXn0bhXyGjM0Hdxcsrc5s=
github.com/mindprince/gonvml v0.0.0-20180514031326-b364b296c732 h1:Dl/79RxNt1t6AYIMhKzyofqooXgw6+LZtAN4EIXRLCk=
github.com/mindprince/gonvml v0.0.0-20180514031326-b364b296c732/go.mod h1:2eu9pRWp8mo84xCg6KswZ+USQHjwgRhNp06sozOdsTY=
github.com/mxpv/nvml-go v0.0.0-20180227003457-e07f8c26812d h1:lQo1zUtnGr52K2a+Ll3DNDoukmPeuHK11baUNGRDSt0=
Expand All @@ -27,8 +35,16 @@ github.com/shirou/gopsutil v2.18.12+incompatible h1:1eaJvGomDnH74/5cF4CTmTbLHAri
github.com/shirou/gopsutil v2.18.12+incompatible/go.mod h1:5b4v6he4MtMOwMlS0TUMTu2PcXUg8+E1lC7eC3UO/RA=
github.com/shirou/w32 v0.0.0-20160930032740-bb4de0191aa4 h1:udFKJ0aHUL60LboW/A+DfgoHVedieIzIXE8uylPue0U=
github.com/shirou/w32 v0.0.0-20160930032740-bb4de0191aa4/go.mod h1:qsXQc7+bwAM3Q1u/4XEfrquwF8Lw7D7y5cD8CuHnfIc=
github.com/sirupsen/logrus v1.3.0 h1:hI/7Q+DtNZ2kINb6qt/lS+IyXnHQe9e90POfeewL/ME=
github.com/sirupsen/logrus v1.3.0/go.mod h1:LxeOpSwHxABJmUn/MG1IvRgCAasNZTLOkJPxbbu5VWo=
github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME=
github.com/stretchr/objx v0.1.1/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME=
github.com/stretchr/testify v1.2.2/go.mod h1:a8OnRcib4nhh0OaRAV+Yts87kKdq0PP7pXfy6kDkUVs=
github.com/stretchr/testify v1.3.0 h1:TivCn/peBQ7UY8ooIcPgZFpTNSz0Q2U6UrFlUfqbe0Q=
github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI=
golang.org/x/crypto v0.0.0-20180904163835-0709b304e793/go.mod h1:6SG95UA2DQfeDnfUPMdvaQW0Q7yPrPDi9nlGo2tz2b4=
golang.org/x/sys v0.0.0-20180905080454-ebe1bf3edb33/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
golang.org/x/sys v0.0.0-20180907202204-917fdcba135d h1:kWn1hlsqeUrk6JsLJO0ZFyz9bMg8u85voZlIuc68ZU4=
golang.org/x/sys v0.0.0-20180907202204-917fdcba135d/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
golang.org/x/sys v0.0.0-20190222072716-a9d3bda3a223 h1:DH4skfRX4EBpamg7iV4ZlCpblAHI6s6TDM39bFZumv8=
golang.org/x/sys v0.0.0-20190222072716-a9d3bda3a223/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
104 changes: 81 additions & 23 deletions main.go
Original file line number Diff line number Diff line change
@@ -1,48 +1,106 @@
package main

import (
"flag"
"fmt"
"github.com/Azure/batch-insights/pkg"
log "github.com/sirupsen/logrus"
"os"
"strings"
)

func parseListArgs(value string) []string {
names := strings.Split(value, ",")
for i := range names {
names[i] = strings.TrimSpace(names[i])
}
return names
}

func getenv(key string) *string {
value := os.Getenv(key)
if len(value) == 0 {
return nil
}
return &value
}

func initLogger() {
log.SetFormatter(&log.TextFormatter{
FullTimestamp: false,
DisableTimestamp: true,
ForceColors: true,
})
}

func main() {
var appInsightsKey = os.Getenv("APP_INSIGHTS_INSTRUMENTATION_KEY")
var poolId = os.Getenv("AZ_BATCH_POOL_ID")
var nodeId = os.Getenv("AZ_BATCH_NODE_ID")
var processNamesStr = os.Getenv("AZ_BATCH_MONITOR_PROCESSES")
initLogger()
disableArg := flag.String("disable", "", "List of metrics to disable")
processArg := flag.String("processes", "", "List of process name to watch")

if len(os.Args) > 2 {
poolId = os.Args[1]
nodeId = os.Args[2]
envConfig := batchinsights.UserConfig{
InstrumentationKey: getenv("APP_INSIGHTS_INSTRUMENTATION_KEY"),
PoolID: getenv("AZ_BATCH_POOL_ID"),
NodeID: getenv("AZ_BATCH_NODE_ID"),
}
processEnv := getenv("AZ_BATCH_MONITOR_PROCESSES")
if processEnv != nil {
envConfig.Processes = parseListArgs(*processEnv)
}
argsConfig := batchinsights.UserConfig{
PoolID: flag.String("poolID", "", "Batch pool ID"),
NodeID: flag.String("nodeID", "", "Batch node ID"),
Aggregation: flag.Int("aggregation", 1, "Aggregation in minutes"),
InstrumentationKey: flag.String("instKey", "", "Application Insights instrumentation KEY"),
}

version := flag.Bool("version", false, "Print current batch insights version")

if len(os.Args) > 3 {
appInsightsKey = os.Args[3]
flag.Parse()

if *version {
fmt.Println(batchinsights.Version)
os.Exit(0)
}

if len(os.Args) > 4 {
processNamesStr = os.Args[4]
if processArg != nil {
argsConfig.Processes = parseListArgs(*processArg)
}
if disableArg != nil {
argsConfig.Disable = parseListArgs(*disableArg)
}

processNames := strings.Split(processNamesStr, ",")
for i := range processNames {
processNames[i] = strings.TrimSpace(processNames[i])
config := envConfig.Merge(argsConfig)

positionalArgs := flag.Args()
if len(positionalArgs) > 0 {
log.Warn("Using postional arguments for Node ID, PoolID, KEY and Process names is deprecated. Use --poolID, --nodeID, --instKey, --process")
log.Warn("It will be removed in 2.0.0")
config.PoolID = &positionalArgs[0]
}

batchinsights.PrintSystemInfo()
fmt.Printf(" Pool ID: %s\n", poolId)
fmt.Printf(" Node ID: %s\n", nodeId)
if len(positionalArgs) > 1 {
config.NodeID = &positionalArgs[1]
}

hiddenKey := "-"
if appInsightsKey != "" {
hiddenKey = "xxxxx"
if len(positionalArgs) > 2 {
config.InstrumentationKey = &positionalArgs[2]
}

fmt.Printf(" Instrumentation Key: %s\n", hiddenKey)
if len(positionalArgs) > 3 {
config.Processes = parseListArgs(positionalArgs[3])
}

config.Print()

fmt.Printf(" Monitoring processes: %s\n", strings.Join(processNames, ", "))
computedConfig, err := batchinsights.ValidateAndBuildConfig(config)

batchinsights.ListenForStats(poolId, nodeId, appInsightsKey, processNames)
if err != nil {
log.Error("Invalid config", err)
os.Exit(2)
}

computedConfig.Print()
batchinsights.PrintSystemInfo()
batchinsights.ListenForStats(computedConfig)
}
59 changes: 31 additions & 28 deletions pkg/appinsights.go
Original file line number Diff line number Diff line change
Expand Up @@ -9,22 +9,24 @@ import (
"github.com/Microsoft/ApplicationInsights-Go/appinsights"
)

// AppInsightsService service handling the aggregation and upload of metrics
type AppInsightsService struct {
client appinsights.TelemetryClient
aggregation time.Duration
aggregateCollectionStart *time.Time
aggregates map[string]*appinsights.AggregateMetricTelemetry
}

const AGGREGATE_TIME = time.Duration(1) * time.Minute

func NewAppInsightsService(instrumentationKey string, poolId string, nodeId string) AppInsightsService {
// NewAppInsightsService create a new instance of the AppInsightsService
func NewAppInsightsService(instrumentationKey string, poolID string, nodeID string, aggregation time.Duration) AppInsightsService {
client := appinsights.NewTelemetryClient(instrumentationKey)
client.Context().Tags.Cloud().SetRole(poolId)
client.Context().Tags.Cloud().SetRoleInstance(nodeId)
client.Context().Tags.Cloud().SetRole(poolID)
client.Context().Tags.Cloud().SetRoleInstance(nodeID)

return AppInsightsService{
client: client,
aggregates: make(map[string]*appinsights.AggregateMetricTelemetry),
client: client,
aggregation: aggregation,
aggregates: make(map[string]*appinsights.AggregateMetricTelemetry),
}
}

Expand All @@ -34,9 +36,8 @@ func (service *AppInsightsService) track(metric *appinsights.MetricTelemetry) {
if service.aggregateCollectionStart != nil {
elapsed := t.Sub(*service.aggregateCollectionStart)

if elapsed > AGGREGATE_TIME {
for k, aggregate := range service.aggregates {
fmt.Printf(" - %s: %f\n", k, aggregate.Value)
if elapsed > service.aggregation {
for _, aggregate := range service.aggregates {
service.client.Track(aggregate)
}
service.aggregates = make(map[string]*appinsights.AggregateMetricTelemetry)
Expand All @@ -46,7 +47,7 @@ func (service *AppInsightsService) track(metric *appinsights.MetricTelemetry) {
service.aggregateCollectionStart = &t
}

id := GetMetricId(metric)
id := GetMetricID(metric)

aggregate, ok := service.aggregates[id]
if !ok {
Expand All @@ -57,17 +58,18 @@ func (service *AppInsightsService) track(metric *appinsights.MetricTelemetry) {
aggregate.AddData([]float64{metric.Value})
}

// UploadStats will register the given stats for upload. They will be first aggregated during the given aggregation interval
func (service *AppInsightsService) UploadStats(stats NodeStats) {
client := service.client

for cpuN, percent := range stats.cpuPercents {
for cpuN, percent := range stats.CPUPercents {
metric := appinsights.NewMetricTelemetry("Cpu usage", percent)
metric.Properties["CPU #"] = strconv.Itoa(cpuN)
metric.Properties["Core count"] = strconv.Itoa(len(stats.cpuPercents))
metric.Properties["Core count"] = strconv.Itoa(len(stats.CPUPercents))
service.track(metric)
}

for _, usage := range stats.diskUsage {
for _, usage := range stats.DiskUsage {
usedMetric := appinsights.NewMetricTelemetry("Disk usage", float64(usage.Used))
usedMetric.Properties["Disk"] = usage.Path
service.track(usedMetric)
Expand All @@ -76,22 +78,22 @@ func (service *AppInsightsService) UploadStats(stats NodeStats) {
service.track(freeMetric)
}

if stats.memory != nil {
service.track(appinsights.NewMetricTelemetry("Memory used", float64(stats.memory.Used)))
service.track(appinsights.NewMetricTelemetry("Memory available", float64(stats.memory.Total-stats.memory.Used)))
if stats.Memory != nil {
service.track(appinsights.NewMetricTelemetry("Memory used", float64(stats.Memory.Used)))
service.track(appinsights.NewMetricTelemetry("Memory available", float64(stats.Memory.Total-stats.Memory.Used)))
}
if stats.diskIO != nil {
service.track(appinsights.NewMetricTelemetry("Disk read", float64(stats.diskIO.ReadBps)))
service.track(appinsights.NewMetricTelemetry("Disk write", float64(stats.diskIO.WriteBps)))
if stats.DiskIO != nil {
service.track(appinsights.NewMetricTelemetry("Disk read", float64(stats.DiskIO.ReadBps)))
service.track(appinsights.NewMetricTelemetry("Disk write", float64(stats.DiskIO.WriteBps)))
}

if stats.netIO != nil {
service.track(appinsights.NewMetricTelemetry("Network read", float64(stats.netIO.ReadBps)))
service.track(appinsights.NewMetricTelemetry("Network write", float64(stats.netIO.WriteBps)))
if stats.NetIO != nil {
service.track(appinsights.NewMetricTelemetry("Network read", float64(stats.NetIO.ReadBps)))
service.track(appinsights.NewMetricTelemetry("Network write", float64(stats.NetIO.WriteBps)))
}

if len(stats.gpus) > 0 {
for cpuN, usage := range stats.gpus {
if len(stats.Gpus) > 0 {
for cpuN, usage := range stats.Gpus {
gpuMetric := appinsights.NewMetricTelemetry("Gpu usage", usage.GPU)
gpuMetric.Properties["GPU #"] = strconv.Itoa(cpuN)
service.track(gpuMetric)
Expand All @@ -102,8 +104,8 @@ func (service *AppInsightsService) UploadStats(stats NodeStats) {
}
}

if len(stats.processes) > 0 {
for _, processStats := range stats.processes {
if len(stats.Processes) > 0 {
for _, processStats := range stats.Processes {

pidStr := strconv.FormatInt(int64(processStats.pid), 10)

Expand All @@ -127,7 +129,8 @@ func (service *AppInsightsService) UploadStats(stats NodeStats) {
client.Channel().Flush()
}

func GetMetricId(metric *appinsights.MetricTelemetry) string {
// GetMetricID compute an group id for this metric so it can be aggregated
func GetMetricID(metric *appinsights.MetricTelemetry) string {
groupBy := createKeyValuePairs(metric.Properties)
return fmt.Sprintf("%s/%s", metric.Name, groupBy)
}
Expand Down
Loading

0 comments on commit bc81529

Please sign in to comment.