Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

profiler: goroutineswait saw-stop mechanism #942

Merged
merged 8 commits into from
Jun 30, 2021
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
62 changes: 36 additions & 26 deletions profiler/options.go
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ import (
"os"
"path/filepath"
"runtime"
"strconv"
"strings"
"time"
"unicode"
Expand Down Expand Up @@ -82,21 +83,22 @@ type config struct {
agentless bool
// targetURL is the upload destination URL. It will be set by the profiler on start to either apiURL or agentURL
// based on the other options.
targetURL string
apiURL string // apiURL is the Datadog intake API URL
agentURL string // agentURL is the Datadog agent profiling URL
service, env string
hostname string
statsd StatsdClient
httpClient *http.Client
tags []string
types map[ProfileType]struct{}
period time.Duration
cpuDuration time.Duration
uploadTimeout time.Duration
mutexFraction int
blockRate int
outputDir string
targetURL string
apiURL string // apiURL is the Datadog intake API URL
agentURL string // agentURL is the Datadog agent profiling URL
service, env string
hostname string
statsd StatsdClient
httpClient *http.Client
tags []string
types map[ProfileType]struct{}
period time.Duration
cpuDuration time.Duration
uploadTimeout time.Duration
maxGoroutinesWait int
mutexFraction int
blockRate int
outputDir string
}

func urlForSite(site string) (string, error) {
Expand Down Expand Up @@ -127,17 +129,18 @@ func (c *config) addProfileType(t ProfileType) {

func defaultConfig() (*config, error) {
c := config{
env: defaultEnv,
apiURL: defaultAPIURL,
service: filepath.Base(os.Args[0]),
statsd: &statsd.NoOpClient{},
httpClient: defaultClient,
period: DefaultPeriod,
cpuDuration: DefaultDuration,
blockRate: DefaultBlockRate,
mutexFraction: DefaultMutexFraction,
uploadTimeout: DefaultUploadTimeout,
tags: []string{fmt.Sprintf("pid:%d", os.Getpid())},
env: defaultEnv,
apiURL: defaultAPIURL,
service: filepath.Base(os.Args[0]),
statsd: &statsd.NoOpClient{},
httpClient: defaultClient,
period: DefaultPeriod,
cpuDuration: DefaultDuration,
blockRate: DefaultBlockRate,
mutexFraction: DefaultMutexFraction,
uploadTimeout: DefaultUploadTimeout,
maxGoroutinesWait: 1000, // arbitrary value, should limit STW to ~30ms
tags: []string{fmt.Sprintf("pid:%d", os.Getpid())},
}
for _, t := range defaultProfileTypes {
c.addProfileType(t)
Expand Down Expand Up @@ -206,6 +209,13 @@ func defaultConfig() (*config, error) {
if v := os.Getenv("DD_PROFILING_OUTPUT_DIR"); v != "" {
withOutputDir(v)(&c)
}
if v := os.Getenv("DD_PROFILING_WAIT_PROFILE_MAX_GOROUTINES"); v != "" {
n, err := strconv.Atoi(v)
if err != nil {
return nil, fmt.Errorf("DD_PROFILING_WAIT_PROFILE_MAX_GOROUTINES: %s", err)
}
c.maxGoroutinesWait = n
}
return &c, nil
}

Expand Down
5 changes: 5 additions & 0 deletions profiler/profile.go
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ import (
"errors"
"fmt"
"io"
"runtime"
"runtime/pprof"
"time"

Expand Down Expand Up @@ -231,6 +232,10 @@ func goroutineProfile(cfg *config) (*profile, error) {
}

func goroutineWaitProfile(cfg *config) (*profile, error) {
if n := runtime.NumGoroutine(); n > cfg.maxGoroutinesWait {
return nil, fmt.Errorf("skipping goroutines wait profile: %d goroutines exceeds DD_PROFILING_WAIT_PROFILE_MAX_GOROUTINES limit of %d", n, cfg.maxGoroutinesWait)
}

var (
text = &bytes.Buffer{}
pprof = &bytes.Buffer{}
Expand Down
48 changes: 48 additions & 0 deletions profiler/profile_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,11 @@ package profiler

import (
"bytes"
"fmt"
"io"
"io/ioutil"
"os"
"strconv"
"testing"
"time"

Expand Down Expand Up @@ -166,6 +169,51 @@ main.main()
"main.indirectShortSleepLoop2",
})
})

t.Run("goroutineswaitLimit", func(t *testing.T) {
// spawGoroutines spawns n goroutines and then returns a func to stop them.
spawnGoroutines := func(n int) func() {
launched := make(chan struct{})
stopped := make(chan struct{})
for i := 0; i < n; i++ {
go func() {
launched <- struct{}{}
stopped <- struct{}{}
}()
<-launched
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What is the point of launched? It looks to me like if you removed it, this would function the same way, since the goroutines would all block on stopped.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The intend behind launched is to ensure that the goroutines have started executing before spawnGoroutines() returns. Without launched this invariant would be subject to a race condition.

In practice launched could be removed without repercussions because the go runtime immediately registers newly created goroutines as _Grunnable. That being said, this behavior seems to be an implementation detail that isn't covered by the language specification. I.e. a future version of Go might decide to put the goroutines into a new _Glaunchable state that isn't included in runtime.NumGoroutines(). So launched could prevent our test from breaking in the future.

That being said, I don't expect the runtime details of this to change anytime soon, so I don't feel strongly about it. I can remove the launched if you like.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks for the explanation. That makes sense and seems to be a good solution.

My only comment is that it does not seem obvious why this code is like it is unless you know the go runtime internals. It might be good to make a small comment here about launched and how we need the goroutines to be running before spawnGoroutines returns.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sure, PTAL at 2e95053 to see if this does a good enough job in clarifying the intend.

}
return func() {
for i := 0; i < n; i++ {
<-stopped
}
}
}

goroutines := 100
limit := 10

stop := spawnGoroutines(goroutines)
defer stop()
envVar := "DD_PROFILING_WAIT_PROFILE_MAX_GOROUTINES"
oldVal := os.Getenv(envVar)
os.Setenv(envVar, strconv.Itoa(limit))
defer os.Setenv(envVar, oldVal)

defer func(old func(_ string, _ io.Writer, _ int) error) { lookupProfile = old }(lookupProfile)
lookupProfile = func(_ string, w io.Writer, _ int) error {
_, err := w.Write([]byte(""))
return err
}

p, err := unstartedProfiler()
require.NoError(t, err)
_, err = p.runProfile(expGoroutineWaitProfile)
var errRoutines, errLimit int
msg := "skipping goroutines wait profile: %d goroutines exceeds DD_PROFILING_WAIT_PROFILE_MAX_GOROUTINES limit of %d"
fmt.Sscanf(err.Error(), msg, &errRoutines, &errLimit)
require.GreaterOrEqual(t, errRoutines, goroutines)
require.Equal(t, limit, errLimit)
})
}

func Test_goroutineDebug2ToPprof_CrashSafety(t *testing.T) {
Expand Down