Skip to content

Commit

Permalink
runtime: partition data and BSS root marking
Browse files Browse the repository at this point in the history
Currently data and BSS root marking are each a single markroot job.
This makes them difficult to load balance, which can draw out mark
termination time if they are large.

Fix this by splitting both into 256K chunks. While we're putting in
the infrastructure for dynamic roots, we also replace the fixed
sharding of the span roots with sharding into fixed sizes. In
addition to helping balance root marking, this also paves the way to
parallelizing concurrent scan and to letting assists help with root
marking.

Updates #10345. This fixes the data and BSS aspects of that bug; it
does not partition scanning of large heap objects.

This has negligible effect on either the go1 benchmarks or the garbage
benchmark:

name              old time/op  new time/op  delta
XBenchGarbage-12  4.90ms ± 1%  4.91ms ± 2%   ~     (p=0.058 n=17+16)

name                      old time/op    new time/op    delta
BinaryTree17-12              3.11s ± 4%     3.12s ± 4%    ~     (p=0.512 n=20+20)
Fannkuch11-12                2.53s ± 2%     2.47s ± 2%  -2.28%  (p=0.000 n=20+18)
FmtFprintfEmpty-12          49.1ns ± 1%    50.0ns ± 4%  +1.68%  (p=0.008 n=18+20)
FmtFprintfString-12          170ns ± 0%     172ns ± 1%  +1.05%  (p=0.000 n=14+19)
FmtFprintfInt-12             174ns ± 1%     162ns ± 1%  -6.81%  (p=0.000 n=18+17)
FmtFprintfIntInt-12          284ns ± 1%     277ns ± 1%  -2.42%  (p=0.000 n=20+19)
FmtFprintfPrefixedInt-12     252ns ± 1%     244ns ± 1%  -2.84%  (p=0.000 n=18+20)
FmtFprintfFloat-12           317ns ± 0%     311ns ± 0%  -1.95%  (p=0.000 n=19+18)
FmtManyArgs-12              1.08µs ± 1%    1.11µs ± 1%  +3.43%  (p=0.000 n=18+19)
GobDecode-12                8.56ms ± 1%    8.61ms ± 1%  +0.50%  (p=0.020 n=20+20)
GobEncode-12                6.58ms ± 1%    6.57ms ± 1%    ~     (p=0.792 n=20+19)
Gzip-12                      317ms ± 3%     317ms ± 2%    ~     (p=0.840 n=19+19)
Gunzip-12                   41.6ms ± 0%    41.6ms ± 0%  +0.07%  (p=0.027 n=18+15)
HTTPClientServer-12         62.2µs ± 1%    62.3µs ± 1%    ~     (p=0.283 n=19+20)
JSONEncode-12               16.5ms ± 2%    16.5ms ± 1%    ~     (p=0.857 n=20+19)
JSONDecode-12               58.5ms ± 1%    61.3ms ± 1%  +4.67%  (p=0.000 n=18+17)
Mandelbrot200-12            3.84ms ± 0%    3.84ms ± 0%    ~     (p=0.259 n=17+17)
GoParse-12                  3.70ms ± 2%    3.74ms ± 2%  +0.96%  (p=0.009 n=19+20)
RegexpMatchEasy0_32-12       100ns ± 1%     100ns ± 0%  +0.31%  (p=0.040 n=19+15)
RegexpMatchEasy0_1K-12       340ns ± 1%     340ns ± 1%    ~     (p=0.411 n=17+19)
RegexpMatchEasy1_32-12      82.7ns ± 2%    82.3ns ± 1%    ~     (p=0.456 n=20+19)
RegexpMatchEasy1_1K-12       498ns ± 2%     495ns ± 0%    ~     (p=0.108 n=19+17)
RegexpMatchMedium_32-12      130ns ± 1%     130ns ± 2%    ~     (p=0.405 n=18+19)
RegexpMatchMedium_1K-12     39.4µs ± 2%    39.1µs ± 1%  -0.64%  (p=0.002 n=20+19)
RegexpMatchHard_32-12       2.03µs ± 2%    2.02µs ± 0%    ~     (p=0.561 n=20+17)
RegexpMatchHard_1K-12       61.1µs ± 2%    60.8µs ± 1%    ~     (p=0.615 n=19+18)
Revcomp-12                   532ms ± 2%     531ms ± 1%    ~     (p=0.470 n=19+19)
Template-12                 68.5ms ± 1%    69.1ms ± 1%  +0.87%  (p=0.000 n=17+17)
TimeParse-12                 344ns ± 2%     344ns ± 1%  +0.25%  (p=0.032 n=19+18)
TimeFormat-12                347ns ± 1%     362ns ± 1%  +4.27%  (p=0.000 n=17+19)
[Geo mean]                  62.3µs         62.3µs       -0.04%

name                      old speed      new speed      delta
GobDecode-12              89.6MB/s ± 1%  89.2MB/s ± 1%  -0.50%  (p=0.019 n=20+20)
GobEncode-12               117MB/s ± 1%   117MB/s ± 1%    ~     (p=0.797 n=20+19)
Gzip-12                   61.3MB/s ± 3%  61.2MB/s ± 2%    ~     (p=0.834 n=19+19)
Gunzip-12                  467MB/s ± 0%   466MB/s ± 0%  -0.07%  (p=0.027 n=18+15)
JSONEncode-12              117MB/s ± 2%   117MB/s ± 1%    ~     (p=0.851 n=20+19)
JSONDecode-12             33.2MB/s ± 1%  31.7MB/s ± 1%  -4.47%  (p=0.000 n=18+17)
GoParse-12                15.6MB/s ± 2%  15.5MB/s ± 2%  -0.95%  (p=0.008 n=19+20)
RegexpMatchEasy0_32-12     321MB/s ± 2%   320MB/s ± 1%  -0.57%  (p=0.002 n=17+17)
RegexpMatchEasy0_1K-12    3.01GB/s ± 1%  3.01GB/s ± 1%    ~     (p=0.132 n=17+18)
RegexpMatchEasy1_32-12     387MB/s ± 2%   389MB/s ± 1%    ~     (p=0.423 n=20+19)
RegexpMatchEasy1_1K-12    2.05GB/s ± 2%  2.06GB/s ± 0%    ~     (p=0.129 n=19+17)
RegexpMatchMedium_32-12   7.64MB/s ± 1%  7.66MB/s ± 1%    ~     (p=0.258 n=18+19)
RegexpMatchMedium_1K-12   26.0MB/s ± 2%  26.2MB/s ± 1%  +0.64%  (p=0.002 n=20+19)
RegexpMatchHard_32-12     15.7MB/s ± 2%  15.8MB/s ± 1%    ~     (p=0.510 n=20+17)
RegexpMatchHard_1K-12     16.8MB/s ± 2%  16.8MB/s ± 1%    ~     (p=0.603 n=19+18)
Revcomp-12                 477MB/s ± 2%   479MB/s ± 1%    ~     (p=0.470 n=19+19)
Template-12               28.3MB/s ± 1%  28.1MB/s ± 1%  -0.85%  (p=0.000 n=17+17)
[Geo mean]                 100MB/s        100MB/s       -0.26%

Change-Id: Ib0bfe0145675ce88c5a8791752f7486ac98805b4
Reviewed-on: https://go-review.googlesource.com/16043
Reviewed-by: Rick Hudson <rlh@golang.org>
  • Loading branch information
aclements committed Oct 26, 2015
1 parent 0be3c40 commit d3df04c
Show file tree
Hide file tree
Showing 2 changed files with 119 additions and 38 deletions.
21 changes: 10 additions & 11 deletions src/runtime/mgc.go
Original file line number Diff line number Diff line change
Expand Up @@ -127,14 +127,6 @@ const (
_ConcurrentSweep = true
_FinBlockSize = 4 * 1024

_RootData = 0
_RootBss = 1
_RootFinalizers = 2
_RootFlushCaches = 3
_RootSpans0 = 4
_RootSpansShards = 128
_RootCount = _RootSpans0 + _RootSpansShards

// sweepMinHeapDistance is a lower bound on the heap distance
// (in bytes) reserved for concurrent sweeping between GC
// cycles. This will be scaled by gcpercent/100.
Expand Down Expand Up @@ -804,6 +796,9 @@ var work struct {
alldone note
markfor *parfor

// Number of roots of various root types. Set by gcMarkRootPrepare.
nDataRoots, nBSSRoots, nSpanRoots, nStackRoots int

// finalizersDone indicates that finalizers and objects with
// finalizers have been scanned by markroot. During concurrent
// GC, this happens during the concurrent scan phase. During
Expand Down Expand Up @@ -1060,8 +1055,9 @@ func gc(mode gcMode) {
// barriers. Rescan some roots and flush work caches.
systemstack(func() {
// rescan global data and bss.
markroot(nil, _RootData)
markroot(nil, _RootBss)
for i := fixedRootCount; i < fixedRootCount+work.nDataRoots+work.nBSSRoots; i++ {
markroot(nil, uint32(i))
}

// Disallow caching workbufs.
gcBlackenPromptly = true
Expand Down Expand Up @@ -1460,6 +1456,9 @@ func gcMark(start_time int64) {
// but must be disposed to the global lists immediately.
gcFlushGCWork()

// Queue root marking jobs.
nRoots := gcMarkRootPrepare()

work.nwait = 0
work.ndone = 0
work.nproc = uint32(gcprocs())
Expand All @@ -1468,7 +1467,7 @@ func gcMark(start_time int64) {
traceGCScanStart()
}

parforsetup(work.markfor, work.nproc, uint32(_RootCount+allglen), false, markroot)
parforsetup(work.markfor, work.nproc, uint32(nRoots), false, markroot)
if work.nproc > 1 {
noteclear(&work.alldone)
helpgc(int32(work.nproc))
Expand Down
136 changes: 109 additions & 27 deletions src/runtime/mgcmark.go
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,64 @@ package runtime

import "unsafe"

// Indices of the fixed markroot jobs. These occupy the lowest root
// indices; the data root jobs begin at index fixedRootCount.
const (
	// fixedRootFinalizers is the markroot job that scans the
	// finalizer blocks.
	fixedRootFinalizers = iota

	// fixedRootFlushCaches is the markroot job that flushes all
	// mcaches.
	fixedRootFlushCaches

	// fixedRootCount is the number of fixed markroot jobs.
	fixedRootCount
)

// Granularity of the dynamically sized root marking jobs.
const (
	// rootBlockBytes is the number of bytes to scan per data or
	// BSS root.
	rootBlockBytes = 256 << 10

	// rootBlockSpans is the number of spans to scan per span
	// root.
	rootBlockSpans = 8 * 1024 // 64MB worth of spans
)

// gcMarkRootPrepare works out how many root marking jobs of each kind
// are needed, records those counts in work (nDataRoots, nBSSRoots,
// nSpanRoots and nStackRoots), and returns the total number of roots,
// including the fixed roots.
//
// The caller must have called gcCopySpans().
//
//go:nowritebarrier
func gcMarkRootPrepare() int {
	// blockCount returns the number of rootBlockBytes-sized blocks
	// needed to cover size bytes, rounding up.
	blockCount := func(size uintptr) int {
		return int((size + rootBlockBytes - 1) / rootBlockBytes)
	}

	// The number of data (respectively BSS) roots is the largest
	// block count found in any single module's data (respectively
	// BSS) section.
	maxData, maxBSS := 0, 0
	for m := &firstmoduledata; m != nil; m = m.next {
		if n := blockCount(m.edata - m.data); n > maxData {
			maxData = n
		}
		if n := blockCount(m.ebss - m.bss); n > maxBSS {
			maxBSS = n
		}
	}
	work.nDataRoots = maxData
	work.nBSSRoots = maxBSS

	// One span root per rootBlockSpans spans, rounding up.
	work.nSpanRoots = (len(work.spans) + rootBlockSpans - 1) / rootBlockSpans

	// Take a snapshot of allglen. During concurrent scan, all that
	// matters is that the number of markroot jobs created agrees
	// with the number of Gs checked. Gs created after this point
	// are ignored, which is okay: they begin life without any
	// roots, so there is nothing to scan, and any roots they
	// create during the concurrent phase will be scanned during
	// mark termination. During mark termination allglen isn't
	// changing, so all Gs get scanned.
	work.nStackRoots = int(atomicloaduintptr(&allglen))

	return fixedRootCount + work.nDataRoots + work.nBSSRoots + work.nSpanRoots + work.nStackRoots
}

// Scan all of the stacks, greying (or graying if in America) the referents
// but not blackening them since the mark write barrier isn't installed.
//go:nowritebarrier
Expand All @@ -26,22 +84,17 @@ func gcscan_m() {
// runtime·restartg(mastergp) to make it Grunnable.
// At the bottom we will want to return this p back to the scheduler.

// Snapshot of allglen. During concurrent scan, we just need
// to be consistent about how many markroot jobs we create and
// how many Gs we check. Gs may be created after this and
// they'll be scanned during mark termination. During mark
// termination, allglen isn't changing.
local_allglen := int(atomicloaduintptr(&allglen))
nroots := gcMarkRootPrepare()

work.ndone = 0
useOneP := uint32(1) // For now do not do this in parallel.
// ackgcphase is not needed since we are not scanning running goroutines.
parforsetup(work.markfor, useOneP, uint32(_RootCount+local_allglen), false, markroot)
parforsetup(work.markfor, useOneP, uint32(nroots), false, markroot)
parfordo(work.markfor)

lock(&allglock)
// Check that gc work is done.
for i := 0; i < local_allglen; i++ {
for i := 0; i < work.nStackRoots; i++ {
gp := allgs[i]
if !gp.gcscandone {
throw("scan missed a g")
Expand All @@ -61,40 +114,43 @@ func markroot(desc *parfor, i uint32) {
// TODO: Consider using getg().m.p.ptr().gcw.
var gcw gcWork

baseData := uint32(fixedRootCount)
baseBSS := baseData + uint32(work.nDataRoots)
baseSpans := baseBSS + uint32(work.nBSSRoots)
baseStacks := baseSpans + uint32(work.nSpanRoots)

// Note: if you add a case here, please also update heapdump.go:dumproots.
switch i {
case _RootData:
switch {
case baseData <= i && i < baseBSS:
for datap := &firstmoduledata; datap != nil; datap = datap.next {
scanblock(datap.data, datap.edata-datap.data, datap.gcdatamask.bytedata, &gcw)
markrootBlock(datap.data, datap.edata-datap.data, datap.gcdatamask.bytedata, &gcw, int(i-baseData))
}

case _RootBss:
case baseBSS <= i && i < baseSpans:
for datap := &firstmoduledata; datap != nil; datap = datap.next {
scanblock(datap.bss, datap.ebss-datap.bss, datap.gcbssmask.bytedata, &gcw)
markrootBlock(datap.bss, datap.ebss-datap.bss, datap.gcbssmask.bytedata, &gcw, int(i-baseBSS))
}

case _RootFinalizers:
case i == fixedRootFinalizers:
for fb := allfin; fb != nil; fb = fb.alllink {
scanblock(uintptr(unsafe.Pointer(&fb.fin[0])), uintptr(fb.cnt)*unsafe.Sizeof(fb.fin[0]), &finptrmask[0], &gcw)
}

case _RootFlushCaches:
case i == fixedRootFlushCaches:
if gcphase != _GCscan { // Do not flush mcaches during GCscan phase.
flushallmcaches()
}

default:
if _RootSpans0 <= i && i < _RootSpans0+_RootSpansShards {
// mark MSpan.specials
markrootSpans(&gcw, int(i)-_RootSpans0)
break
}
case baseSpans <= i && i < baseStacks:
// mark MSpan.specials
markrootSpans(&gcw, int(i-baseSpans))

default:
// the rest is scanning goroutine stacks
if uintptr(i-_RootCount) >= allglen {
if uintptr(i-baseStacks) >= allglen {
throw("markroot: bad index")
}
gp := allgs[i-_RootCount]
gp := allgs[i-baseStacks]

// remember when we've first observed the G blocked
// needed only to output in traceback
Expand All @@ -117,8 +173,31 @@ func markroot(desc *parfor, i uint32) {
gcw.dispose()
}

// markrootSpans marks roots for one shard (out of _RootSpansShards)
// of work.spans.
// markrootBlock scans a single rootBlockBytes-sized shard of the
// memory region [b0, b0+n0).
//
// shard is the index of the shard to scan. ptrmask0 is the pointer
// mask for the whole region; the mask for the selected shard starts
// shard*(rootBlockBytes/(8*ptrSize)) bytes into it. If the shard
// starts at or beyond the end of the region, nothing is scanned. A
// shard that would run past the end of the region is truncated to
// end at b0+n0.
//
//go:nowritebarrier
func markrootBlock(b0, n0 uintptr, ptrmask0 *uint8, gcw *gcWork, shard int) {
	if rootBlockBytes%(8*ptrSize) != 0 {
		// Each shard must begin on a whole byte of ptrmask0,
		// otherwise the byte offset computed below is wrong.
		throw("rootBlockBytes must be a multiple of 8*ptrSize")
	}

	limit := b0 + n0
	start := b0 + uintptr(shard)*rootBlockBytes
	if start >= limit {
		// This region has no shard with this index.
		return
	}

	// Scan a full block unless that would overrun the region.
	size := uintptr(rootBlockBytes)
	if start+size > limit {
		size = limit - start
	}

	// Step the pointer mask forward to the start of this shard.
	maskOffset := uintptr(shard) * (rootBlockBytes / (8 * ptrSize))
	shardMask := (*uint8)(add(unsafe.Pointer(ptrmask0), maskOffset))

	scanblock(start, size, shardMask, gcw)
}

// markrootSpans marks roots for one shard of work.spans.
//
//go:nowritebarrier
func markrootSpans(gcw *gcWork, shard int) {
Expand Down Expand Up @@ -146,8 +225,11 @@ func markrootSpans(gcw *gcWork, shard int) {
}

sg := mheap_.sweepgen
startSpan := shard * len(work.spans) / _RootSpansShards
endSpan := (shard + 1) * len(work.spans) / _RootSpansShards
startSpan := shard * rootBlockSpans
endSpan := (shard + 1) * rootBlockSpans
if endSpan > len(work.spans) {
endSpan = len(work.spans)
}
// Note that work.spans may not include spans that were
// allocated between entering the scan phase and now. This is
// okay because any objects with finalizers in those spans
Expand Down

0 comments on commit d3df04c

Please sign in to comment.