From d3df04cd8cf96db47d986cf3dcf3c7b2ba8238ea Mon Sep 17 00:00:00 2001 From: Austin Clements Date: Fri, 16 Oct 2015 16:52:26 -0400 Subject: [PATCH] runtime: partition data and BSS root marking MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Currently data and BSS root marking are each a single markroot job. This makes them difficult to load balance, which can draw out mark termination time if they are large. Fix this by splitting both into 256K chunks. While we're putting in the infrastructure for dynamic roots, we also replace the fixed sharding of the span roots with sharding into fixed sizes. In addition to helping balance root marking, this also paves the way to parallelizing concurrent scan and to letting assists help with root marking. Updates #10345. This fixes the data and BSS aspects of that bug; it does not partition scanning of large heap objects. This has negligible effect on either the go1 benchmarks or the garbage benchmark: name old time/op new time/op delta XBenchGarbage-12 4.90ms ± 1% 4.91ms ± 2% ~ (p=0.058 n=17+16) name old time/op new time/op delta BinaryTree17-12 3.11s ± 4% 3.12s ± 4% ~ (p=0.512 n=20+20) Fannkuch11-12 2.53s ± 2% 2.47s ± 2% -2.28% (p=0.000 n=20+18) FmtFprintfEmpty-12 49.1ns ± 1% 50.0ns ± 4% +1.68% (p=0.008 n=18+20) FmtFprintfString-12 170ns ± 0% 172ns ± 1% +1.05% (p=0.000 n=14+19) FmtFprintfInt-12 174ns ± 1% 162ns ± 1% -6.81% (p=0.000 n=18+17) FmtFprintfIntInt-12 284ns ± 1% 277ns ± 1% -2.42% (p=0.000 n=20+19) FmtFprintfPrefixedInt-12 252ns ± 1% 244ns ± 1% -2.84% (p=0.000 n=18+20) FmtFprintfFloat-12 317ns ± 0% 311ns ± 0% -1.95% (p=0.000 n=19+18) FmtManyArgs-12 1.08µs ± 1% 1.11µs ± 1% +3.43% (p=0.000 n=18+19) GobDecode-12 8.56ms ± 1% 8.61ms ± 1% +0.50% (p=0.020 n=20+20) GobEncode-12 6.58ms ± 1% 6.57ms ± 1% ~ (p=0.792 n=20+19) Gzip-12 317ms ± 3% 317ms ± 2% ~ (p=0.840 n=19+19) Gunzip-12 41.6ms ± 0% 41.6ms ± 0% +0.07% (p=0.027 n=18+15) HTTPClientServer-12 62.2µs ± 1% 62.3µs ± 1% ~ (p=0.283 
n=19+20) JSONEncode-12 16.5ms ± 2% 16.5ms ± 1% ~ (p=0.857 n=20+19) JSONDecode-12 58.5ms ± 1% 61.3ms ± 1% +4.67% (p=0.000 n=18+17) Mandelbrot200-12 3.84ms ± 0% 3.84ms ± 0% ~ (p=0.259 n=17+17) GoParse-12 3.70ms ± 2% 3.74ms ± 2% +0.96% (p=0.009 n=19+20) RegexpMatchEasy0_32-12 100ns ± 1% 100ns ± 0% +0.31% (p=0.040 n=19+15) RegexpMatchEasy0_1K-12 340ns ± 1% 340ns ± 1% ~ (p=0.411 n=17+19) RegexpMatchEasy1_32-12 82.7ns ± 2% 82.3ns ± 1% ~ (p=0.456 n=20+19) RegexpMatchEasy1_1K-12 498ns ± 2% 495ns ± 0% ~ (p=0.108 n=19+17) RegexpMatchMedium_32-12 130ns ± 1% 130ns ± 2% ~ (p=0.405 n=18+19) RegexpMatchMedium_1K-12 39.4µs ± 2% 39.1µs ± 1% -0.64% (p=0.002 n=20+19) RegexpMatchHard_32-12 2.03µs ± 2% 2.02µs ± 0% ~ (p=0.561 n=20+17) RegexpMatchHard_1K-12 61.1µs ± 2% 60.8µs ± 1% ~ (p=0.615 n=19+18) Revcomp-12 532ms ± 2% 531ms ± 1% ~ (p=0.470 n=19+19) Template-12 68.5ms ± 1% 69.1ms ± 1% +0.87% (p=0.000 n=17+17) TimeParse-12 344ns ± 2% 344ns ± 1% +0.25% (p=0.032 n=19+18) TimeFormat-12 347ns ± 1% 362ns ± 1% +4.27% (p=0.000 n=17+19) [Geo mean] 62.3µs 62.3µs -0.04% name old speed new speed delta GobDecode-12 89.6MB/s ± 1% 89.2MB/s ± 1% -0.50% (p=0.019 n=20+20) GobEncode-12 117MB/s ± 1% 117MB/s ± 1% ~ (p=0.797 n=20+19) Gzip-12 61.3MB/s ± 3% 61.2MB/s ± 2% ~ (p=0.834 n=19+19) Gunzip-12 467MB/s ± 0% 466MB/s ± 0% -0.07% (p=0.027 n=18+15) JSONEncode-12 117MB/s ± 2% 117MB/s ± 1% ~ (p=0.851 n=20+19) JSONDecode-12 33.2MB/s ± 1% 31.7MB/s ± 1% -4.47% (p=0.000 n=18+17) GoParse-12 15.6MB/s ± 2% 15.5MB/s ± 2% -0.95% (p=0.008 n=19+20) RegexpMatchEasy0_32-12 321MB/s ± 2% 320MB/s ± 1% -0.57% (p=0.002 n=17+17) RegexpMatchEasy0_1K-12 3.01GB/s ± 1% 3.01GB/s ± 1% ~ (p=0.132 n=17+18) RegexpMatchEasy1_32-12 387MB/s ± 2% 389MB/s ± 1% ~ (p=0.423 n=20+19) RegexpMatchEasy1_1K-12 2.05GB/s ± 2% 2.06GB/s ± 0% ~ (p=0.129 n=19+17) RegexpMatchMedium_32-12 7.64MB/s ± 1% 7.66MB/s ± 1% ~ (p=0.258 n=18+19) RegexpMatchMedium_1K-12 26.0MB/s ± 2% 26.2MB/s ± 1% +0.64% (p=0.002 n=20+19) RegexpMatchHard_32-12 15.7MB/s ± 2% 15.8MB/s 
± 1% ~ (p=0.510 n=20+17) RegexpMatchHard_1K-12 16.8MB/s ± 2% 16.8MB/s ± 1% ~ (p=0.603 n=19+18) Revcomp-12 477MB/s ± 2% 479MB/s ± 1% ~ (p=0.470 n=19+19) Template-12 28.3MB/s ± 1% 28.1MB/s ± 1% -0.85% (p=0.000 n=17+17) [Geo mean] 100MB/s 100MB/s -0.26% Change-Id: Ib0bfe0145675ce88c5a8791752f7486ac98805b4 Reviewed-on: https://go-review.googlesource.com/16043 Reviewed-by: Rick Hudson --- src/runtime/mgc.go | 21 +++---- src/runtime/mgcmark.go | 136 +++++++++++++++++++++++++++++++++-------- 2 files changed, 119 insertions(+), 38 deletions(-) diff --git a/src/runtime/mgc.go b/src/runtime/mgc.go index aa7714591de72..8bba5268535e8 100644 --- a/src/runtime/mgc.go +++ b/src/runtime/mgc.go @@ -127,14 +127,6 @@ const ( _ConcurrentSweep = true _FinBlockSize = 4 * 1024 - _RootData = 0 - _RootBss = 1 - _RootFinalizers = 2 - _RootFlushCaches = 3 - _RootSpans0 = 4 - _RootSpansShards = 128 - _RootCount = _RootSpans0 + _RootSpansShards - // sweepMinHeapDistance is a lower bound on the heap distance // (in bytes) reserved for concurrent sweeping between GC // cycles. This will be scaled by gcpercent/100. @@ -804,6 +796,9 @@ var work struct { alldone note markfor *parfor + // Number of roots of various root types. Set by gcMarkRootPrepare. + nDataRoots, nBSSRoots, nSpanRoots, nStackRoots int + // finalizersDone indicates that finalizers and objects with // finalizers have been scanned by markroot. During concurrent // GC, this happens during the concurrent scan phase. During @@ -1060,8 +1055,9 @@ func gc(mode gcMode) { // barriers. Rescan some roots and flush work caches. systemstack(func() { // rescan global data and bss. - markroot(nil, _RootData) - markroot(nil, _RootBss) + for i := fixedRootCount; i < fixedRootCount+work.nDataRoots+work.nBSSRoots; i++ { + markroot(nil, uint32(i)) + } // Disallow caching workbufs. gcBlackenPromptly = true @@ -1460,6 +1456,9 @@ func gcMark(start_time int64) { // but must be disposed to the global lists immediately. 
gcFlushGCWork() + // Queue root marking jobs. + nRoots := gcMarkRootPrepare() + work.nwait = 0 work.ndone = 0 work.nproc = uint32(gcprocs()) @@ -1468,7 +1467,7 @@ func gcMark(start_time int64) { traceGCScanStart() } - parforsetup(work.markfor, work.nproc, uint32(_RootCount+allglen), false, markroot) + parforsetup(work.markfor, work.nproc, uint32(nRoots), false, markroot) if work.nproc > 1 { noteclear(&work.alldone) helpgc(int32(work.nproc)) diff --git a/src/runtime/mgcmark.go b/src/runtime/mgcmark.go index 93018207d69c2..4ec428d9144f3 100644 --- a/src/runtime/mgcmark.go +++ b/src/runtime/mgcmark.go @@ -8,6 +8,64 @@ package runtime import "unsafe" +const ( + fixedRootFinalizers = iota + fixedRootFlushCaches + fixedRootCount + + // rootBlockBytes is the number of bytes to scan per data or + // BSS root. + rootBlockBytes = 256 << 10 + + // rootBlockSpans is the number of spans to scan per span + // root. + rootBlockSpans = 8 * 1024 // 64MB worth of spans +) + +// gcMarkRootPrepare initializes scanning-related state and returns +// the number of roots. +// +// The caller must have called gcCopySpans(). +// +//go:nowritebarrier +func gcMarkRootPrepare() int { + // Compute how many data and BSS root blocks there are. + nBlocks := func(bytes uintptr) int { + return int((bytes + rootBlockBytes - 1) / rootBlockBytes) + } + + work.nDataRoots = 0 + for datap := &firstmoduledata; datap != nil; datap = datap.next { + nDataRoots := nBlocks(datap.edata - datap.data) + if nDataRoots > work.nDataRoots { + work.nDataRoots = nDataRoots + } + } + + work.nBSSRoots = 0 + for datap := &firstmoduledata; datap != nil; datap = datap.next { + nBSSRoots := nBlocks(datap.ebss - datap.bss) + if nBSSRoots > work.nBSSRoots { + work.nBSSRoots = nBSSRoots + } + } + + // Compute number of span roots. + work.nSpanRoots = (len(work.spans) + rootBlockSpans - 1) / rootBlockSpans + + // Snapshot of allglen. 
During concurrent scan, we just need + // to be consistent about how many markroot jobs we create and + // how many Gs we check. Gs may be created after this point, + // but it's okay that we ignore them because they begin life + // without any roots, so there's nothing to scan, and any + // roots they create during the concurrent phase will be + // scanned during mark termination. During mark termination, + // allglen isn't changing, so we'll scan all Gs. + work.nStackRoots = int(atomicloaduintptr(&allglen)) + + return fixedRootCount + work.nDataRoots + work.nBSSRoots + work.nSpanRoots + work.nStackRoots +} + // Scan all of the stacks, greying (or graying if in America) the referents // but not blackening them since the mark write barrier isn't installed. //go:nowritebarrier @@ -26,22 +84,17 @@ func gcscan_m() { // runtime·restartg(mastergp) to make it Grunnable. // At the bottom we will want to return this p back to the scheduler. - // Snapshot of allglen. During concurrent scan, we just need - // to be consistent about how many markroot jobs we create and - // how many Gs we check. Gs may be created after this and - // they'll be scanned during mark termination. During mark - // termination, allglen isn't changing. - local_allglen := int(atomicloaduintptr(&allglen)) + nroots := gcMarkRootPrepare() work.ndone = 0 useOneP := uint32(1) // For now do not do this in parallel. // ackgcphase is not needed since we are not scanning running goroutines. - parforsetup(work.markfor, useOneP, uint32(_RootCount+local_allglen), false, markroot) + parforsetup(work.markfor, useOneP, uint32(nroots), false, markroot) parfordo(work.markfor) lock(&allglock) // Check that gc work is done. - for i := 0; i < local_allglen; i++ { + for i := 0; i < work.nStackRoots; i++ { gp := allgs[i] if !gp.gcscandone { throw("scan missed a g") @@ -61,40 +114,43 @@ func markroot(desc *parfor, i uint32) { // TODO: Consider using getg().m.p.ptr().gcw. 
var gcw gcWork + baseData := uint32(fixedRootCount) + baseBSS := baseData + uint32(work.nDataRoots) + baseSpans := baseBSS + uint32(work.nBSSRoots) + baseStacks := baseSpans + uint32(work.nSpanRoots) + // Note: if you add a case here, please also update heapdump.go:dumproots. - switch i { - case _RootData: + switch { + case baseData <= i && i < baseBSS: for datap := &firstmoduledata; datap != nil; datap = datap.next { - scanblock(datap.data, datap.edata-datap.data, datap.gcdatamask.bytedata, &gcw) + markrootBlock(datap.data, datap.edata-datap.data, datap.gcdatamask.bytedata, &gcw, int(i-baseData)) } - case _RootBss: + case baseBSS <= i && i < baseSpans: for datap := &firstmoduledata; datap != nil; datap = datap.next { - scanblock(datap.bss, datap.ebss-datap.bss, datap.gcbssmask.bytedata, &gcw) + markrootBlock(datap.bss, datap.ebss-datap.bss, datap.gcbssmask.bytedata, &gcw, int(i-baseBSS)) } - case _RootFinalizers: + case i == fixedRootFinalizers: for fb := allfin; fb != nil; fb = fb.alllink { scanblock(uintptr(unsafe.Pointer(&fb.fin[0])), uintptr(fb.cnt)*unsafe.Sizeof(fb.fin[0]), &finptrmask[0], &gcw) } - case _RootFlushCaches: + case i == fixedRootFlushCaches: if gcphase != _GCscan { // Do not flush mcaches during GCscan phase. flushallmcaches() } - default: - if _RootSpans0 <= i && i < _RootSpans0+_RootSpansShards { - // mark MSpan.specials - markrootSpans(&gcw, int(i)-_RootSpans0) - break - } + case baseSpans <= i && i < baseStacks: + // mark MSpan.specials + markrootSpans(&gcw, int(i-baseSpans)) + default: // the rest is scanning goroutine stacks - if uintptr(i-_RootCount) >= allglen { + if uintptr(i-baseStacks) >= allglen { throw("markroot: bad index") } - gp := allgs[i-_RootCount] + gp := allgs[i-baseStacks] // remember when we've first observed the G blocked // needed only to output in traceback @@ -117,8 +173,31 @@ func markroot(desc *parfor, i uint32) { gcw.dispose() } -// markrootSpans marks roots for one shard (out of _RootSpansShards) -// of work.spans. 
+// markrootBlock scans the shard'th shard of the block of memory [b0, +// b0+n0), with the given pointer mask. +// +//go:nowritebarrier +func markrootBlock(b0, n0 uintptr, ptrmask0 *uint8, gcw *gcWork, shard int) { + if rootBlockBytes%(8*ptrSize) != 0 { + // This is necessary to pick byte offsets in ptrmask0. + throw("rootBlockBytes must be a multiple of 8*ptrSize") + } + + b := b0 + uintptr(shard)*rootBlockBytes + if b >= b0+n0 { + return + } + ptrmask := (*uint8)(add(unsafe.Pointer(ptrmask0), uintptr(shard)*(rootBlockBytes/(8*ptrSize)))) + n := uintptr(rootBlockBytes) + if b+n > b0+n0 { + n = b0 + n0 - b + } + + // Scan this shard. + scanblock(b, n, ptrmask, gcw) +} + +// markrootSpans marks roots for one shard of work.spans. // //go:nowritebarrier func markrootSpans(gcw *gcWork, shard int) { @@ -146,8 +225,11 @@ func markrootSpans(gcw *gcWork, shard int) { } sg := mheap_.sweepgen - startSpan := shard * len(work.spans) / _RootSpansShards - endSpan := (shard + 1) * len(work.spans) / _RootSpansShards + startSpan := shard * rootBlockSpans + endSpan := (shard + 1) * rootBlockSpans + if endSpan > len(work.spans) { + endSpan = len(work.spans) + } // Note that work.spans may not include spans that were // allocated between entering the scan phase and now. This is // okay because any objects with finalizers in those spans