diff --git a/src/runtime/proc.go b/src/runtime/proc.go
index c7097e290698f4..a612c081d229c6 100644
--- a/src/runtime/proc.go
+++ b/src/runtime/proc.go
@@ -2174,6 +2174,9 @@ top:
 	// Steal work from other P's.
 	procs := uint32(gomaxprocs)
 	ranTimer := false
+	if procs == 1 {
+		goto stop
+	}
 	// If number of spinning M's >= number of busy P's, block.
 	// This is necessary to prevent excessive CPU consumption
 	// when GOMAXPROCS>>1 but the program parallelism is low.
@@ -2184,47 +2187,55 @@ top:
 		_g_.m.spinning = true
 		atomic.Xadd(&sched.nmspinning, 1)
 	}
-	for i := 0; i < 4; i++ {
+	for {
+		retry := false
 		for enum := stealOrder.start(fastrand()); !enum.done(); enum.next() {
 			if sched.gcwaiting != 0 {
 				goto top
 			}
-			stealRunNextG := i > 2 // first look for ready queues with more than 1 g
 			p2 := allp[enum.position()]
 			if _p_ == p2 {
 				continue
 			}
-			if gp := runqsteal(_p_, p2, stealRunNextG); gp != nil {
+			gp, retryP := runqsteal(_p_, p2, now)
+			if gp != nil {
 				return gp, false
 			}
-
-			// Consider stealing timers from p2.
-			// This call to checkTimers is the only place where
-			// we hold a lock on a different P's timers.
-			// Lock contention can be a problem here, so avoid
-			// grabbing the lock if p2 is running and not marked
-			// for preemption. If p2 is running and not being
-			// preempted we assume it will handle its own timers.
-			if i > 2 && shouldStealTimers(p2) {
-				tnow, w, ran := checkTimers(p2, now)
-				now = tnow
-				if w != 0 && (pollUntil == 0 || w < pollUntil) {
-					pollUntil = w
-				}
-				if ran {
-					// Running the timers may have
-					// made an arbitrary number of G's
-					// ready and added them to this P's
-					// local run queue. That invalidates
-					// the assumption of runqsteal
-					// that is always has room to add
-					// stolen G's. So check now if there
-					// is a local G to run.
-					if gp, inheritTime := runqget(_p_); gp != nil {
-						return gp, inheritTime
-					}
-					ranTimer = true
+			if retryP {
+				retry = true
+			}
+		}
+		if !retry {
+			break
+		}
+	}
+	for enum := stealOrder.start(fastrand()); !enum.done(); enum.next() {
+		if sched.gcwaiting != 0 {
+			goto top
+		}
+		p2 := allp[enum.position()]
+		if _p_ == p2 {
+			continue
+		}
+		// This call to checkTimers is the only place where
+		// we hold a lock on a different P's timers.
+		// Lock contention can be a problem here, so avoid
+		// grabbing the lock if p2 is running and not marked
+		// for preemption. If p2 is running and not being
+		// preempted we assume it will handle its own timers.
+		if shouldStealTimers(p2) {
+			tnow, w, ran := checkTimers(p2, now)
+			now = tnow
+			if w != 0 && (pollUntil == 0 || w < pollUntil) {
+				pollUntil = w
+			}
+			if ran {
+				// Running the timers may have made an arbitrary number of G's
+				// ready and added them to this P's local run queue.
+				if gp, inheritTime := runqget(_p_); gp != nil {
+					return gp, inheritTime
 				}
+				ranTimer = true
 			}
 		}
 	}
@@ -5023,6 +5034,7 @@ func runqput(_p_ *p, gp *g, next bool) {
 	}
 
 	if next {
+		atomic.Store64(&_p_.runnextready, uint64(nanotime()))
 	retryNext:
 		oldnext := _p_.runnext
 		if !_p_.runnext.cas(oldnext, guintptr(unsafe.Pointer(gp))) {
@@ -5154,46 +5166,39 @@ func runqget(_p_ *p) (gp *g, inheritTime bool) {
 
 // Grabs a batch of goroutines from _p_'s runnable queue into batch.
 // Batch is a ring buffer starting at batchHead.
-// Returns number of grabbed goroutines.
+// Returns number of grabbed goroutines, and true if caller should retry later.
 // Can be executed by any P.
-func runqgrab(_p_ *p, batch *[256]guintptr, batchHead uint32, stealRunNextG bool) uint32 {
+func runqgrab(_p_ *p, batch *[256]guintptr, batchHead uint32, start int64) (uint32, bool) {
 	for {
 		h := atomic.LoadAcq(&_p_.runqhead) // load-acquire, synchronize with other consumers
 		t := atomic.LoadAcq(&_p_.runqtail) // load-acquire, synchronize with the producer
 		n := t - h
 		n = n - n/2
 		if n == 0 {
-			if stealRunNextG {
-				// Try to steal from _p_.runnext.
-				if next := _p_.runnext; next != 0 {
-					if _p_.status == _Prunning {
-						// Sleep to ensure that _p_ isn't about to run the g
-						// we are about to steal.
-						// The important use case here is when the g running
-						// on _p_ ready()s another g and then almost
-						// immediately blocks. Instead of stealing runnext
-						// in this window, back off to give _p_ a chance to
-						// schedule runnext. This will avoid thrashing gs
-						// between different Ps.
-						// A sync chan send/recv takes ~50ns as of time of
-						// writing, so 3us gives ~50x overshoot.
-						if GOOS != "windows" {
-							usleep(3)
-						} else {
-							// On windows system timer granularity is
-							// 1-15ms, which is way too much for this
-							// optimization. So just yield.
-							osyield()
-						}
-					}
-					if !_p_.runnext.cas(next, 0) {
-						continue
-					}
-					batch[batchHead%uint32(len(batch))] = next
-					return 1
-				}
+			// Try to steal from _p_.runnext.
+			next := _p_.runnext
+			if next == 0 {
+				return 0, false
+			}
+			// If next became ready after start, it is ineligible for stealing
+			// (to prevent findrunnable() from spinning forever).
+			nextready := int64(atomic.Load64(&_p_.runnextready))
+			if start < nextready {
+				return 0, false
 			}
-			return 0
+			// Otherwise, next is only stealable some time after it is readied.
+			// This ensures that when one G readies another and then
+			// immediately blocks or exits, its P has time to schedule the
+			// second G. A sync chan send/recv takes 50ns as of this time of
+			// writing, so 4us gives us ~80x overshoot.
+			if nanotime() < (nextready + 4000) {
+				return 0, true
+			}
+			if !_p_.runnext.cas(next, 0) {
+				continue
+			}
+			batch[batchHead%uint32(len(batch))] = next
+			return 1, false
 		}
 		if n > uint32(len(_p_.runq)/2) { // read inconsistent h and t
 			continue
@@ -5203,31 +5208,32 @@ func runqgrab(_p_ *p, batch *[256]guintptr, batchHead uint32, stealRunNextG bool
 		for i := uint32(0); i < n; i++ {
 			g := _p_.runq[(h+i)%uint32(len(_p_.runq))]
 			batch[(batchHead+i)%uint32(len(batch))] = g
 		}
 		if atomic.CasRel(&_p_.runqhead, h, h+n) { // cas-release, commits consume
-			return n
+			return n, false
 		}
 	}
 }
 // Steal half of elements from local runnable queue of p2
 // and put onto local runnable queue of p.
-// Returns one of the stolen elements (or nil if failed).
-func runqsteal(_p_, p2 *p, stealRunNextG bool) *g {
+// Returns one of the stolen elements (or nil if failed), and true if caller
+// should retry later.
+func runqsteal(_p_, p2 *p, start int64) (*g, bool) {
 	t := _p_.runqtail
-	n := runqgrab(p2, &_p_.runq, t, stealRunNextG)
+	n, retry := runqgrab(p2, &_p_.runq, t, start)
 	if n == 0 {
-		return nil
+		return nil, retry
 	}
 	n--
 	gp := _p_.runq[(t+n)%uint32(len(_p_.runq))].ptr()
 	if n == 0 {
-		return gp
+		return gp, false
 	}
 	h := atomic.LoadAcq(&_p_.runqhead) // load-acquire, synchronize with consumers
 	if t-h+n >= uint32(len(_p_.runq)) {
 		throw("runqsteal: runq overflow")
 	}
 	atomic.StoreRel(&_p_.runqtail, t+n) // store-release, makes the item available for consumption
-	return gp
+	return gp, false
 }
 // A gQueue is a dequeue of Gs linked through g.schedlink. A G can only be
diff --git a/src/runtime/runtime2.go b/src/runtime/runtime2.go
index 1a98927647d4c8..a5b38c4a3e276b 100644
--- a/src/runtime/runtime2.go
+++ b/src/runtime/runtime2.go
@@ -580,7 +580,8 @@ type p struct {
 	// unit and eliminates the (potentially large) scheduling
 	// latency that otherwise arises from adding the ready'd
 	// goroutines to the end of the run queue.
-	runnext guintptr
+	runnext      guintptr
+	runnextready uint64 // nanotime() when runnext readied
 
 	// Available G's (status == Gdead)
 	gFree struct {
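
The core of the change is the new eligibility rule in runqgrab: instead of sleeping 3us before stealing a victim's runnext G, the thief consults the runnextready timestamp that runqput records with atomic.Store64. A G readied after the thief started searching (start) is never stolen, which keeps findrunnable from spinning forever, and a G readied less than 4us ago makes runqgrab report "retry later" instead of blocking in usleep. The following is a minimal standalone sketch of that rule, not code from the patch; stealEligibility and runnextStealDelay are illustrative names only, and the real logic works on _p_.runnextready and nanotime().

// stealEligibility is a hypothetical, standalone model of the rule runqgrab
// applies in the patch above. All arguments are monotonic-clock nanoseconds;
// readyAt == 0 stands in for "the victim has no runnext G".
package main

import (
	"fmt"
	"time"
)

const runnextStealDelay = 4000 // ns; the 4us backoff used in the patch

func stealEligibility(start, readyAt, now int64) (steal, retry bool) {
	if readyAt == 0 {
		return false, false // nothing to steal
	}
	// Readied after the thief began searching: never steal it, so the
	// thief cannot chase freshly readied G's forever.
	if start < readyAt {
		return false, false
	}
	// Readied very recently: leave it for the owning P, but tell the
	// thief to come back instead of sleeping as the old code did.
	if now < readyAt+runnextStealDelay {
		return false, true
	}
	return true, false
}

func main() {
	base := time.Now().UnixNano()
	fmt.Println(stealEligibility(base, base-10000, base))   // true false: old enough to steal
	fmt.Println(stealEligibility(base, base-1000, base))    // false true: retry later
	fmt.Println(stealEligibility(base, base+500, base+600)) // false false: readied after start
}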
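
On the findrunnable side, the fixed four-pass sweep is replaced by two phases: a loop that repeatedly sweeps all P's stealing only from run queues holding more than one G, repeating while any victim asked for a retry, followed by a single pass that considers stealing timers; with procs == 1 there is no other P to steal from, so the patch jumps straight to stop. The toy sketch below shows only the shape of the retry-driven loop; sweep, victim and countdownVictim are invented names, not runtime API, and where the real loop terminates because the 4us window expires, the toy victim expires after a fixed number of sweeps instead.

// A toy model of the retry-driven stealing loop; none of these names exist
// in the runtime.
package main

import "fmt"

type victim interface {
	// trySteal returns stolen work (0 if none) and whether the caller
	// should try this victim again shortly.
	trySteal() (work int, retry bool)
}

// sweep mirrors the new loop in findrunnable: return as soon as any steal
// succeeds, and give up only after a full pass that produced neither work
// nor a retry hint.
func sweep(victims []victim) (int, bool) {
	for {
		retry := false
		for _, v := range victims {
			if w, again := v.trySteal(); w != 0 {
				return w, true
			} else if again {
				retry = true
			}
		}
		if !retry {
			return 0, false
		}
	}
}

// countdownVictim holds one item that becomes stealable after a few sweeps,
// standing in for a runnext G whose grace period has not yet expired.
type countdownVictim struct {
	delay int
	item  int
}

func (v *countdownVictim) trySteal() (int, bool) {
	if v.item == 0 {
		return 0, false
	}
	if v.delay > 0 {
		v.delay--
		return 0, true // too fresh: ask the thief to come back
	}
	w := v.item
	v.item = 0
	return w, false
}

func main() {
	fmt.Println(sweep([]victim{&countdownVictim{delay: 3, item: 42}})) // 42 true
}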