diff --git a/compiler/image_writer.cc b/compiler/image_writer.cc index 61cf00942..2d6c4dab6 100644 --- a/compiler/image_writer.cc +++ b/compiler/image_writer.cc @@ -789,6 +789,13 @@ bool ImageWriter::PruneAppImageClassInternal( result = result || PruneAppImageClassInternal(klass->GetSuperClass(), &my_early_exit, visited); + // Remove the class if the dex file is not in the set of dex files. This happens for classes that + // are from uses library if there is no profile. b/30688277 + mirror::DexCache* dex_cache = klass->GetDexCache(); + if (dex_cache != nullptr) { + result = result || + dex_file_oat_index_map_.find(dex_cache->GetDexFile()) == dex_file_oat_index_map_.end(); + } // Erase the element we stored earlier since we are exiting the function. auto it = visited->find(klass); DCHECK(it != visited->end()); diff --git a/runtime/class_linker.cc b/runtime/class_linker.cc index a34e02908..a88291f28 100644 --- a/runtime/class_linker.cc +++ b/runtime/class_linker.cc @@ -1848,7 +1848,7 @@ void ClassLinker::VisitClassRoots(RootVisitor* visitor, VisitRootFlags flags) { boot_class_table_.VisitRoots(buffered_visitor); // If tracing is enabled, then mark all the class loaders to prevent unloading. - if (tracing_enabled) { + if ((flags & kVisitRootFlagClassLoader) != 0 || tracing_enabled) { for (const ClassLoaderData& data : class_loaders_) { GcRoot root(GcRoot(self->DecodeJObject(data.weak_root))); root.VisitRoot(visitor, RootInfo(kRootVMInternal)); @@ -2409,6 +2409,14 @@ mirror::Class* ClassLinker::FindClass(Thread* self, // We'll let the Java-side rediscover all this and throw the exception with the right stack // trace. + if (!self->CanCallIntoJava()) { + // Oops, we can't call into java so we can't run actual class-loader code. + // This is true for e.g. for the compiler (jit or aot). + mirror::Throwable* pre_allocated = + Runtime::Current()->GetPreAllocatedNoClassDefFoundError(); + self->SetException(pre_allocated); + return nullptr; + } } if (Runtime::Current()->IsAotCompiler()) { diff --git a/runtime/gc/collector/mark_sweep.cc b/runtime/gc/collector/mark_sweep.cc index 24cbf10d6..4081ec7a7 100644 --- a/runtime/gc/collector/mark_sweep.cc +++ b/runtime/gc/collector/mark_sweep.cc @@ -627,8 +627,7 @@ void MarkSweep::MarkNonThreadRoots() { void MarkSweep::MarkConcurrentRoots(VisitRootFlags flags) { TimingLogger::ScopedTiming t(__FUNCTION__, GetTimings()); // Visit all runtime roots and clear dirty flags. 
-  Runtime::Current()->VisitConcurrentRoots(
-      this, static_cast<VisitRootFlags>(flags | kVisitRootFlagNonMoving));
+  Runtime::Current()->VisitConcurrentRoots(this, flags);
 }
 
 class MarkSweep::DelayReferenceReferentVisitor {
diff --git a/runtime/gc/collector/mark_sweep.h b/runtime/gc/collector/mark_sweep.h
index 92d7cf605..f4c7ba6c9 100644
--- a/runtime/gc/collector/mark_sweep.h
+++ b/runtime/gc/collector/mark_sweep.h
@@ -98,7 +98,7 @@ class MarkSweep : public GarbageCollector {
       REQUIRES(!mark_stack_lock_)
       SHARED_REQUIRES(Locks::mutator_lock_);
 
-  void MarkConcurrentRoots(VisitRootFlags flags)
+  virtual void MarkConcurrentRoots(VisitRootFlags flags)
       REQUIRES(Locks::heap_bitmap_lock_)
       REQUIRES(!mark_stack_lock_)
       SHARED_REQUIRES(Locks::mutator_lock_);
diff --git a/runtime/gc/collector/sticky_mark_sweep.cc b/runtime/gc/collector/sticky_mark_sweep.cc
index bb7e854ea..a2dbe3f7a 100644
--- a/runtime/gc/collector/sticky_mark_sweep.cc
+++ b/runtime/gc/collector/sticky_mark_sweep.cc
@@ -56,6 +56,19 @@ void StickyMarkSweep::MarkReachableObjects() {
   RecursiveMarkDirtyObjects(false, accounting::CardTable::kCardDirty - 1);
 }
 
+void StickyMarkSweep::MarkConcurrentRoots(VisitRootFlags flags) {
+  TimingLogger::ScopedTiming t(__FUNCTION__, GetTimings());
+  // Visit all runtime roots and clear dirty flags including class loader. This is done to prevent
+  // incorrect class unloading since the GC does not card mark when storing the class during
+  // object allocation. Doing this for each allocation would be slow.
+  // Since the card is not dirty, it means the object may not get scanned. This can cause class
+  // unloading to occur even though the class and class loader are reachable through the object's
+  // class.
+  Runtime::Current()->VisitConcurrentRoots(
+      this,
+      static_cast<VisitRootFlags>(flags | kVisitRootFlagClassLoader));
+}
+
 void StickyMarkSweep::Sweep(bool swap_bitmaps ATTRIBUTE_UNUSED) {
   SweepArray(GetHeap()->GetLiveStack(), false);
 }
diff --git a/runtime/gc/collector/sticky_mark_sweep.h b/runtime/gc/collector/sticky_mark_sweep.h
index abaf97845..93b0b5fa7 100644
--- a/runtime/gc/collector/sticky_mark_sweep.h
+++ b/runtime/gc/collector/sticky_mark_sweep.h
@@ -33,6 +33,12 @@ class StickyMarkSweep FINAL : public PartialMarkSweep {
   StickyMarkSweep(Heap* heap, bool is_concurrent, const std::string& name_prefix = "");
   ~StickyMarkSweep() {}
 
+  void MarkConcurrentRoots(VisitRootFlags flags)
+      OVERRIDE
+      REQUIRES(!mark_stack_lock_)
+      REQUIRES(Locks::heap_bitmap_lock_)
+      SHARED_REQUIRES(Locks::mutator_lock_);
+
  protected:
   // Bind the live bits to the mark bits of bitmaps for all spaces, all spaces other than the
   // alloc space will be marked as immune.
diff --git a/runtime/interpreter/mterp/arm64/footer.S b/runtime/interpreter/mterp/arm64/footer.S
index 2d3a11eaf..dbcbc7146 100644
--- a/runtime/interpreter/mterp/arm64/footer.S
+++ b/runtime/interpreter/mterp/arm64/footer.S
@@ -267,13 +267,7 @@ MterpExceptionReturn:
     b       MterpDone
 MterpReturn:
     ldr     x2, [xFP, #OFF_FP_RESULT_REGISTER]
-    ldr     lr, [xSELF, #THREAD_FLAGS_OFFSET]
     str     x0, [x2]
-    mov     x0, xSELF
-    ands    lr, lr, #(THREAD_SUSPEND_REQUEST | THREAD_CHECKPOINT_REQUEST)
-    b.eq    check2
-    bl      MterpSuspendCheck           // (self)
-check2:
     mov     x0, #1                      // signal return to caller.
MterpDone: /* diff --git a/runtime/interpreter/mterp/mips64/footer.S b/runtime/interpreter/mterp/mips64/footer.S index 14d5fe01f..0545194db 100644 --- a/runtime/interpreter/mterp/mips64/footer.S +++ b/runtime/interpreter/mterp/mips64/footer.S @@ -134,13 +134,7 @@ MterpExceptionReturn: */ MterpReturn: ld a2, OFF_FP_RESULT_REGISTER(rFP) - lw ra, THREAD_FLAGS_OFFSET(rSELF) sd a0, 0(a2) - move a0, rSELF - and ra, ra, (THREAD_SUSPEND_REQUEST | THREAD_CHECKPOINT_REQUEST) - beqzc ra, check2 - jal MterpSuspendCheck # (self) -check2: li v0, 1 # signal return to caller. MterpDone: ld s5, STACK_OFFSET_S5(sp) diff --git a/runtime/interpreter/mterp/out/mterp_arm64.S b/runtime/interpreter/mterp/out/mterp_arm64.S index 55797e676..33c1abd86 100644 --- a/runtime/interpreter/mterp/out/mterp_arm64.S +++ b/runtime/interpreter/mterp/out/mterp_arm64.S @@ -11554,13 +11554,7 @@ MterpExceptionReturn: b MterpDone MterpReturn: ldr x2, [xFP, #OFF_FP_RESULT_REGISTER] - ldr lr, [xSELF, #THREAD_FLAGS_OFFSET] str x0, [x2] - mov x0, xSELF - ands lr, lr, #(THREAD_SUSPEND_REQUEST | THREAD_CHECKPOINT_REQUEST) - b.eq check2 - bl MterpSuspendCheck // (self) -check2: mov x0, #1 // signal return to caller. MterpDone: /* diff --git a/runtime/interpreter/mterp/out/mterp_mips64.S b/runtime/interpreter/mterp/out/mterp_mips64.S index a17252b2f..b96b4bdd1 100644 --- a/runtime/interpreter/mterp/out/mterp_mips64.S +++ b/runtime/interpreter/mterp/out/mterp_mips64.S @@ -12386,13 +12386,7 @@ MterpExceptionReturn: */ MterpReturn: ld a2, OFF_FP_RESULT_REGISTER(rFP) - lw ra, THREAD_FLAGS_OFFSET(rSELF) sd a0, 0(a2) - move a0, rSELF - and ra, ra, (THREAD_SUSPEND_REQUEST | THREAD_CHECKPOINT_REQUEST) - beqzc ra, check2 - jal MterpSuspendCheck # (self) -check2: li v0, 1 # signal return to caller. MterpDone: ld s5, STACK_OFFSET_S5(sp) diff --git a/runtime/jdwp/jdwp_event.cc b/runtime/jdwp/jdwp_event.cc index 06b67b3e4..0d862fe67 100644 --- a/runtime/jdwp/jdwp_event.cc +++ b/runtime/jdwp/jdwp_event.cc @@ -621,8 +621,8 @@ void JdwpState::SendRequestAndPossiblySuspend(ExpandBuf* pReq, JdwpSuspendPolicy Thread* const self = Thread::Current(); self->AssertThreadSuspensionIsAllowable(); CHECK(pReq != nullptr); + CHECK_EQ(threadId, Dbg::GetThreadSelfId()) << "Only the current thread can suspend itself"; /* send request and possibly suspend ourselves */ - JDWP::ObjectId thread_self_id = Dbg::GetThreadSelfId(); ScopedThreadSuspension sts(self, kWaitingForDebuggerSend); if (suspend_policy != SP_NONE) { AcquireJdwpTokenForEvent(threadId); @@ -631,7 +631,7 @@ void JdwpState::SendRequestAndPossiblySuspend(ExpandBuf* pReq, JdwpSuspendPolicy { // Before suspending, we change our state to kSuspended so the debugger sees us as RUNNING. 
ScopedThreadStateChange stsc(self, kSuspended); - SuspendByPolicy(suspend_policy, thread_self_id); + SuspendByPolicy(suspend_policy, threadId); } } @@ -658,13 +658,10 @@ void JdwpState::ReleaseJdwpTokenForCommand() { } void JdwpState::AcquireJdwpTokenForEvent(ObjectId threadId) { - CHECK_NE(Thread::Current(), GetDebugThread()) << "Expected event thread"; - CHECK_NE(debug_thread_id_, threadId) << "Not expected debug thread"; SetWaitForJdwpToken(threadId); } void JdwpState::ReleaseJdwpTokenForEvent() { - CHECK_NE(Thread::Current(), GetDebugThread()) << "Expected event thread"; ClearWaitForJdwpToken(); } @@ -685,23 +682,28 @@ void JdwpState::SetWaitForJdwpToken(ObjectId threadId) { /* this is held for very brief periods; contention is unlikely */ MutexLock mu(self, jdwp_token_lock_); - CHECK_NE(jdwp_token_owner_thread_id_, threadId) << "Thread is already holding event thread lock"; + if (jdwp_token_owner_thread_id_ == threadId) { + // Only the debugger thread may already hold the event token. For instance, it may trigger + // a CLASS_PREPARE event while processing a command that initializes a class. + CHECK_EQ(threadId, debug_thread_id_) << "Non-debugger thread is already holding event token"; + } else { + /* + * If another thread is already doing stuff, wait for it. This can + * go to sleep indefinitely. + */ - /* - * If another thread is already doing stuff, wait for it. This can - * go to sleep indefinitely. - */ - while (jdwp_token_owner_thread_id_ != 0) { - VLOG(jdwp) << StringPrintf("event in progress (%#" PRIx64 "), %#" PRIx64 " sleeping", - jdwp_token_owner_thread_id_, threadId); - waited = true; - jdwp_token_cond_.Wait(self); - } + while (jdwp_token_owner_thread_id_ != 0) { + VLOG(jdwp) << StringPrintf("event in progress (%#" PRIx64 "), %#" PRIx64 " sleeping", + jdwp_token_owner_thread_id_, threadId); + waited = true; + jdwp_token_cond_.Wait(self); + } - if (waited || threadId != debug_thread_id_) { - VLOG(jdwp) << StringPrintf("event token grabbed (%#" PRIx64 ")", threadId); + if (waited || threadId != debug_thread_id_) { + VLOG(jdwp) << StringPrintf("event token grabbed (%#" PRIx64 ")", threadId); + } + jdwp_token_owner_thread_id_ = threadId; } - jdwp_token_owner_thread_id_ = threadId; } /* @@ -1224,14 +1226,15 @@ void JdwpState::PostClassPrepare(mirror::Class* klass) { VLOG(jdwp) << " suspend_policy=" << suspend_policy; } - if (thread_id == debug_thread_id_) { + ObjectId reported_thread_id = thread_id; + if (reported_thread_id == debug_thread_id_) { /* * JDWP says that, for a class prep in the debugger thread, we * should set thread to null and if any threads were supposed * to be suspended then we suspend all other threads. 
      */
     VLOG(jdwp) << "  NOTE: class prepare in debugger thread!";
-    thread_id = 0;
+    reported_thread_id = 0;
     if (suspend_policy == SP_EVENT_THREAD) {
       suspend_policy = SP_ALL;
     }
@@ -1244,7 +1247,7 @@ void JdwpState::PostClassPrepare(mirror::Class* klass) {
     for (const JdwpEvent* pEvent : match_list) {
       expandBufAdd1(pReq, pEvent->eventKind);
       expandBufAdd4BE(pReq, pEvent->requestId);
-      expandBufAddObjectId(pReq, thread_id);
+      expandBufAddObjectId(pReq, reported_thread_id);
       expandBufAdd1(pReq, tag);
       expandBufAddRefTypeId(pReq, class_id);
       expandBufAddUtf8String(pReq, signature);
diff --git a/runtime/jit/jit.cc b/runtime/jit/jit.cc
index 74d99176c..8cd867d0b 100644
--- a/runtime/jit/jit.cc
+++ b/runtime/jit/jit.cc
@@ -145,7 +145,12 @@ Jit::Jit() : dump_info_on_shutdown_(false),
              memory_use_("Memory used for compilation", 16),
              lock_("JIT memory use lock"),
              use_jit_compilation_(true),
-             save_profiling_info_(false) {}
+             save_profiling_info_(false),
+             hot_method_threshold_(0),
+             warm_method_threshold_(0),
+             osr_method_threshold_(0),
+             priority_thread_weight_(0),
+             invoke_transition_weight_(0) {}
 
 Jit* Jit::Create(JitOptions* options, std::string* error_msg) {
   DCHECK(options->UseJitCompilation() || options->GetSaveProfilingInfo());
@@ -280,7 +285,11 @@ bool Jit::CompileMethod(ArtMethod* method, Thread* self, bool osr) {
 void Jit::CreateThreadPool() {
   // There is a DCHECK in the 'AddSamples' method to ensure the thread pool
   // is not null when we instrument.
-  thread_pool_.reset(new ThreadPool("Jit thread pool", 1));
+
+  // We need peers as we may report the JIT thread, e.g., in the debugger.
+  constexpr bool kJitPoolNeedsPeers = true;
+  thread_pool_.reset(new ThreadPool("Jit thread pool", 1, kJitPoolNeedsPeers));
+
   thread_pool_->SetPthreadPriority(kJitPoolThreadPthreadPriority);
   thread_pool_->StartWorkers(Thread::Current());
 }
diff --git a/runtime/jit/jit_code_cache.cc b/runtime/jit/jit_code_cache.cc
index 8c1d7768c..d00ca5af6 100644
--- a/runtime/jit/jit_code_cache.cc
+++ b/runtime/jit/jit_code_cache.cc
@@ -342,10 +342,16 @@ uint8_t* JitCodeCache::CommitCodeInternal(Thread* self,
                             core_spill_mask,
                             fp_spill_mask,
                             code_size);
+      // Flush caches before we remove write permission because on some ARMv8 hardware,
+      // flushing caches requires write permissions.
+      //
+      // For reference, here are kernel patches discussing this issue:
+      // https://android.googlesource.com/kernel/msm/%2B/0e7f7bcc3fc87489cda5aa6aff8ce40eed912279
+      // https://patchwork.kernel.org/patch/9047921/
+      FlushInstructionCache(reinterpret_cast<char*>(code_ptr),
+                            reinterpret_cast<char*>(code_ptr + code_size));
     }
 
-    FlushInstructionCache(reinterpret_cast<char*>(code_ptr),
-                          reinterpret_cast<char*>(code_ptr + code_size));
     number_of_compilations_++;
   }
   // We need to update the entry point in the runnable state for the instrumentation.
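The jit_code_cache.cc hunk above reorders the instruction-cache flush so that it happens while the code pages are still writable. A minimal standalone sketch of that ordering constraint, not ART's actual CommitCodeInternal path; CommitExecutableCode, the mmap'd region, and the PROT_* transitions are illustrative assumptions:

    #include <sys/mman.h>
    #include <cstdint>
    #include <cstring>

    // Copy generated code into an RWX region, flush the instruction cache while the
    // pages are still writable (some ARMv8 kernels fault otherwise), then drop to R+X.
    bool CommitExecutableCode(void* region, size_t region_size,
                              const uint8_t* code, size_t code_size) {
      if (mprotect(region, region_size, PROT_READ | PROT_WRITE | PROT_EXEC) != 0) {
        return false;
      }
      std::memcpy(region, code, code_size);
      char* begin = reinterpret_cast<char*>(region);
      __builtin___clear_cache(begin, begin + code_size);  // Flush before unprotecting.
      return mprotect(region, region_size, PROT_READ | PROT_EXEC) == 0;
    }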
diff --git a/runtime/os_linux.cc b/runtime/os_linux.cc index f45e9f603..b628846d6 100644 --- a/runtime/os_linux.cc +++ b/runtime/os_linux.cc @@ -54,7 +54,7 @@ File* OS::CreateEmptyFileWriteOnly(const char* name) { File* OS::OpenFileWithFlags(const char* name, int flags) { CHECK(name != nullptr); std::unique_ptr file(new File); - if (!file->Open(name, flags, 0666)) { + if (!file->Open(name, flags, S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH)) { return nullptr; } return file.release(); diff --git a/runtime/reference_table.cc b/runtime/reference_table.cc index d4bd6dd24..62e23ed92 100644 --- a/runtime/reference_table.cc +++ b/runtime/reference_table.cc @@ -215,33 +215,87 @@ void ReferenceTable::Dump(std::ostream& os, Table& entries) { } std::sort(sorted_entries.begin(), sorted_entries.end(), GcRootComparator()); - // Dump a summary of the whole table. - os << " Summary:\n"; - size_t equiv = 0; - size_t identical = 0; - mirror::Object* prev = nullptr; - for (GcRoot& root : sorted_entries) { - mirror::Object* current = root.Read(); - if (prev != nullptr) { - const size_t element_count = GetElementCount(prev); - if (current == prev) { + class SummaryElement { + public: + GcRoot root; + size_t equiv; + size_t identical; + + SummaryElement() : equiv(0), identical(0) {} + SummaryElement(SummaryElement&& ref) { + root = ref.root; + equiv = ref.equiv; + identical = ref.identical; + } + SummaryElement(const SummaryElement&) = default; + SummaryElement& operator=(SummaryElement&&) = default; + + void Reset(GcRoot& _root) { + root = _root; + equiv = 0; + identical = 0; + } + }; + std::vector sorted_summaries; + { + SummaryElement prev; + + for (GcRoot& root : sorted_entries) { + mirror::Object* current = root.Read(); + + if (UNLIKELY(prev.root.IsNull())) { + prev.Reset(root); + continue; + } + + mirror::Object* prevObj = prev.root.Read(); + if (current == prevObj) { // Same reference, added more than once. - ++identical; - } else if (current->GetClass() == prev->GetClass() && - GetElementCount(current) == element_count) { + ++prev.identical; + } else if (current->GetClass() == prevObj->GetClass() && + GetElementCount(current) == GetElementCount(prevObj)) { // Same class / element count, different object. - ++equiv; + ++prev.equiv; } else { - // Different class. - DumpSummaryLine(os, prev, element_count, identical, equiv); - equiv = 0; - identical = 0; + sorted_summaries.push_back(prev); + prev.Reset(root); } + prev.root = root; } - prev = current; + sorted_summaries.push_back(prev); + + // Compare summary elements, first by combined count, then by identical (indicating leaks), + // then by class (and size and address). + struct SummaryElementComparator { + GcRootComparator gc_root_cmp; + + bool operator()(SummaryElement& elem1, SummaryElement& elem2) const + NO_THREAD_SAFETY_ANALYSIS { + Locks::mutator_lock_->AssertSharedHeld(Thread::Current()); + + size_t count1 = elem1.equiv + elem1.identical; + size_t count2 = elem2.equiv + elem2.identical; + if (count1 != count2) { + return count1 > count2; + } + + if (elem1.identical != elem2.identical) { + return elem1.identical > elem2.identical; + } + + // Otherwise, compare the GC roots as before. + return gc_root_cmp(elem1.root, elem2.root); + } + }; + std::sort(sorted_summaries.begin(), sorted_summaries.end(), SummaryElementComparator()); + } + + // Dump a summary of the whole table. 
+ os << " Summary:\n"; + for (SummaryElement& elem : sorted_summaries) { + mirror::Object* elemObj = elem.root.Read(); + DumpSummaryLine(os, elemObj, GetElementCount(elemObj), elem.identical, elem.equiv); } - // Handle the last entry. - DumpSummaryLine(os, prev, GetElementCount(prev), identical, equiv); } void ReferenceTable::VisitRoots(RootVisitor* visitor, const RootInfo& root_info) { diff --git a/runtime/reference_table_test.cc b/runtime/reference_table_test.cc index fae8e722c..c48edbe0c 100644 --- a/runtime/reference_table_test.cc +++ b/runtime/reference_table_test.cc @@ -106,4 +106,77 @@ TEST_F(ReferenceTableTest, Basics) { } } +static std::vector FindAll(const std::string& haystack, const char* needle) { + std::vector res; + size_t start = 0; + do { + size_t pos = haystack.find(needle, start); + if (pos == std::string::npos) { + break; + } + res.push_back(pos); + start = pos + 1; + } while (start < haystack.size()); + return res; +} + +TEST_F(ReferenceTableTest, SummaryOrder) { + // Check that the summary statistics are sorted. + ScopedObjectAccess soa(Thread::Current()); + + ReferenceTable rt("test", 0, 20); + + { + mirror::Object* s1 = mirror::String::AllocFromModifiedUtf8(soa.Self(), "hello"); + mirror::Object* s2 = mirror::String::AllocFromModifiedUtf8(soa.Self(), "world"); + + // 3 copies of s1, 2 copies of s2, interleaved. + for (size_t i = 0; i != 2; ++i) { + rt.Add(s1); + rt.Add(s2); + } + rt.Add(s1); + } + + { + // Differently sized byte arrays. Should be sorted by identical (non-unique cound). + mirror::Object* b1_1 = mirror::ByteArray::Alloc(soa.Self(), 1); + rt.Add(b1_1); + rt.Add(mirror::ByteArray::Alloc(soa.Self(), 2)); + rt.Add(b1_1); + rt.Add(mirror::ByteArray::Alloc(soa.Self(), 2)); + rt.Add(mirror::ByteArray::Alloc(soa.Self(), 1)); + rt.Add(mirror::ByteArray::Alloc(soa.Self(), 2)); + } + + rt.Add(mirror::CharArray::Alloc(soa.Self(), 0)); + + // Now dump, and ensure order. + std::ostringstream oss; + rt.Dump(oss); + + // Only do this on the part after Summary. + std::string base = oss.str(); + size_t summary_pos = base.find("Summary:"); + ASSERT_NE(summary_pos, std::string::npos); + + std::string haystack = base.substr(summary_pos); + + std::vector strCounts = FindAll(haystack, "java.lang.String"); + std::vector b1Counts = FindAll(haystack, "byte[] (1 elements)"); + std::vector b2Counts = FindAll(haystack, "byte[] (2 elements)"); + std::vector cCounts = FindAll(haystack, "char[]"); + + // Only one each. + EXPECT_EQ(1u, strCounts.size()); + EXPECT_EQ(1u, b1Counts.size()); + EXPECT_EQ(1u, b2Counts.size()); + EXPECT_EQ(1u, cCounts.size()); + + // Expect them to be in order. + EXPECT_LT(strCounts[0], b1Counts[0]); + EXPECT_LT(b1Counts[0], b2Counts[0]); + EXPECT_LT(b2Counts[0], cCounts[0]); +} + } // namespace art diff --git a/runtime/runtime.cc b/runtime/runtime.cc index f4c28b9a5..6d0d6ed7f 100644 --- a/runtime/runtime.cc +++ b/runtime/runtime.cc @@ -583,24 +583,6 @@ bool Runtime::Start() { started_ = true; - // Create the JIT either if we have to use JIT compilation or save profiling info. - // TODO(calin): We use the JIT class as a proxy for JIT compilation and for - // recoding profiles. Maybe we should consider changing the name to be more clear it's - // not only about compiling. b/28295073. - if (jit_options_->UseJitCompilation() || jit_options_->GetSaveProfilingInfo()) { - std::string error_msg; - if (!IsZygote()) { - // If we are the zygote then we need to wait until after forking to create the code cache - // due to SELinux restrictions on r/w/x memory regions. 
- CreateJit(); - } else if (jit_options_->UseJitCompilation()) { - if (!jit::Jit::LoadCompilerLibrary(&error_msg)) { - // Try to load compiler pre zygote to reduce PSS. b/27744947 - LOG(WARNING) << "Failed to load JIT compiler with error " << error_msg; - } - } - } - if (!IsImageDex2OatEnabled() || !GetHeap()->HasBootImageSpace()) { ScopedObjectAccess soa(self); StackHandleScope<2> hs(soa.Self()); @@ -625,6 +607,27 @@ bool Runtime::Start() { Thread::FinishStartup(); + // Create the JIT either if we have to use JIT compilation or save profiling info. This is + // done after FinishStartup as the JIT pool needs Java thread peers, which require the main + // ThreadGroup to exist. + // + // TODO(calin): We use the JIT class as a proxy for JIT compilation and for + // recoding profiles. Maybe we should consider changing the name to be more clear it's + // not only about compiling. b/28295073. + if (jit_options_->UseJitCompilation() || jit_options_->GetSaveProfilingInfo()) { + std::string error_msg; + if (!IsZygote()) { + // If we are the zygote then we need to wait until after forking to create the code cache + // due to SELinux restrictions on r/w/x memory regions. + CreateJit(); + } else if (jit_options_->UseJitCompilation()) { + if (!jit::Jit::LoadCompilerLibrary(&error_msg)) { + // Try to load compiler pre zygote to reduce PSS. b/27744947 + LOG(WARNING) << "Failed to load JIT compiler with error " << error_msg; + } + } + } + system_class_loader_ = CreateSystemClassLoader(this); if (is_zygote_) { @@ -1146,6 +1149,8 @@ bool Runtime::Init(RuntimeArgumentMap&& runtime_options_in) { CHECK_EQ(self->GetThreadId(), ThreadList::kMainThreadId); CHECK(self != nullptr); + self->SetCanCallIntoJava(!IsAotCompiler()); + // Set us to runnable so tools using a runtime can allocate and GC by default self->TransitionFromSuspendedToRunnable(); diff --git a/runtime/runtime.h b/runtime/runtime.h index 3b72aa7b3..f0cbed67c 100644 --- a/runtime/runtime.h +++ b/runtime/runtime.h @@ -105,9 +105,7 @@ enum VisitRootFlags : uint8_t { kVisitRootFlagStartLoggingNewRoots = 0x4, kVisitRootFlagStopLoggingNewRoots = 0x8, kVisitRootFlagClearRootLog = 0x10, - // Non moving means we can have optimizations where we don't visit some roots if they are - // definitely reachable from another location. E.g. ArtMethod and ArtField roots. - kVisitRootFlagNonMoving = 0x20, + kVisitRootFlagClassLoader = 0x20, }; class Runtime { diff --git a/runtime/thread.cc b/runtime/thread.cc index 16f30cded..f6b89bc53 100644 --- a/runtime/thread.cc +++ b/runtime/thread.cc @@ -546,27 +546,40 @@ void Thread::InstallImplicitProtection() { // // We map in the stack by reading every page from the stack bottom (highest address) // to the stack top. (We then madvise this away.) This must be done by reading from the - // current stack pointer downwards. Any access more than a page below the current SP - // might cause a segv. - // TODO: This comment may be out of date. It seems possible to speed this up. As - // this is normally done once in the zygote on startup, ignore for now. + // current stack pointer downwards. // - // AddressSanitizer does not like the part of this functions that reads every stack page. - // Looks a lot like an out-of-bounds access. + // Accesses too far below the current machine register corresponding to the stack pointer (e.g., + // ESP on x86[-32], SP on ARM) might cause a SIGSEGV (at least on x86 with newer kernels). We + // thus have to move the stack pointer. 
We do this portably by using a recursive function with a + // large stack frame size. - // (Defensively) first remove the protection on the protected region as will want to read + // (Defensively) first remove the protection on the protected region as we'll want to read // and write it. Ignore errors. UnprotectStack(); VLOG(threads) << "Need to map in stack for thread at " << std::hex << static_cast(pregion); - // Read every page from the high address to the low. - volatile uint8_t dont_optimize_this; - UNUSED(dont_optimize_this); - for (uint8_t* p = stack_top; p >= pregion; p -= kPageSize) { - dont_optimize_this = *p; - } + struct RecurseDownStack { + // This function has an intentionally large stack size. +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wframe-larger-than=" + NO_INLINE + static void Touch(uintptr_t target) { + volatile size_t zero = 0; + // Use a large local volatile array to ensure a large frame size. Do not use anything close + // to a full page for ASAN. It would be nice to ensure the frame size is at most a page, but + // there is no pragma support for this. + volatile char space[kPageSize - 256]; + char sink ATTRIBUTE_UNUSED = space[zero]; + if (reinterpret_cast(space) >= target + kPageSize) { + Touch(target); + } + zero *= 2; // Try to avoid tail recursion. + } +#pragma GCC diagnostic pop + }; + RecurseDownStack::Touch(reinterpret_cast(pregion)); VLOG(threads) << "(again) installing stack protected region at " << std::hex << static_cast(pregion) << " to " << @@ -1602,7 +1615,11 @@ void Thread::Shutdown() { } } -Thread::Thread(bool daemon) : tls32_(daemon), wait_monitor_(nullptr), interrupted_(false) { +Thread::Thread(bool daemon) + : tls32_(daemon), + wait_monitor_(nullptr), + interrupted_(false), + can_call_into_java_(true) { wait_mutex_ = new Mutex("a thread wait mutex"); wait_cond_ = new ConditionVariable("a thread wait condition variable", *wait_mutex_); tlsPtr_.instrumentation_stack = new std::deque; diff --git a/runtime/thread.h b/runtime/thread.h index 582a0cdbd..4c284247a 100644 --- a/runtime/thread.h +++ b/runtime/thread.h @@ -864,6 +864,15 @@ class Thread { --tls32_.disable_thread_flip_count; } + // Returns true if the thread is allowed to call into java. + bool CanCallIntoJava() const { + return can_call_into_java_; + } + + void SetCanCallIntoJava(bool can_call_into_java) { + can_call_into_java_ = can_call_into_java; + } + // Activates single step control for debugging. The thread takes the // ownership of the given SingleStepControl*. It is deleted by a call // to DeactivateSingleStepControl or upon thread destruction. @@ -1518,6 +1527,10 @@ class Thread { // Debug disable read barrier count, only is checked for debug builds and only in the runtime. uint8_t debug_disallow_read_barrier_ = 0; + // True if the thread is allowed to call back into java (for e.g. during class resolution). + // By default this is true. + bool can_call_into_java_; + friend class Dbg; // For SetStateUnsafe. friend class gc::collector::SemiSpace; // For getting stack traces. friend class Runtime; // For CreatePeer. 
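The new Thread::CanCallIntoJava() flag added above is consulted before any path that would re-enter managed code, as in the ClassLinker::FindClass change earlier in this patch. A rough sketch of that guard pattern; ResolveViaUserClassLoader and CallLoadClass are hypothetical stand-ins for the real resolution path, while SetException and GetPreAllocatedNoClassDefFoundError are the calls used in the patch:

    #include "mirror/class.h"
    #include "mirror/throwable.h"
    #include "runtime.h"
    #include "thread.h"

    // Threads that must not run managed code (AOT/JIT compiler threads, thread pool
    // workers) fail fast with a pre-allocated error instead of calling loadClass().
    mirror::Class* ResolveViaUserClassLoader(Thread* self, const char* descriptor) {
      if (!self->CanCallIntoJava()) {
        mirror::Throwable* pre_allocated =
            Runtime::Current()->GetPreAllocatedNoClassDefFoundError();
        self->SetException(pre_allocated);
        return nullptr;
      }
      return CallLoadClass(self, descriptor);  // Hypothetical managed slow path.
    }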
diff --git a/runtime/thread_pool.cc b/runtime/thread_pool.cc
index 2fba805fa..c03fb2eea 100644
--- a/runtime/thread_pool.cc
+++ b/runtime/thread_pool.cc
@@ -84,7 +84,12 @@ void ThreadPoolWorker::Run() {
 void* ThreadPoolWorker::Callback(void* arg) {
   ThreadPoolWorker* worker = reinterpret_cast<ThreadPoolWorker*>(arg);
   Runtime* runtime = Runtime::Current();
-  CHECK(runtime->AttachCurrentThread(worker->name_.c_str(), true, nullptr, false));
+  CHECK(runtime->AttachCurrentThread(worker->name_.c_str(),
+                                     true,
+                                     nullptr,
+                                     worker->thread_pool_->create_peers_));
+  // Thread pool workers cannot call into java.
+  Thread::Current()->SetCanCallIntoJava(false);
   // Do work until it's time to shut down.
   worker->Run();
   runtime->DetachCurrentThread();
@@ -105,7 +110,7 @@ void ThreadPool::RemoveAllTasks(Thread* self) {
   tasks_.clear();
 }
 
-ThreadPool::ThreadPool(const char* name, size_t num_threads)
+ThreadPool::ThreadPool(const char* name, size_t num_threads, bool create_peers)
     : name_(name),
       task_queue_lock_("task queue lock"),
       task_queue_condition_("task queue condition", task_queue_lock_),
@@ -117,7 +122,8 @@ ThreadPool::ThreadPool(const char* name, size_t num_threads)
       total_wait_time_(0),
       // Add one since the caller of constructor waits on the barrier too.
       creation_barier_(num_threads + 1),
-      max_active_workers_(num_threads) {
+      max_active_workers_(num_threads),
+      create_peers_(create_peers) {
   Thread* self = Thread::Current();
   while (GetThreadCount() < num_threads) {
     const std::string worker_name = StringPrintf("%s worker thread %zu", name_.c_str(),
@@ -210,6 +216,7 @@ Task* ThreadPool::TryGetTaskLocked() {
 
 void ThreadPool::Wait(Thread* self, bool do_work, bool may_hold_locks) {
   if (do_work) {
+    CHECK(!create_peers_);
     Task* task = nullptr;
     while ((task = TryGetTask(self)) != nullptr) {
       task->Run(self);
diff --git a/runtime/thread_pool.h b/runtime/thread_pool.h
index b6c6f02db..739ac66ec 100644
--- a/runtime/thread_pool.h
+++ b/runtime/thread_pool.h
@@ -77,6 +77,7 @@ class ThreadPoolWorker {
   DISALLOW_COPY_AND_ASSIGN(ThreadPoolWorker);
 };
 
+// Note that thread pool workers will set Thread#setCanCallIntoJava to false.
 class ThreadPool {
  public:
   // Returns the number of threads in the thread pool.
@@ -97,10 +98,16 @@ class ThreadPool {
   // Remove all tasks in the queue.
   void RemoveAllTasks(Thread* self) REQUIRES(!task_queue_lock_);
 
-  ThreadPool(const char* name, size_t num_threads);
+  // Create a named thread pool with the given number of threads.
+  //
+  // If create_peers is true, all worker threads will have a Java peer object. Note that if the
+  // pool is asked to do work on the current thread (see Wait), a peer may not be available. Wait
+  // will conservatively abort if create_peers and do_work are true.
+  ThreadPool(const char* name, size_t num_threads, bool create_peers = false);
   virtual ~ThreadPool();
 
   // Wait for all tasks currently on queue to get completed.
+  // When the pool was created with peers for workers, do_work must not be true (see ThreadPool()).
void Wait(Thread* self, bool do_work, bool may_hold_locks) REQUIRES(!task_queue_lock_); size_t GetTaskCount(Thread* self) REQUIRES(!task_queue_lock_); @@ -146,6 +153,7 @@ class ThreadPool { uint64_t total_wait_time_; Barrier creation_barier_; size_t max_active_workers_ GUARDED_BY(task_queue_lock_); + const bool create_peers_; private: friend class ThreadPoolWorker; diff --git a/runtime/thread_pool_test.cc b/runtime/thread_pool_test.cc index d5f17d17b..23d04d195 100644 --- a/runtime/thread_pool_test.cc +++ b/runtime/thread_pool_test.cc @@ -20,6 +20,7 @@ #include "atomic.h" #include "common_runtime_test.h" +#include "scoped_thread_state_change.h" #include "thread-inl.h" namespace art { @@ -136,4 +137,55 @@ TEST_F(ThreadPoolTest, RecursiveTest) { EXPECT_EQ((1 << depth) - 1, count.LoadSequentiallyConsistent()); } +class PeerTask : public Task { + public: + PeerTask() {} + + void Run(Thread* self) { + ScopedObjectAccess soa(self); + CHECK(self->GetPeer() != nullptr); + } + + void Finalize() { + delete this; + } +}; + +class NoPeerTask : public Task { + public: + NoPeerTask() {} + + void Run(Thread* self) { + ScopedObjectAccess soa(self); + CHECK(self->GetPeer() == nullptr); + } + + void Finalize() { + delete this; + } +}; + +// Tests for create_peer functionality. +TEST_F(ThreadPoolTest, PeerTest) { + Thread* self = Thread::Current(); + { + ThreadPool thread_pool("Thread pool test thread pool", 1); + thread_pool.AddTask(self, new NoPeerTask()); + thread_pool.StartWorkers(self); + thread_pool.Wait(self, false, false); + } + + { + // To create peers, the runtime needs to be started. + self->TransitionFromSuspendedToRunnable(); + bool started = runtime_->Start(); + ASSERT_TRUE(started); + + ThreadPool thread_pool("Thread pool test thread pool", 1, true); + thread_pool.AddTask(self, new PeerTask()); + thread_pool.StartWorkers(self); + thread_pool.Wait(self, false, false); + } +} + } // namespace art diff --git a/test/141-class-unload/expected.txt b/test/141-class-unload/expected.txt index 11de660c4..ae5f51186 100644 --- a/test/141-class-unload/expected.txt +++ b/test/141-class-unload/expected.txt @@ -22,3 +22,4 @@ JNI_OnLoad called class null false test JNI_OnUnload called Number of loaded unload-ex maps 0 +Too small false diff --git a/test/141-class-unload/src/Main.java b/test/141-class-unload/src/Main.java index 17a6049db..3a05302fd 100644 --- a/test/141-class-unload/src/Main.java +++ b/test/141-class-unload/src/Main.java @@ -49,6 +49,8 @@ public static void main(String[] args) throws Exception { stressTest(constructor); // Test that the oat files are unloaded. testOatFilesUnloaded(getPid()); + // Test that objects keep class loader live for sticky GC. + testStickyUnload(constructor); } catch (Exception e) { e.printStackTrace(); } @@ -152,6 +154,30 @@ private static WeakReference setUpUnloadClass(Constructor constructor) th return new WeakReference(intHolder); } + private static Object allocObjectInOtherClassLoader(Constructor constructor) + throws Exception { + ClassLoader loader = (ClassLoader) constructor.newInstance( + DEX_FILE, LIBRARY_SEARCH_PATH, ClassLoader.getSystemClassLoader()); + return loader.loadClass("IntHolder").newInstance(); + } + + // Regression test for public issue 227182. + private static void testStickyUnload(Constructor constructor) throws Exception { + String s = ""; + for (int i = 0; i < 10; ++i) { + s = ""; + // The object is the only thing preventing the class loader from being unloaded. 
+ Object o = allocObjectInOtherClassLoader(constructor); + for (int j = 0; j < 1000; ++j) { + s += j + " "; + } + // Make sure the object still has a valid class (hasn't been incorrectly unloaded). + s += o.getClass().getName(); + o = null; + } + System.out.println("Too small " + (s.length() < 1000)); + } + private static WeakReference setUpUnloadLoader(Constructor constructor, boolean waitForCompilation) throws Exception {
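Taken together, the thread-pool changes in this patch imply a simple usage contract for peered pools, exercised by the PeerTest above. A hedged usage sketch; RunOnPeeredPool is a hypothetical helper, while the ThreadPool and Task APIs are the ones introduced or used by this patch:

    #include "thread-inl.h"
    #include "thread_pool.h"

    // A pool whose workers need Java peers (e.g., so the debugger can report them)
    // is created with create_peers = true; never drain such a pool with do_work = true,
    // because the waiting thread itself may have no peer (Wait CHECKs against this).
    void RunOnPeeredPool(Thread* self, Task* task) {
      ThreadPool pool("peered pool", /*num_threads=*/1, /*create_peers=*/true);
      pool.AddTask(self, task);
      pool.StartWorkers(self);
      pool.Wait(self, /*do_work=*/false, /*may_hold_locks=*/false);
    }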