From bc1a8722f6ade50fae6162708cea7c0b6b5d23fe Mon Sep 17 00:00:00 2001 From: Aman Khalid Date: Wed, 22 Jun 2022 13:53:14 -0700 Subject: [PATCH] Enable fake hot/cold splitting on ARM64 (#70708) This commit contains fixes for various bugs exposed by enabling fake hot/cold splitting on ARM64: - Branches between hot/cold sections are now always long. - The pseudoinstruction for loading a constant from the cold section did not support loading 16-byte data into vector registers, as it temporarily loaded the constant into an 8-byte integer register. Now, 16-byte constants are loaded directly into vector registers via an `ld1` instruction. - Asserts/NYIs blocking hot/cold splitting on ARM64 have been removed. Fake hot/cold splitting requires we fake unwind info by treating each split function as one hot section. A more architecture-agnostic approach for this has been applied. To facilitate this approach, the fake-splitting implementation has been revised to place the hot and cold sections contiguously in memory (immediately followed by the read-only data section on ARM64). --- src/coreclr/jit/compiler.cpp | 6 +- src/coreclr/jit/compiler.h | 6 +- src/coreclr/jit/ee_il_dll.cpp | 62 ++++++-- src/coreclr/jit/emit.cpp | 39 +---- src/coreclr/jit/emit.h | 2 +- src/coreclr/jit/emitarm64.cpp | 264 +++++++++++++++++++++----------- src/coreclr/jit/unwind.cpp | 18 ++- src/coreclr/jit/unwindamd64.cpp | 67 +++----- src/coreclr/jit/unwindarm.cpp | 45 ++++-- src/coreclr/jit/unwindx86.cpp | 63 +++----- 10 files changed, 330 insertions(+), 242 deletions(-) diff --git a/src/coreclr/jit/compiler.cpp b/src/coreclr/jit/compiler.cpp index 0ac148774aaf6..460412d84597e 100644 --- a/src/coreclr/jit/compiler.cpp +++ b/src/coreclr/jit/compiler.cpp @@ -3199,10 +3199,10 @@ void Compiler::compInitOptions(JitFlags* jitFlags) opts.compProcedureSplitting = jitFlags->IsSet(JitFlags::JIT_FLAG_PROCSPLIT) || enableFakeSplitting; -#ifdef TARGET_ARM64 - // TODO-ARM64-NYI: enable hot/cold splitting +#ifdef TARGET_LOONGARCH64 + // Hot/cold splitting is not being tested on LoongArch64. opts.compProcedureSplitting = false; -#endif // TARGET_ARM64 +#endif // TARGET_LOONGARCH64 #ifdef DEBUG opts.compProcedureSplittingEH = opts.compProcedureSplitting; diff --git a/src/coreclr/jit/compiler.h b/src/coreclr/jit/compiler.h index df9fcec6a78ab..b5bfdf926f212 100644 --- a/src/coreclr/jit/compiler.h +++ b/src/coreclr/jit/compiler.h @@ -7660,7 +7660,7 @@ class Compiler // ICorJitInfo wrappers - void eeAllocMem(AllocMemArgs* args); + void eeAllocMem(AllocMemArgs* args, const UNATIVE_OFFSET roDataSectionAlignment); void eeReserveUnwindInfo(bool isFunclet, bool isColdCode, ULONG unwindSize); @@ -8017,10 +8017,6 @@ XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX void unwindReserveFuncHelper(FuncInfoDsc* func, bool isHotCode); void unwindEmitFuncHelper(FuncInfoDsc* func, void* pHotCode, void* pColdCode, bool isHotCode); -#ifdef DEBUG - void fakeUnwindEmitFuncHelper(FuncInfoDsc* func, void* pHotCode); -#endif // DEBUG - #endif // TARGET_AMD64 || (TARGET_X86 && FEATURE_EH_FUNCLETS) UNATIVE_OFFSET unwindGetCurrentOffset(FuncInfoDsc* func); diff --git a/src/coreclr/jit/ee_il_dll.cpp b/src/coreclr/jit/ee_il_dll.cpp index f8c437e326694..d09bffa0a5e9a 100644 --- a/src/coreclr/jit/ee_il_dll.cpp +++ b/src/coreclr/jit/ee_il_dll.cpp @@ -1122,34 +1122,64 @@ void Compiler::eeDispLineInfos() * (e.g., host AMD64, target ARM64), then VM will get confused anyway. 
*/ -void Compiler::eeAllocMem(AllocMemArgs* args) +void Compiler::eeAllocMem(AllocMemArgs* args, const UNATIVE_OFFSET roDataSectionAlignment) { #ifdef DEBUG - const UNATIVE_OFFSET hotSizeRequest = args->hotCodeSize; - const UNATIVE_OFFSET coldSizeRequest = args->coldCodeSize; - // Fake splitting implementation: place hot/cold code in contiguous section - if (JitConfig.JitFakeProcedureSplitting() && (coldSizeRequest > 0)) + // Fake splitting implementation: place hot/cold code in contiguous section. + UNATIVE_OFFSET coldCodeOffset = 0; + if (JitConfig.JitFakeProcedureSplitting() && (args->coldCodeSize > 0)) { - args->hotCodeSize = hotSizeRequest + coldSizeRequest; + coldCodeOffset = args->hotCodeSize; + assert(coldCodeOffset > 0); + args->hotCodeSize += args->coldCodeSize; args->coldCodeSize = 0; } -#endif + +#endif // DEBUG + +#if defined(TARGET_ARM64) || defined(TARGET_LOONGARCH64) + + // For arm64/LoongArch64, we want to allocate JIT data always adjacent to code similar to what native compiler does. + // This way allows us to use a single `ldr` to access such data like float constant/jmp table. + // For LoongArch64 using `pcaddi + ld` to access such data. + + UNATIVE_OFFSET roDataAlignmentDelta = 0; + if (args->roDataSize > 0) + { + roDataAlignmentDelta = AlignmentPad(args->hotCodeSize, roDataSectionAlignment); + } + + const UNATIVE_OFFSET roDataOffset = args->hotCodeSize + roDataAlignmentDelta; + args->hotCodeSize = roDataOffset + args->roDataSize; + args->roDataSize = 0; + +#endif // defined(TARGET_ARM64) || defined(TARGET_LOONGARCH64) info.compCompHnd->allocMem(args); #ifdef DEBUG - if (JitConfig.JitFakeProcedureSplitting() && (coldSizeRequest > 0)) - { - // Fix up hot/cold code pointers - args->coldCodeBlock = ((BYTE*)args->hotCodeBlock) + hotSizeRequest; - args->coldCodeBlockRW = ((BYTE*)args->hotCodeBlockRW) + hotSizeRequest; - // Reset args' hot/cold code sizes in case caller reads them later - args->hotCodeSize = hotSizeRequest; - args->coldCodeSize = coldSizeRequest; + if (JitConfig.JitFakeProcedureSplitting() && (coldCodeOffset > 0)) + { + // Fix up cold code pointers. Cold section is adjacent to hot section. + assert(args->coldCodeBlock == nullptr); + assert(args->coldCodeBlockRW == nullptr); + args->coldCodeBlock = ((BYTE*)args->hotCodeBlock) + coldCodeOffset; + args->coldCodeBlockRW = ((BYTE*)args->hotCodeBlockRW) + coldCodeOffset; } -#endif + +#endif // DEBUG + +#if defined(TARGET_ARM64) || defined(TARGET_LOONGARCH64) + + // Fix up data section pointers. + assert(args->roDataBlock == nullptr); + assert(args->roDataBlockRW == nullptr); + args->roDataBlock = ((BYTE*)args->hotCodeBlock) + roDataOffset; + args->roDataBlockRW = ((BYTE*)args->hotCodeBlockRW) + roDataOffset; + +#endif // defined(TARGET_ARM64) || defined(TARGET_LOONGARCH64) } void Compiler::eeReserveUnwindInfo(bool isFunclet, bool isColdCode, ULONG unwindSize) diff --git a/src/coreclr/jit/emit.cpp b/src/coreclr/jit/emit.cpp index ac9bd8121aa4f..eb846e06a0dc6 100644 --- a/src/coreclr/jit/emit.cpp +++ b/src/coreclr/jit/emit.cpp @@ -4561,7 +4561,6 @@ void emitter::emitJumpDistBind() else if (emitIsUncondJump(jmp)) { // Nothing to do; we don't shrink these. 
- assert(jmp->idjShort); ssz = JMP_SIZE_SMALL; } else if (emitIsLoadLabel(jmp)) @@ -6350,47 +6349,13 @@ unsigned emitter::emitEndCodeGen(Compiler* comp, AllocMemArgs args; memset(&args, 0, sizeof(args)); -#if defined(TARGET_ARM64) || defined(TARGET_LOONGARCH64) - // For arm64/LoongArch64, we want to allocate JIT data always adjacent to code similar to what native compiler does. - // This way allows us to use a single `ldr` to access such data like float constant/jmp table. - // For LoongArch64 using `pcaddi + ld` to access such data. - if (emitTotalColdCodeSize > 0) - { - // JIT data might be far away from the cold code. - NYI("Need to handle fix-up to data from cold code."); - } - - UNATIVE_OFFSET roDataAlignmentDelta = 0; - if (emitConsDsc.dsdOffs > 0) - { - roDataAlignmentDelta = AlignmentPad(emitTotalHotCodeSize, dataAlignment); - } - - args.hotCodeSize = emitTotalHotCodeSize + roDataAlignmentDelta + emitConsDsc.dsdOffs; - args.coldCodeSize = emitTotalColdCodeSize; - args.roDataSize = 0; - args.xcptnsCount = xcptnsCount; - args.flag = allocMemFlag; - - emitComp->eeAllocMem(&args); - - codeBlock = (BYTE*)args.hotCodeBlock; - codeBlockRW = (BYTE*)args.hotCodeBlockRW; - coldCodeBlock = (BYTE*)args.coldCodeBlock; - coldCodeBlockRW = (BYTE*)args.coldCodeBlockRW; - - consBlock = codeBlock + emitTotalHotCodeSize + roDataAlignmentDelta; - consBlockRW = codeBlockRW + emitTotalHotCodeSize + roDataAlignmentDelta; - -#else - args.hotCodeSize = emitTotalHotCodeSize; args.coldCodeSize = emitTotalColdCodeSize; args.roDataSize = emitConsDsc.dsdOffs; args.xcptnsCount = xcptnsCount; args.flag = allocMemFlag; - emitComp->eeAllocMem(&args); + emitComp->eeAllocMem(&args, emitConsDsc.alignment); codeBlock = (BYTE*)args.hotCodeBlock; codeBlockRW = (BYTE*)args.hotCodeBlockRW; @@ -6399,8 +6364,6 @@ unsigned emitter::emitEndCodeGen(Compiler* comp, consBlock = (BYTE*)args.roDataBlock; consBlockRW = (BYTE*)args.roDataBlockRW; -#endif - #ifdef DEBUG if ((allocMemFlag & CORJIT_ALLOCMEM_FLG_32BYTE_ALIGN) != 0) { diff --git a/src/coreclr/jit/emit.h b/src/coreclr/jit/emit.h index 053381eb8f027..d44fd1bd572ee 100644 --- a/src/coreclr/jit/emit.h +++ b/src/coreclr/jit/emit.h @@ -997,7 +997,7 @@ class emitter case IF_LARGELDC: if (isVectorRegister(idReg1())) { - // adrp + ldr + fmov + // (adrp + ldr + fmov) or (adrp + add + ld1) size = 12; } else diff --git a/src/coreclr/jit/emitarm64.cpp b/src/coreclr/jit/emitarm64.cpp index e60d8d6a69ee7..b0bf1769d1bd0 100644 --- a/src/coreclr/jit/emitarm64.cpp +++ b/src/coreclr/jit/emitarm64.cpp @@ -8438,10 +8438,12 @@ void emitter::emitIns_J(instruction ins, BasicBlock* dst, int instrCount) switch (ins) { case INS_bl_local: + idjShort = true; + FALLTHROUGH; case INS_b: // Unconditional jump is a single form. - idjShort = true; - fmt = IF_BI_0A; + // Assume is long in case we cross hot/cold sections. + fmt = IF_BI_0A; break; case INS_beq: @@ -8486,7 +8488,6 @@ void emitter::emitIns_J(instruction ins, BasicBlock* dst, int instrCount) id->idAddr()->iiaBBlabel = dst; // Skip unconditional jump that has a single form. - // TODO-ARM64-NYI: enable hot/cold splittingNYI. // The target needs to be relocated. if (!idjShort) { @@ -9816,38 +9817,67 @@ BYTE* emitter::emitOutputLJ(insGroup* ig, BYTE* dst, instrDesc* i) { // Update addrReg with the reserved integer register // since we cannot use dstReg (vector) to load constant directly from memory. - addrReg = id->idReg2(); + + // If loading a 16-byte value, we will need to load directly into dstReg. 
+ // Thus, encode addrReg for the ld1 instruction. + if (opSize == EA_16BYTE) + { + addrReg = encodingSPtoZR(id->idReg2()); + } + else + { + addrReg = id->idReg2(); + } + assert(isGeneralRegister(addrReg)); } + ins = INS_adrp; fmt = IF_DI_1E; dst = emitOutputShortAddress(dst, ins, fmt, relPageAddr, addrReg); - // ldr x, [x, page offs] -- load constant from page address + page offset into integer register. ssize_t imm12 = (ssize_t)dstAddr & 0xFFF; // 12 bits assert(isValidUimm12(imm12)); - ins = INS_ldr; - fmt = IF_LS_2B; - dst = emitOutputShortConstant(dst, ins, fmt, imm12, addrReg, opSize); - // fmov v, d -- copy constant in integer register to vector register. - // This is needed only for vector constant. - if (addrReg != dstReg) + // Special case: emit add + ld1 instructions for loading 16-byte data into vector register. + if (isVectorRegister(dstReg) && (opSize == EA_16BYTE)) { - // fmov Vd,Rn DV_2I X00111100X100111 000000nnnnnddddd 1E27 0000 Vd,Rn - // (scalar, from general) - assert(isVectorRegister(dstReg) && isGeneralRegister(addrReg)); - ins = INS_fmov; - fmt = IF_DV_2I; - code_t code = emitInsCode(ins, fmt); + const emitAttr elemSize = EA_1BYTE; + const insOpts opt = optMakeArrangement(opSize, elemSize); - code |= insEncodeReg_Vd(dstReg); // ddddd - code |= insEncodeReg_Rn(addrReg); // nnnnn - if (id->idOpSize() == EA_8BYTE) + assert(isGeneralRegisterOrSP(addrReg)); + assert(isValidVectorElemsize(elemSize)); + assert(isValidArrangement(opSize, opt)); + + // Calculate page addr + page offs, then emit ld1 instruction. + dst = emitOutputVectorConstant(dst, imm12, dstReg, addrReg, opSize, elemSize); + } + else + { + // ldr x, [x, 0] -- load constant from address into integer register. + ins = INS_ldr; + fmt = IF_LS_2B; + dst = emitOutputShortConstant(dst, ins, fmt, imm12, addrReg, opSize); + + // fmov v, d -- copy constant in integer register to vector register. + // This is needed only for vector constant. + if (addrReg != dstReg) { - code |= 0x80400000; // X ... X + // fmov Vd,Rn DV_2I X00111100X100111 000000nnnnnddddd 1E27 0000 Vd,Rn + // (scalar, from general) + assert(isVectorRegister(dstReg) && isGeneralRegister(addrReg)); + ins = INS_fmov; + fmt = IF_DV_2I; + code_t code = emitInsCode(ins, fmt); + + code |= insEncodeReg_Vd(dstReg); // ddddd + code |= insEncodeReg_Rn(addrReg); // nnnnn + if (id->idOpSize() == EA_8BYTE) + { + code |= 0x80400000; // X ... X + } + dst += emitOutput_Instr(dst, code); } - dst += emitOutput_Instr(dst, code); } } } @@ -9950,12 +9980,6 @@ BYTE* emitter::emitOutputLJ(insGroup* ig, BYTE* dst, instrDesc* i) /* For forward jumps, record the address of the distance value */ id->idjTemp.idjAddr = (distVal > 0) ? dst : NULL; - if (emitJumpCrossHotColdBoundary(srcOffs, dstOffs)) - { - assert(!id->idjShort); - NYI_ARM64("Relocation Support for long address"); - } - assert(insOptsNone(id->idInsOpt())); if (isJump) @@ -9966,75 +9990,114 @@ BYTE* emitter::emitOutputLJ(insGroup* ig, BYTE* dst, instrDesc* i) assert(!id->idjKeepLong); assert(emitJumpCrossHotColdBoundary(srcOffs, dstOffs) == false); assert((fmt == IF_BI_0A) || (fmt == IF_BI_0B) || (fmt == IF_BI_1A) || (fmt == IF_BI_1B)); + dst = emitOutputShortBranch(dst, ins, fmt, distVal, id); } else { - // Long conditional jump - assert(fmt == IF_LARGEJMP); - // This is a pseudo-instruction format representing a large conditional branch, to allow - // us to get a greater branch target range than we can get by using a straightforward conditional - // branch. 
It is encoded as a short conditional branch that branches around a long unconditional - // branch. - // - // Conceptually, we have: - // - // b L_target - // - // The code we emit is: - // - // b L_not // 4 bytes. Note that we reverse the condition. - // b L_target // 4 bytes - // L_not: - // - // Note that we don't actually insert any blocks: we simply encode "b L_not" as a branch with - // the correct offset. Note also that this works for both integer and floating-point conditions, because - // the condition inversion takes ordered/unordered into account, preserving NaN behavior. For example, - // "GT" (greater than) is inverted to "LE" (less than, equal, or unordered). + // Long conditional/unconditional jump - instruction reverseIns; - insFormat reverseFmt; + if (fmt == IF_LARGEJMP) + { + // This is a pseudo-instruction format representing a large conditional branch, to allow + // us to get a greater branch target range than we can get by using a straightforward conditional + // branch. It is encoded as a short conditional branch that branches around a long unconditional + // branch. + // + // Conceptually, we have: + // + // b L_target + // + // The code we emit is: + // + // b L_not // 4 bytes. Note that we reverse the condition. + // b L_target // 4 bytes + // L_not: + // + // Note that we don't actually insert any blocks: we simply encode "b L_not" as a branch with + // the correct offset. Note also that this works for both integer and floating-point conditions, because + // the condition inversion takes ordered/unordered into account, preserving NaN behavior. For example, + // "GT" (greater than) is inverted to "LE" (less than, equal, or unordered). - switch (ins) + instruction reverseIns; + insFormat reverseFmt; + + switch (ins) + { + case INS_cbz: + reverseIns = INS_cbnz; + reverseFmt = IF_BI_1A; + break; + case INS_cbnz: + reverseIns = INS_cbz; + reverseFmt = IF_BI_1A; + break; + case INS_tbz: + reverseIns = INS_tbnz; + reverseFmt = IF_BI_1B; + break; + case INS_tbnz: + reverseIns = INS_tbz; + reverseFmt = IF_BI_1B; + break; + default: + reverseIns = emitJumpKindToIns(emitReverseJumpKind(emitInsToJumpKind(ins))); + reverseFmt = IF_BI_0B; + } + + dst = emitOutputShortBranch(dst, + reverseIns, // reverse the conditional instruction + reverseFmt, 8, /* 8 bytes from start of this large conditional + pseudo-instruction to L_not. */ + id); + + // Now, pretend we've got a normal unconditional branch, and fall through to the code to emit that. + ins = INS_b; + fmt = IF_BI_0A; + + // The distVal was computed based on the beginning of the pseudo-instruction, + // So subtract the size of the conditional branch so that it is relative to the + // unconditional branch. + distVal -= 4; + } + + assert(fmt == IF_BI_0A); + assert((distVal & 1) == 0); + code_t code = emitInsCode(ins, fmt); + const bool recordRelocation = emitComp->opts.compReloc && emitJumpCrossHotColdBoundary(srcOffs, dstOffs); + + if (recordRelocation) { - case INS_cbz: - reverseIns = INS_cbnz; - reverseFmt = IF_BI_1A; - break; - case INS_cbnz: - reverseIns = INS_cbz; - reverseFmt = IF_BI_1A; - break; - case INS_tbz: - reverseIns = INS_tbnz; - reverseFmt = IF_BI_1B; - break; - case INS_tbnz: - reverseIns = INS_tbz; - reverseFmt = IF_BI_1B; - break; - default: - reverseIns = emitJumpKindToIns(emitReverseJumpKind(emitInsToJumpKind(ins))); - reverseFmt = IF_BI_0B; + // dst isn't an actual final target location, just some intermediate + // location. 
Thus we cannot make any guarantees about distVal (not + // even the direction/sign). Instead we don't encode any offset and + // rely on the relocation to do all the work + } + else + { + // Branch offset encodings are scaled by 4. + noway_assert((distVal & 3) == 0); + distVal >>= 2; + noway_assert(isValidSimm26(distVal)); + + // Insert offset into unconditional branch instruction + distVal &= 0x3FFFFFFLL; + code |= distVal; } - dst = - emitOutputShortBranch(dst, - reverseIns, // reverse the conditional instruction - reverseFmt, - 8, /* 8 bytes from start of this large conditional pseudo-instruction to L_not. */ - id); + const unsigned instrSize = emitOutput_Instr(dst, code); - // Now, pretend we've got a normal unconditional branch, and fall through to the code to emit that. - ins = INS_b; - fmt = IF_BI_0A; + if (recordRelocation) + { + assert(id->idjKeepLong); + if (emitComp->info.compMatchedVM) + { + void* target = emitOffsetToPtr(dstOffs); + emitRecordRelocation((void*)dst, target, IMAGE_REL_ARM64_BRANCH26); + } + } - // The distVal was computed based on the beginning of the pseudo-instruction, - // So subtract the size of the conditional branch so that it is relative to the - // unconditional branch. - distVal -= 4; + dst += instrSize; } - - dst = emitOutputShortBranch(dst, ins, fmt, distVal, id); } else if (loadLabel) { @@ -10155,7 +10218,7 @@ BYTE* emitter::emitOutputShortConstant( ssize_t loBits = (imm & 3); noway_assert(loBits == 0); - ssize_t distVal = imm >>= 2; // load offset encodings are scaled by 4. + ssize_t distVal = imm >> 2; // load offset encodings are scaled by 4. noway_assert(isValidSimm19(distVal)); @@ -10223,6 +10286,33 @@ BYTE* emitter::emitOutputShortConstant( return dst; } + +/***************************************************************************** + * + * Output instructions to load a constant into a vector register. + */ +BYTE* emitter::emitOutputVectorConstant( + BYTE* dst, ssize_t imm, regNumber dstReg, regNumber addrReg, emitAttr opSize, emitAttr elemSize) +{ + // add addrReg, addrReg, page offs -- compute address = page addr + page offs. + code_t code = emitInsCode(INS_add, IF_DI_2A); // DI_2A X0010001shiiiiii iiiiiinnnnnddddd 1100 0000 imm(i12, sh) + code |= insEncodeDatasize(EA_8BYTE); // X - use EA_8BYTE, as we are calculating 64-bit address + code |= ((code_t)imm << 10); // iiiiiiiiiiii + code |= insEncodeReg_Rd(addrReg); // ddddd + code |= insEncodeReg_Rn(addrReg); // nnnnn + dst += emitOutput_Instr(dst, code); + + // ld1 dstReg, addrReg -- load constant at address in addrReg into dstReg. + code = emitInsCode(INS_ld1, IF_LS_2D); // LS_2D .Q.............. ....ssnnnnnttttt Vt Rn + code |= insEncodeVectorsize(opSize); // Q + code |= insEncodeVLSElemsize(elemSize); // ss + code |= insEncodeReg_Rn(addrReg); // nnnnn + code |= insEncodeReg_Vt(dstReg); // ttttt + dst += emitOutput_Instr(dst, code); + + return dst; +} + /***************************************************************************** * * Output a call instruction. 
diff --git a/src/coreclr/jit/unwind.cpp b/src/coreclr/jit/unwind.cpp index 6ad60a064f35c..63c4ed716cf39 100644 --- a/src/coreclr/jit/unwind.cpp +++ b/src/coreclr/jit/unwind.cpp @@ -69,7 +69,16 @@ void Compiler::unwindGetFuncLocations(FuncInfoDsc* func, // The hot section only goes up to the cold section assert(fgFirstFuncletBB == nullptr); - *ppEndLoc = new (this, CMK_UnwindInfo) emitLocation(ehEmitCookie(fgFirstColdBlock)); +#ifdef DEBUG + if (JitConfig.JitFakeProcedureSplitting()) + { + *ppEndLoc = nullptr; // If fake-splitting, "trick" VM by pretending entire function is hot. + } + else +#endif // DEBUG + { + *ppEndLoc = new (this, CMK_UnwindInfo) emitLocation(ehEmitCookie(fgFirstColdBlock)); + } } else { @@ -259,6 +268,13 @@ void Compiler::unwindEmitFuncCFI(FuncInfoDsc* func, void* pHotCode, void* pColdC DWORD unwindCodeBytes = 0; BYTE* pUnwindBlock = nullptr; +#ifdef DEBUG + if (JitConfig.JitFakeProcedureSplitting()) + { + pColdCode = nullptr; + } +#endif // DEBUG + if (func->startLoc == nullptr) { startOffset = 0; diff --git a/src/coreclr/jit/unwindamd64.cpp b/src/coreclr/jit/unwindamd64.cpp index 2c8e90fa5a944..88cefbe31ed5e 100644 --- a/src/coreclr/jit/unwindamd64.cpp +++ b/src/coreclr/jit/unwindamd64.cpp @@ -656,18 +656,17 @@ void Compiler::unwindReserve() // void Compiler::unwindReserveFunc(FuncInfoDsc* func) { -#ifdef DEBUG - if (JitConfig.JitFakeProcedureSplitting() && (fgFirstColdBlock != nullptr)) + unwindReserveFuncHelper(func, true); + + if (fgFirstColdBlock != nullptr) { - assert(func->funKind == FUNC_ROOT); // No fake-splitting of funclets. - unwindReserveFuncHelper(func, true); - } - else +#ifdef DEBUG + if (JitConfig.JitFakeProcedureSplitting()) + { + assert(func->funKind == FUNC_ROOT); // No splitting of funclets. + } + else #endif // DEBUG - { - unwindReserveFuncHelper(func, true); - - if (fgFirstColdBlock != nullptr) { unwindReserveFuncHelper(func, false); } @@ -859,7 +858,17 @@ void Compiler::unwindEmitFuncHelper(FuncInfoDsc* func, void* pHotCode, void* pCo if (isHotCode) { - assert(endOffset <= info.compTotalHotCodeSize); +#ifdef DEBUG + if (JitConfig.JitFakeProcedureSplitting() && (fgFirstColdBlock != nullptr)) + { + assert(endOffset <= info.compNativeCodeSize); + } + else +#endif // DEBUG + { + assert(endOffset <= info.compTotalHotCodeSize); + } + pColdCode = nullptr; } else @@ -890,43 +899,17 @@ void Compiler::unwindEmitFunc(FuncInfoDsc* func, void* pHotCode, void* pColdCode static_assert_no_msg(FUNC_HANDLER == (FuncKind)CORJIT_FUNC_HANDLER); static_assert_no_msg(FUNC_FILTER == (FuncKind)CORJIT_FUNC_FILTER); -#ifdef DEBUG - if (JitConfig.JitFakeProcedureSplitting() && (pColdCode != nullptr)) + unwindEmitFuncHelper(func, pHotCode, pColdCode, true); + + if (pColdCode != nullptr) { - fakeUnwindEmitFuncHelper(func, pHotCode); - } - else +#ifdef DEBUG + if (!JitConfig.JitFakeProcedureSplitting()) #endif // DEBUG - { - unwindEmitFuncHelper(func, pHotCode, pColdCode, true); - - if (pColdCode != nullptr) { unwindEmitFuncHelper(func, pHotCode, pColdCode, false); } } } -#ifdef DEBUG -void Compiler::fakeUnwindEmitFuncHelper(FuncInfoDsc* func, void* pHotCode) -{ - assert(fgFirstColdBlock != nullptr); - assert(func->funKind == FUNC_ROOT); // No fake-splitting of funclets. 
- - const UNATIVE_OFFSET startOffset = 0; - const UNATIVE_OFFSET endOffset = info.compNativeCodeSize; - const DWORD unwindCodeBytes = sizeof(func->unwindCodes) - func->unwindCodeSlot; - BYTE* pUnwindBlock = &func->unwindCodes[func->unwindCodeSlot]; - - if (opts.dspUnwind) - { - DumpUnwindInfo(true, startOffset, endOffset, (const UNWIND_INFO* const)pUnwindBlock); - } - - // Pass pColdCode = nullptr; VM allocs unwind info for combined hot/cold section - eeAllocUnwindInfo((BYTE*)pHotCode, nullptr, startOffset, endOffset, unwindCodeBytes, pUnwindBlock, - (CorJitFuncKind)func->funKind); -} -#endif // DEBUG - #endif // TARGET_AMD64 diff --git a/src/coreclr/jit/unwindarm.cpp b/src/coreclr/jit/unwindarm.cpp index 1eb7456250cbb..8a14c6edbb832 100644 --- a/src/coreclr/jit/unwindarm.cpp +++ b/src/coreclr/jit/unwindarm.cpp @@ -563,13 +563,20 @@ void Compiler::unwindReserve() void Compiler::unwindReserveFunc(FuncInfoDsc* func) { BOOL isFunclet = (func->funKind == FUNC_ROOT) ? FALSE : TRUE; - bool funcHasColdSection = false; + bool funcHasColdSection = (fgFirstColdBlock != nullptr); + +#ifdef DEBUG + if (JitConfig.JitFakeProcedureSplitting() && funcHasColdSection) + { + funcHasColdSection = false; // "Trick" the VM into thinking we don't have a cold section. + } +#endif // DEBUG #if defined(FEATURE_CFI_SUPPORT) if (generateCFIUnwindCodes()) { DWORD unwindCodeBytes = 0; - if (fgFirstColdBlock != nullptr) + if (funcHasColdSection) { eeReserveUnwindInfo(isFunclet, true /*isColdCode*/, unwindCodeBytes); } @@ -584,7 +591,7 @@ void Compiler::unwindReserveFunc(FuncInfoDsc* func) // cold section. This needs to be done before we split into fragments, as each // of the hot and cold sections can have multiple fragments. - if (fgFirstColdBlock != NULL) + if (funcHasColdSection) { assert(!isFunclet); // TODO-CQ: support hot/cold splitting with EH @@ -595,8 +602,6 @@ void Compiler::unwindReserveFunc(FuncInfoDsc* func) func->uwiCold = new (this, CMK_UnwindInfo) UnwindInfo(); func->uwiCold->InitUnwindInfo(this, startLoc, endLoc); func->uwiCold->HotColdSplitCodes(&func->uwi); - - funcHasColdSection = true; } // First we need to split the function or funclet into fragments that are no larger @@ -1604,11 +1609,19 @@ void UnwindFragmentInfo::Allocate( UNATIVE_OFFSET endOffset; UNATIVE_OFFSET codeSize; - // We don't support hot/cold splitting with EH, so if there is cold code, this - // better not be a funclet! - // TODO-CQ: support funclets in cold code - - noway_assert(isHotCode || funKind == CORJIT_FUNC_ROOT); +// We don't support hot/cold splitting with EH, so if there is cold code, this +// better not be a funclet! 
+// TODO-CQ: support funclets in cold code +#ifdef DEBUG + if (JitConfig.JitFakeProcedureSplitting() && (pColdCode != NULL)) + { + noway_assert(isHotCode && (funKind == CORJIT_FUNC_ROOT)); + } + else +#endif // DEBUG + { + noway_assert(isHotCode || (funKind == CORJIT_FUNC_ROOT)); + } // Compute the final size, and start and end offsets of the fragment @@ -1656,7 +1669,17 @@ void UnwindFragmentInfo::Allocate( if (isHotCode) { - assert(endOffset <= uwiComp->info.compTotalHotCodeSize); +#ifdef DEBUG + if (JitConfig.JitFakeProcedureSplitting() && (pColdCode != NULL)) + { + assert(endOffset <= uwiComp->info.compNativeCodeSize); + } + else +#endif // DEBUG + { + assert(endOffset <= uwiComp->info.compTotalHotCodeSize); + } + pColdCode = NULL; } else diff --git a/src/coreclr/jit/unwindx86.cpp b/src/coreclr/jit/unwindx86.cpp index bd27e46cbef49..32d077429af6a 100644 --- a/src/coreclr/jit/unwindx86.cpp +++ b/src/coreclr/jit/unwindx86.cpp @@ -113,18 +113,17 @@ void Compiler::unwindEmit(void* pHotCode, void* pColdCode) // void Compiler::unwindReserveFunc(FuncInfoDsc* func) { -#ifdef DEBUG - if (JitConfig.JitFakeProcedureSplitting() && (fgFirstColdBlock != nullptr)) + unwindReserveFuncHelper(func, true); + + if (fgFirstColdBlock != nullptr) { - assert(func->funKind == FUNC_ROOT); // No fake-splitting of funclets. - unwindReserveFuncHelper(func, true); - } - else +#ifdef DEBUG + if (JitConfig.JitFakeProcedureSplitting()) + { + assert(func->funKind == FUNC_ROOT); // No splitting of funclets. + } + else #endif // DEBUG - { - unwindReserveFuncHelper(func, true); - - if (fgFirstColdBlock != nullptr) { unwindReserveFuncHelper(func, false); } @@ -164,17 +163,13 @@ void Compiler::unwindEmitFunc(FuncInfoDsc* func, void* pHotCode, void* pColdCode static_assert_no_msg(FUNC_HANDLER == (FuncKind)CORJIT_FUNC_HANDLER); static_assert_no_msg(FUNC_FILTER == (FuncKind)CORJIT_FUNC_FILTER); -#ifdef DEBUG - if (JitConfig.JitFakeProcedureSplitting() && (pColdCode != nullptr)) + unwindEmitFuncHelper(func, pHotCode, pColdCode, true); + + if (pColdCode != nullptr) { - fakeUnwindEmitFuncHelper(func, pHotCode); - } - else +#ifdef DEBUG + if (!JitConfig.JitFakeProcedureSplitting()) #endif // DEBUG - { - unwindEmitFuncHelper(func, pHotCode, pColdCode, true); - - if (pColdCode != nullptr) { unwindEmitFuncHelper(func, pHotCode, pColdCode, false); } @@ -258,7 +253,17 @@ void Compiler::unwindEmitFuncHelper(FuncInfoDsc* func, void* pHotCode, void* pCo if (isHotCode) { - assert(endOffset <= info.compTotalHotCodeSize); +#ifdef DEBUG + if (JitConfig.JitFakeProcedureSplitting() && (fgFirstColdBlock != nullptr)) + { + assert(endOffset <= info.compNativeCodeSize); + } + else +#endif // DEBUG + { + assert(endOffset <= info.compTotalHotCodeSize); + } + pColdCode = nullptr; } else @@ -276,22 +281,4 @@ void Compiler::unwindEmitFuncHelper(FuncInfoDsc* func, void* pHotCode, void* pCo (BYTE*)&unwindInfo, (CorJitFuncKind)func->funKind); } -#ifdef DEBUG -void Compiler::fakeUnwindEmitFuncHelper(FuncInfoDsc* func, void* pHotCode) -{ - assert(fgFirstColdBlock != nullptr); - assert(func->funKind == FUNC_ROOT); // No fake-splitting of funclets. 
- - const UNATIVE_OFFSET startOffset = 0; - const UNATIVE_OFFSET endOffset = info.compNativeCodeSize; - - UNWIND_INFO unwindInfo; - unwindInfo.FunctionLength = (ULONG)(endOffset); - - // Pass pColdCode = nullptr; VM allocs unwind info for combined hot/cold section - eeAllocUnwindInfo((BYTE*)pHotCode, nullptr, startOffset, endOffset, sizeof(UNWIND_INFO), (BYTE*)&unwindInfo, - (CorJitFuncKind)func->funKind); -} -#endif // DEBUG - #endif // FEATURE_EH_FUNCLETS
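For readers following the eeAllocMem/emitEndCodeGen changes above, here is a small standalone sketch (not part of the patch) of the layout arithmetic the revised eeAllocMem performs when JitFakeProcedureSplitting is enabled on an ARM64-like target: cold code is placed immediately after hot code, and the read-only data section is appended after the code with alignment padding, matching the commit message. The sizes, the alignmentPad helper, and the printed layout are illustrative assumptions, not values taken from the change.

#include <cassert>
#include <cstdio>

// Bytes of padding needed to round 'size' up to 'alignment' (mirrors the JIT's AlignmentPad).
static unsigned alignmentPad(unsigned size, unsigned alignment)
{
    return (alignment - (size % alignment)) % alignment;
}

int main()
{
    // Hypothetical allocation request.
    unsigned hotCodeSize     = 0x123; // bytes of hot code
    unsigned coldCodeSize    = 0x45;  // bytes of cold code
    unsigned roDataSize      = 0x20;  // bytes of read-only data (float constants, jump tables)
    unsigned roDataAlignment = 16;    // stands in for the emitConsDsc.alignment passed by the emitter

    // Fake splitting: request one contiguous block and place cold code right after hot code.
    unsigned coldCodeOffset = hotCodeSize;
    hotCodeSize += coldCodeSize;
    coldCodeSize = 0;

    // ARM64/LoongArch64: append the read-only data section after the code, padded to its
    // required alignment, so the data stays adjacent to the code that references it.
    unsigned roDataAlignmentDelta = (roDataSize > 0) ? alignmentPad(hotCodeSize, roDataAlignment) : 0;
    unsigned roDataOffset         = hotCodeSize + roDataAlignmentDelta;
    unsigned totalAllocation      = roDataOffset + roDataSize;

    assert(roDataOffset % roDataAlignment == 0);
    printf("cold code at +0x%x, ro data at +0x%x, single allocation of 0x%x bytes\n",
           coldCodeOffset, roDataOffset, totalAllocation);
    return 0;
}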
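Similarly, a minimal sketch of the long unconditional branch path added to emitOutputLJ: when the jump stays within one section, the byte distance is checked for 4-byte alignment, scaled by 4, range-checked against a signed 26-bit immediate, and OR'd into the instruction word; when it crosses the hot/cold boundary, the offset field is left zero and an IMAGE_REL_ARM64_BRANCH26 relocation supplies the target. The standalone form and helper name are assumptions for illustration; 0x14000000 is the architectural base encoding of the ARM64 B instruction.

#include <cassert>
#include <cstdint>

// Encode "B <label>" whose target is 'distVal' bytes away, or leave the offset
// field zero when a cross-section relocation will fill it in later.
static uint32_t encodeUncondBranch(int64_t distVal, bool crossesHotColdBoundary)
{
    uint32_t code = 0x14000000; // base encoding of B, imm26 field zero

    if (!crossesHotColdBoundary)
    {
        assert((distVal & 3) == 0);   // branch offsets are 4-byte aligned
        int64_t imm26 = distVal >> 2; // branch offset encodings are scaled by 4
        assert((imm26 >= -(int64_t(1) << 25)) && (imm26 < (int64_t(1) << 25))); // simm26 range
        code |= (uint32_t)(imm26 & 0x3FFFFFF); // insert offset into the instruction
    }
    // else: emit with a zero offset and record IMAGE_REL_ARM64_BRANCH26 for the target.

    return code;
}

int main()
{
    // "b #-8" (branch back two instructions) encodes as 0x17FFFFFE.
    return (encodeUncondBranch(-8, false) == 0x17FFFFFE) ? 0 : 1;
}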