diff --git a/clang/docs/ClangCommandLineReference.rst b/clang/docs/ClangCommandLineReference.rst index 10c14db1b89572..0292e160176485 100644 --- a/clang/docs/ClangCommandLineReference.rst +++ b/clang/docs/ClangCommandLineReference.rst @@ -2577,6 +2577,10 @@ Use Intel MCU ABI Generate branches with extended addressability, usually via indirect jumps. +.. option:: -mlvi-cfi, -mno-lvi-cfi + +Enable only control-flow mitigations for Load Value Injection (LVI) + .. option:: -mmacosx-version-min=, -mmacos-version-min= Set Mac OS X deployment target diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td index 0a60873443fc0a..391c895a453bd5 100644 --- a/clang/include/clang/Driver/Options.td +++ b/clang/include/clang/Driver/Options.td @@ -2267,6 +2267,14 @@ def mspeculative_load_hardening : Flag<["-"], "mspeculative-load-hardening">, Group, Flags<[CoreOption,CC1Option]>; def mno_speculative_load_hardening : Flag<["-"], "mno-speculative-load-hardening">, Group, Flags<[CoreOption]>; +def mlvi_hardening : Flag<["-"], "mlvi-hardening">, Group, Flags<[CoreOption,DriverOption]>, + HelpText<"Enable all mitigations for Load Value Injection (LVI)">; +def mno_lvi_hardening : Flag<["-"], "mno-lvi-hardening">, Group, Flags<[CoreOption,DriverOption]>, + HelpText<"Disable mitigations for Load Value Injection (LVI)">; +def mlvi_cfi : Flag<["-"], "mlvi-cfi">, Group, Flags<[CoreOption,DriverOption]>, + HelpText<"Enable only control-flow mitigations for Load Value Injection (LVI)">; +def mno_lvi_cfi : Flag<["-"], "mno-lvi-cfi">, Group, Flags<[CoreOption,DriverOption]>, + HelpText<"Disable control-flow mitigations for Load Value Injection (LVI)">; def mrelax : Flag<["-"], "mrelax">, Group, HelpText<"Enable linker relaxation">; diff --git a/clang/lib/Driver/ToolChains/Arch/X86.cpp b/clang/lib/Driver/ToolChains/Arch/X86.cpp index d1e0c8253b7998..d170b7ac3a7786 100644 --- a/clang/lib/Driver/ToolChains/Arch/X86.cpp +++ b/clang/lib/Driver/ToolChains/Arch/X86.cpp @@ -146,6 +146,7 @@ void x86::getX86TargetFeatures(const Driver &D, const llvm::Triple &Triple, // flags). This is a bit hacky but keeps existing usages working. We should // consider deprecating this and instead warn if the user requests external // retpoline thunks and *doesn't* request some form of retpolines. + auto SpectreOpt = clang::driver::options::ID::OPT_INVALID; if (Args.hasArgNoClaim(options::OPT_mretpoline, options::OPT_mno_retpoline, options::OPT_mspeculative_load_hardening, options::OPT_mno_speculative_load_hardening)) { @@ -153,12 +154,14 @@ void x86::getX86TargetFeatures(const Driver &D, const llvm::Triple &Triple, false)) { Features.push_back("+retpoline-indirect-calls"); Features.push_back("+retpoline-indirect-branches"); + SpectreOpt = options::OPT_mretpoline; } else if (Args.hasFlag(options::OPT_mspeculative_load_hardening, options::OPT_mno_speculative_load_hardening, false)) { // On x86, speculative load hardening relies on at least using retpolines // for indirect calls. Features.push_back("+retpoline-indirect-calls"); + SpectreOpt = options::OPT_mspeculative_load_hardening; } } else if (Args.hasFlag(options::OPT_mretpoline_external_thunk, options::OPT_mno_retpoline_external_thunk, false)) { @@ -166,6 +169,26 @@ void x86::getX86TargetFeatures(const Driver &D, const llvm::Triple &Triple, // eventually switch to an error here. Features.push_back("+retpoline-indirect-calls"); Features.push_back("+retpoline-indirect-branches"); + SpectreOpt = options::OPT_mretpoline_external_thunk; + } + + auto LVIOpt = clang::driver::options::ID::OPT_INVALID; + if (Args.hasFlag(options::OPT_mlvi_hardening, options::OPT_mno_lvi_hardening, + false)) { + Features.push_back("+lvi-load-hardening"); + Features.push_back("+lvi-cfi"); // load hardening implies CFI protection + LVIOpt = options::OPT_mlvi_hardening; + } else if (Args.hasFlag(options::OPT_mlvi_cfi, options::OPT_mno_lvi_cfi, + false)) { + Features.push_back("+lvi-cfi"); + LVIOpt = options::OPT_mlvi_cfi; + } + + if (SpectreOpt != clang::driver::options::ID::OPT_INVALID && + LVIOpt != clang::driver::options::ID::OPT_INVALID) { + D.Diag(diag::err_drv_argument_not_allowed_with) + << D.getOpts().getOptionName(SpectreOpt) + << D.getOpts().getOptionName(LVIOpt); } // Now add any that the user explicitly requested on the command line, diff --git a/clang/test/Driver/x86-target-features.c b/clang/test/Driver/x86-target-features.c index 9a406b504b24e0..fd9bf0878693a2 100644 --- a/clang/test/Driver/x86-target-features.c +++ b/clang/test/Driver/x86-target-features.c @@ -154,6 +154,30 @@ // SLH: "-mspeculative-load-hardening" // NO-SLH-NOT: retpoline +// RUN: %clang -target i386-linux-gnu -mlvi-cfi %s -### -o %t.o 2>&1 | FileCheck -check-prefix=LVICFI %s +// RUN: %clang -target i386-linux-gnu -mno-lvi-cfi %s -### -o %t.o 2>&1 | FileCheck -check-prefix=NO-LVICFI %s +// LVICFI: "-target-feature" "+lvi-cfi" +// NO-LVICFI-NOT: lvi-cfi + +// RUN: %clang -target i386-linux-gnu -mlvi-cfi -mspeculative-load-hardening %s -### -o %t.o 2>&1 | FileCheck -check-prefix=LVICFI-SLH %s +// LVICFI-SLH: error: invalid argument 'mspeculative-load-hardening' not allowed with 'mlvi-cfi' +// RUN: %clang -target i386-linux-gnu -mlvi-cfi -mretpoline %s -### -o %t.o 2>&1 | FileCheck -check-prefix=LVICFI-RETPOLINE %s +// LVICFI-RETPOLINE: error: invalid argument 'mretpoline' not allowed with 'mlvi-cfi' +// RUN: %clang -target i386-linux-gnu -mlvi-cfi -mretpoline-external-thunk %s -### -o %t.o 2>&1 | FileCheck -check-prefix=LVICFI-RETPOLINE-EXTERNAL-THUNK %s +// LVICFI-RETPOLINE-EXTERNAL-THUNK: error: invalid argument 'mretpoline-external-thunk' not allowed with 'mlvi-cfi' + +// RUN: %clang -target i386-linux-gnu -mlvi-hardening %s -### -o %t.o 2>&1 | FileCheck -check-prefix=LVIHARDENING %s +// RUN: %clang -target i386-linux-gnu -mno-lvi-hardening %s -### -o %t.o 2>&1 | FileCheck -check-prefix=NO-LVIHARDENING %s +// LVIHARDENING: "-target-feature" "+lvi-load-hardening" "-target-feature" "+lvi-cfi" +// NO-LVIHARDENING-NOT: lvi + +// RUN: %clang -target i386-linux-gnu -mlvi-hardening -mspeculative-load-hardening %s -### -o %t.o 2>&1 | FileCheck -check-prefix=LVIHARDENING-SLH %s +// LVIHARDENING-SLH: error: invalid argument 'mspeculative-load-hardening' not allowed with 'mlvi-hardening' +// RUN: %clang -target i386-linux-gnu -mlvi-hardening -mretpoline %s -### -o %t.o 2>&1 | FileCheck -check-prefix=LVIHARDENING-RETPOLINE %s +// LVIHARDENING-RETPOLINE: error: invalid argument 'mretpoline' not allowed with 'mlvi-hardening' +// RUN: %clang -target i386-linux-gnu -mlvi-hardening -mretpoline-external-thunk %s -### -o %t.o 2>&1 | FileCheck -check-prefix=LVIHARDENING-RETPOLINE-EXTERNAL-THUNK %s +// LVIHARDENING-RETPOLINE-EXTERNAL-THUNK: error: invalid argument 'mretpoline-external-thunk' not allowed with 'mlvi-hardening' + // RUN: %clang -target i386-linux-gnu -mwaitpkg %s -### -o %t.o 2>&1 | FileCheck -check-prefix=WAITPKG %s // RUN: %clang -target i386-linux-gnu -mno-waitpkg %s -### -o %t.o 2>&1 | FileCheck -check-prefix=NO-WAITPKG %s // WAITPKG: "-target-feature" "+waitpkg" diff --git a/llvm/lib/Target/Hexagon/RDFGraph.h b/llvm/include/llvm/CodeGen/RDFGraph.h similarity index 100% rename from llvm/lib/Target/Hexagon/RDFGraph.h rename to llvm/include/llvm/CodeGen/RDFGraph.h diff --git a/llvm/lib/Target/Hexagon/RDFLiveness.h b/llvm/include/llvm/CodeGen/RDFLiveness.h similarity index 100% rename from llvm/lib/Target/Hexagon/RDFLiveness.h rename to llvm/include/llvm/CodeGen/RDFLiveness.h diff --git a/llvm/lib/Target/Hexagon/RDFRegisters.h b/llvm/include/llvm/CodeGen/RDFRegisters.h similarity index 100% rename from llvm/lib/Target/Hexagon/RDFRegisters.h rename to llvm/include/llvm/CodeGen/RDFRegisters.h diff --git a/llvm/lib/CodeGen/CMakeLists.txt b/llvm/lib/CodeGen/CMakeLists.txt index 470b027e38c81e..a3916b7c624209 100644 --- a/llvm/lib/CodeGen/CMakeLists.txt +++ b/llvm/lib/CodeGen/CMakeLists.txt @@ -114,6 +114,9 @@ add_llvm_component_library(LLVMCodeGen ProcessImplicitDefs.cpp PrologEpilogInserter.cpp PseudoSourceValue.cpp + RDFGraph.cpp + RDFLiveness.cpp + RDFRegisters.cpp ReachingDefAnalysis.cpp RegAllocBase.cpp RegAllocBasic.cpp diff --git a/llvm/lib/Target/Hexagon/RDFGraph.cpp b/llvm/lib/CodeGen/RDFGraph.cpp similarity index 99% rename from llvm/lib/Target/Hexagon/RDFGraph.cpp rename to llvm/lib/CodeGen/RDFGraph.cpp index 0cb35dc9881964..437a6b03009678 100644 --- a/llvm/lib/Target/Hexagon/RDFGraph.cpp +++ b/llvm/lib/CodeGen/RDFGraph.cpp @@ -8,8 +8,6 @@ // // Target-independent, SSA-based data flow graph for register data flow (RDF). // -#include "RDFGraph.h" -#include "RDFRegisters.h" #include "llvm/ADT/BitVector.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SetVector.h" @@ -20,6 +18,8 @@ #include "llvm/CodeGen/MachineInstr.h" #include "llvm/CodeGen/MachineOperand.h" #include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/RDFGraph.h" +#include "llvm/CodeGen/RDFRegisters.h" #include "llvm/CodeGen/TargetInstrInfo.h" #include "llvm/CodeGen/TargetLowering.h" #include "llvm/CodeGen/TargetRegisterInfo.h" @@ -753,8 +753,10 @@ RegisterSet DataFlowGraph::getLandingPadLiveIns() const { const TargetLowering &TLI = *MF.getSubtarget().getTargetLowering(); if (RegisterId R = TLI.getExceptionPointerRegister(PF)) LR.insert(RegisterRef(R)); - if (RegisterId R = TLI.getExceptionSelectorRegister(PF)) - LR.insert(RegisterRef(R)); + if (!isFuncletEHPersonality(classifyEHPersonality(PF))) { + if (RegisterId R = TLI.getExceptionSelectorRegister(PF)) + LR.insert(RegisterRef(R)); + } return LR; } diff --git a/llvm/lib/Target/Hexagon/RDFLiveness.cpp b/llvm/lib/CodeGen/RDFLiveness.cpp similarity index 99% rename from llvm/lib/Target/Hexagon/RDFLiveness.cpp rename to llvm/lib/CodeGen/RDFLiveness.cpp index e2c007c9d01af1..0bcd27f8ea452c 100644 --- a/llvm/lib/Target/Hexagon/RDFLiveness.cpp +++ b/llvm/lib/CodeGen/RDFLiveness.cpp @@ -22,9 +22,6 @@ // and Embedded Architectures and Compilers", 8 (4), // <10.1145/2086696.2086706>. // -#include "RDFLiveness.h" -#include "RDFGraph.h" -#include "RDFRegisters.h" #include "llvm/ADT/BitVector.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SetVector.h" @@ -33,6 +30,9 @@ #include "llvm/CodeGen/MachineDominators.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineInstr.h" +#include "llvm/CodeGen/RDFLiveness.h" +#include "llvm/CodeGen/RDFGraph.h" +#include "llvm/CodeGen/RDFRegisters.h" #include "llvm/CodeGen/TargetRegisterInfo.h" #include "llvm/MC/LaneBitmask.h" #include "llvm/MC/MCRegisterInfo.h" diff --git a/llvm/lib/Target/Hexagon/RDFRegisters.cpp b/llvm/lib/CodeGen/RDFRegisters.cpp similarity index 99% rename from llvm/lib/Target/Hexagon/RDFRegisters.cpp rename to llvm/lib/CodeGen/RDFRegisters.cpp index b5675784e34b8b..bd8661816e7180 100644 --- a/llvm/lib/Target/Hexagon/RDFRegisters.cpp +++ b/llvm/lib/CodeGen/RDFRegisters.cpp @@ -6,11 +6,11 @@ // //===----------------------------------------------------------------------===// -#include "RDFRegisters.h" #include "llvm/ADT/BitVector.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineInstr.h" #include "llvm/CodeGen/MachineOperand.h" +#include "llvm/CodeGen/RDFRegisters.h" #include "llvm/CodeGen/TargetRegisterInfo.h" #include "llvm/MC/LaneBitmask.h" #include "llvm/MC/MCRegisterInfo.h" diff --git a/llvm/lib/Target/Hexagon/CMakeLists.txt b/llvm/lib/Target/Hexagon/CMakeLists.txt index 3536aa81fb2156..747f14e0ceca07 100644 --- a/llvm/lib/Target/Hexagon/CMakeLists.txt +++ b/llvm/lib/Target/Hexagon/CMakeLists.txt @@ -64,9 +64,6 @@ add_llvm_target(HexagonCodeGen HexagonVLIWPacketizer.cpp RDFCopy.cpp RDFDeadCode.cpp - RDFGraph.cpp - RDFLiveness.cpp - RDFRegisters.cpp ) add_subdirectory(AsmParser) diff --git a/llvm/lib/Target/Hexagon/HexagonOptAddrMode.cpp b/llvm/lib/Target/Hexagon/HexagonOptAddrMode.cpp index 886034d9601ac9..f1fe51f5e54fc1 100644 --- a/llvm/lib/Target/Hexagon/HexagonOptAddrMode.cpp +++ b/llvm/lib/Target/Hexagon/HexagonOptAddrMode.cpp @@ -12,9 +12,6 @@ #include "HexagonInstrInfo.h" #include "HexagonSubtarget.h" #include "MCTargetDesc/HexagonBaseInfo.h" -#include "RDFGraph.h" -#include "RDFLiveness.h" -#include "RDFRegisters.h" #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/DenseSet.h" #include "llvm/ADT/StringRef.h" @@ -27,6 +24,9 @@ #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineOperand.h" #include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/RDFGraph.h" +#include "llvm/CodeGen/RDFLiveness.h" +#include "llvm/CodeGen/RDFRegisters.h" #include "llvm/CodeGen/TargetSubtargetInfo.h" #include "llvm/InitializePasses.h" #include "llvm/MC/MCInstrDesc.h" diff --git a/llvm/lib/Target/Hexagon/HexagonRDFOpt.cpp b/llvm/lib/Target/Hexagon/HexagonRDFOpt.cpp index 517ad1c6ee7b49..f26e23befde214 100644 --- a/llvm/lib/Target/Hexagon/HexagonRDFOpt.cpp +++ b/llvm/lib/Target/Hexagon/HexagonRDFOpt.cpp @@ -11,9 +11,6 @@ #include "MCTargetDesc/HexagonBaseInfo.h" #include "RDFCopy.h" #include "RDFDeadCode.h" -#include "RDFGraph.h" -#include "RDFLiveness.h" -#include "RDFRegisters.h" #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SetVector.h" @@ -24,6 +21,9 @@ #include "llvm/CodeGen/MachineInstr.h" #include "llvm/CodeGen/MachineOperand.h" #include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/RDFGraph.h" +#include "llvm/CodeGen/RDFLiveness.h" +#include "llvm/CodeGen/RDFRegisters.h" #include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Support/CommandLine.h" diff --git a/llvm/lib/Target/Hexagon/RDFCopy.cpp b/llvm/lib/Target/Hexagon/RDFCopy.cpp index a9d39fd4b2dcbf..34d58f0a7a2306 100644 --- a/llvm/lib/Target/Hexagon/RDFCopy.cpp +++ b/llvm/lib/Target/Hexagon/RDFCopy.cpp @@ -11,13 +11,13 @@ //===----------------------------------------------------------------------===// #include "RDFCopy.h" -#include "RDFGraph.h" -#include "RDFLiveness.h" -#include "RDFRegisters.h" #include "llvm/CodeGen/MachineDominators.h" #include "llvm/CodeGen/MachineInstr.h" #include "llvm/CodeGen/MachineOperand.h" #include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/RDFGraph.h" +#include "llvm/CodeGen/RDFLiveness.h" +#include "llvm/CodeGen/RDFRegisters.h" #include "llvm/CodeGen/TargetOpcodes.h" #include "llvm/CodeGen/TargetRegisterInfo.h" #include "llvm/MC/MCRegisterInfo.h" diff --git a/llvm/lib/Target/Hexagon/RDFCopy.h b/llvm/lib/Target/Hexagon/RDFCopy.h index 1450ab8848492b..99b18a75d8c2b4 100644 --- a/llvm/lib/Target/Hexagon/RDFCopy.h +++ b/llvm/lib/Target/Hexagon/RDFCopy.h @@ -9,9 +9,9 @@ #ifndef LLVM_LIB_TARGET_HEXAGON_RDFCOPY_H #define LLVM_LIB_TARGET_HEXAGON_RDFCOPY_H -#include "RDFGraph.h" -#include "RDFLiveness.h" -#include "RDFRegisters.h" +#include "llvm/CodeGen/RDFGraph.h" +#include "llvm/CodeGen/RDFLiveness.h" +#include "llvm/CodeGen/RDFRegisters.h" #include "llvm/CodeGen/MachineFunction.h" #include #include diff --git a/llvm/lib/Target/Hexagon/RDFDeadCode.cpp b/llvm/lib/Target/Hexagon/RDFDeadCode.cpp index af86c7b1956bc5..5a98debd3c0007 100644 --- a/llvm/lib/Target/Hexagon/RDFDeadCode.cpp +++ b/llvm/lib/Target/Hexagon/RDFDeadCode.cpp @@ -9,13 +9,13 @@ // RDF-based generic dead code elimination. #include "RDFDeadCode.h" -#include "RDFGraph.h" -#include "RDFLiveness.h" #include "llvm/ADT/SetVector.h" #include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/RDFGraph.h" +#include "llvm/CodeGen/RDFLiveness.h" #include "llvm/Support/Debug.h" #include diff --git a/llvm/lib/Target/Hexagon/RDFDeadCode.h b/llvm/lib/Target/Hexagon/RDFDeadCode.h index 7f91977e1d6cd5..859c8161d355c6 100644 --- a/llvm/lib/Target/Hexagon/RDFDeadCode.h +++ b/llvm/lib/Target/Hexagon/RDFDeadCode.h @@ -23,8 +23,8 @@ #ifndef RDF_DEADCODE_H #define RDF_DEADCODE_H -#include "RDFGraph.h" -#include "RDFLiveness.h" +#include "llvm/CodeGen/RDFGraph.h" +#include "llvm/CodeGen/RDFLiveness.h" #include "llvm/ADT/SetVector.h" namespace llvm { diff --git a/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp b/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp index d37d812df485e3..01d756fc51d2f6 100644 --- a/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp +++ b/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp @@ -31,6 +31,7 @@ #include "llvm/MC/MCStreamer.h" #include "llvm/MC/MCSubtargetInfo.h" #include "llvm/MC/MCSymbol.h" +#include "llvm/Support/CommandLine.h" #include "llvm/Support/SourceMgr.h" #include "llvm/Support/TargetRegistry.h" #include "llvm/Support/raw_ostream.h" @@ -39,6 +40,11 @@ using namespace llvm; +static cl::opt LVIInlineAsmHardening( + "x86-experimental-lvi-inline-asm-hardening", + cl::desc("Harden inline assembly code that may be vulnerable to Load Value" + " Injection (LVI). This feature is experimental."), cl::Hidden); + static bool checkScale(unsigned Scale, StringRef &ErrMsg) { if (Scale != 1 && Scale != 2 && Scale != 4 && Scale != 8) { ErrMsg = "scale factor in address must be 1, 2, 4 or 8"; @@ -927,6 +933,11 @@ class X86AsmParser : public MCTargetAsmParser { bool validateInstruction(MCInst &Inst, const OperandVector &Ops); bool processInstruction(MCInst &Inst, const OperandVector &Ops); + // Load Value Injection (LVI) Mitigations for machine code + void emitWarningForSpecialLVIInstruction(SMLoc Loc); + bool applyLVICFIMitigation(MCInst &Inst); + bool applyLVILoadHardeningMitigation(MCInst &Inst, MCStreamer &Out); + /// Wrapper around MCStreamer::EmitInstruction(). Possibly adds /// instrumentation around Inst. void EmitInstruction(MCInst &Inst, OperandVector &Operands, MCStreamer &Out); @@ -3096,9 +3107,104 @@ bool X86AsmParser::validateInstruction(MCInst &Inst, const OperandVector &Ops) { static const char *getSubtargetFeatureName(uint64_t Val); +void X86AsmParser::emitWarningForSpecialLVIInstruction(SMLoc Loc) { + Warning(Loc, "Instruction may be vulnerable to LVI and " + "requires manual mitigation"); + Note(SMLoc(), "See https://software.intel.com/" + "security-software-guidance/insights/" + "deep-dive-load-value-injection#specialinstructions" + " for more information"); +} + +/// RET instructions and also instructions that indirect calls/jumps from memory +/// combine a load and a branch within a single instruction. To mitigate these +/// instructions against LVI, they must be decomposed into separate load and +/// branch instructions, with an LFENCE in between. For more details, see: +/// - X86LoadValueInjectionRetHardening.cpp +/// - X86LoadValueInjectionIndirectThunks.cpp +/// - https://software.intel.com/security-software-guidance/insights/deep-dive-load-value-injection +/// +/// Returns `true` if a mitigation was applied or warning was emitted. +bool X86AsmParser::applyLVICFIMitigation(MCInst &Inst) { + // Information on control-flow instructions that require manual mitigation can + // be found here: + // https://software.intel.com/security-software-guidance/insights/deep-dive-load-value-injection#specialinstructions + switch (Inst.getOpcode()) { + case X86::RETW: + case X86::RETL: + case X86::RETQ: + case X86::RETIL: + case X86::RETIQ: + case X86::RETIW: + case X86::JMP16m: + case X86::JMP32m: + case X86::JMP64m: + case X86::CALL16m: + case X86::CALL32m: + case X86::CALL64m: + emitWarningForSpecialLVIInstruction(Inst.getLoc()); + return true; + } + return false; +} + +/// To mitigate LVI, every instruction that performs a load can be followed by +/// an LFENCE instruction to squash any potential mis-speculation. There are +/// some instructions that require additional considerations, and may requre +/// manual mitigation. For more details, see: +/// https://software.intel.com/security-software-guidance/insights/deep-dive-load-value-injection +/// +/// Returns `true` if a mitigation was applied or warning was emitted. +bool X86AsmParser::applyLVILoadHardeningMitigation(MCInst &Inst, + MCStreamer &Out) { + auto Opcode = Inst.getOpcode(); + auto Flags = Inst.getFlags(); + if ((Flags & X86::IP_HAS_REPEAT) || (Flags & X86::IP_HAS_REPEAT_NE)) { + // Information on REP string instructions that require manual mitigation can + // be found here: + // https://software.intel.com/security-software-guidance/insights/deep-dive-load-value-injection#specialinstructions + switch (Opcode) { + case X86::CMPSB: + case X86::CMPSW: + case X86::CMPSL: + case X86::CMPSQ: + case X86::SCASB: + case X86::SCASW: + case X86::SCASL: + case X86::SCASQ: + emitWarningForSpecialLVIInstruction(Inst.getLoc()); + return true; + } + } else if (Opcode == X86::REP_PREFIX || Opcode == X86::REPNE_PREFIX) { + // If a REP instruction is found on its own line, it may or may not be + // followed by a vulnerable instruction. Emit a warning just in case. + emitWarningForSpecialLVIInstruction(Inst.getLoc()); + return true; + } + + const MCInstrDesc &MCID = MII.get(Inst.getOpcode()); + // LFENCE has the mayLoad property, don't double fence. + if (MCID.mayLoad() && Inst.getOpcode() != X86::LFENCE) { + MCInst FenceInst; + FenceInst.setOpcode(X86::LFENCE); + FenceInst.setLoc(Inst.getLoc()); + Out.EmitInstruction(FenceInst, getSTI()); + return true; + } + return false; +} + void X86AsmParser::EmitInstruction(MCInst &Inst, OperandVector &Operands, MCStreamer &Out) { Out.EmitInstruction(Inst, getSTI()); + + if (LVIInlineAsmHardening) { + if (getSTI().getFeatureBits()[X86::FeatureLVIControlFlowIntegrity] && + applyLVICFIMitigation(Inst)) + return; + if (getSTI().getFeatureBits()[X86::FeatureLVILoadHardening]) + applyLVILoadHardeningMitigation(Inst, Out); + } } bool X86AsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, diff --git a/llvm/lib/Target/X86/CMakeLists.txt b/llvm/lib/Target/X86/CMakeLists.txt index 58f2292dd4cd78..524e043c7df8e6 100644 --- a/llvm/lib/Target/X86/CMakeLists.txt +++ b/llvm/lib/Target/X86/CMakeLists.txt @@ -44,6 +44,7 @@ set(sources X86ISelDAGToDAG.cpp X86ISelLowering.cpp X86IndirectBranchTracking.cpp + X86IndirectThunks.cpp X86InterleavedAccess.cpp X86InsertPrefetch.cpp X86InstrFMA3Info.cpp @@ -51,6 +52,8 @@ set(sources X86InstrInfo.cpp X86EvexToVex.cpp X86LegalizerInfo.cpp + X86LoadValueInjectionLoadHardening.cpp + X86LoadValueInjectionRetHardening.cpp X86MCInstLower.cpp X86MachineFunctionInfo.cpp X86MacroFusion.cpp @@ -58,7 +61,6 @@ set(sources X86PadShortFunction.cpp X86RegisterBankInfo.cpp X86RegisterInfo.cpp - X86RetpolineThunks.cpp X86SelectionDAGInfo.cpp X86ShuffleDecodeConstantPool.cpp X86SpeculativeLoadHardening.cpp diff --git a/llvm/lib/Target/X86/ImmutableGraph.h b/llvm/lib/Target/X86/ImmutableGraph.h new file mode 100644 index 00000000000000..5833017037a5ce --- /dev/null +++ b/llvm/lib/Target/X86/ImmutableGraph.h @@ -0,0 +1,446 @@ +//==========-- ImmutableGraph.h - A fast DAG implementation ---------=========// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// +/// Description: ImmutableGraph is a fast DAG implementation that cannot be +/// modified, except by creating a new ImmutableGraph. ImmutableGraph is +/// implemented as two arrays: one containing nodes, and one containing edges. +/// The advantages to this implementation are two-fold: +/// 1. Iteration and traversal operations benefit from cache locality. +/// 2. Operations on sets of nodes/edges are efficient, and representations of +/// those sets in memory are compact. For instance, a set of edges is +/// implemented as a bit vector, wherein each bit corresponds to one edge in +/// the edge array. This implies a lower bound of 64x spatial improvement +/// over, e.g., an llvm::DenseSet or llvm::SmallSet. It also means that +/// insert/erase/contains operations complete in negligible constant time: +/// insert and erase require one load and one store, and contains requires +/// just one load. +/// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_X86_IMMUTABLEGRAPH_H +#define LLVM_LIB_TARGET_X86_IMMUTABLEGRAPH_H + +#include "llvm/ADT/BitVector.h" +#include "llvm/ADT/GraphTraits.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/Support/raw_ostream.h" +#include +#include +#include +#include + +namespace llvm { + +template class ImmutableGraph { + using Traits = GraphTraits *>; + template friend class ImmutableGraphBuilder; + +public: + using node_value_type = NodeValueT; + using edge_value_type = EdgeValueT; + using size_type = int; + class Node; + class Edge { + friend class ImmutableGraph; + template friend class ImmutableGraphBuilder; + + const Node *Dest; + edge_value_type Value; + + public: + const Node *getDest() const { return Dest; }; + const edge_value_type &getValue() const { return Value; } + }; + class Node { + friend class ImmutableGraph; + template friend class ImmutableGraphBuilder; + + const Edge *Edges; + node_value_type Value; + + public: + const node_value_type &getValue() const { return Value; } + + const Edge *edges_begin() const { return Edges; } + // Nodes are allocated sequentially. Edges for a node are stored together. + // The end of this Node's edges is the beginning of the next node's edges. + // An extra node was allocated to hold the end pointer for the last real + // node. + const Edge *edges_end() const { return (this + 1)->Edges; } + ArrayRef edges() const { + return makeArrayRef(edges_begin(), edges_end()); + } + }; + +protected: + ImmutableGraph(std::unique_ptr Nodes, std::unique_ptr Edges, + size_type NodesSize, size_type EdgesSize) + : Nodes(std::move(Nodes)), Edges(std::move(Edges)), NodesSize(NodesSize), + EdgesSize(EdgesSize) {} + ImmutableGraph(const ImmutableGraph &) = delete; + ImmutableGraph(ImmutableGraph &&) = delete; + ImmutableGraph &operator=(const ImmutableGraph &) = delete; + ImmutableGraph &operator=(ImmutableGraph &&) = delete; + +public: + ArrayRef nodes() const { return makeArrayRef(Nodes.get(), NodesSize); } + const Node *nodes_begin() const { return nodes().begin(); } + const Node *nodes_end() const { return nodes().end(); } + + ArrayRef edges() const { return makeArrayRef(Edges.get(), EdgesSize); } + const Edge *edges_begin() const { return edges().begin(); } + const Edge *edges_end() const { return edges().end(); } + + size_type nodes_size() const { return NodesSize; } + size_type edges_size() const { return EdgesSize; } + + // Node N must belong to this ImmutableGraph. + size_type getNodeIndex(const Node &N) const { + return std::distance(nodes_begin(), &N); + } + // Edge E must belong to this ImmutableGraph. + size_type getEdgeIndex(const Edge &E) const { + return std::distance(edges_begin(), &E); + } + + // FIXME: Could NodeSet and EdgeSet be templated to share code? + class NodeSet { + const ImmutableGraph &G; + BitVector V; + + public: + NodeSet(const ImmutableGraph &G, bool ContainsAll = false) + : G{G}, V{static_cast(G.nodes_size()), ContainsAll} {} + bool insert(const Node &N) { + size_type Idx = G.getNodeIndex(N); + bool AlreadyExists = V.test(Idx); + V.set(Idx); + return !AlreadyExists; + } + void erase(const Node &N) { + size_type Idx = G.getNodeIndex(N); + V.reset(Idx); + } + bool contains(const Node &N) const { + size_type Idx = G.getNodeIndex(N); + return V.test(Idx); + } + void clear() { V.reset(); } + size_type empty() const { return V.none(); } + /// Return the number of elements in the set + size_type count() const { return V.count(); } + /// Return the size of the set's domain + size_type size() const { return V.size(); } + /// Set union + NodeSet &operator|=(const NodeSet &RHS) { + assert(&this->G == &RHS.G); + V |= RHS.V; + return *this; + } + /// Set intersection + NodeSet &operator&=(const NodeSet &RHS) { + assert(&this->G == &RHS.G); + V &= RHS.V; + return *this; + } + /// Set disjoint union + NodeSet &operator^=(const NodeSet &RHS) { + assert(&this->G == &RHS.G); + V ^= RHS.V; + return *this; + } + + using index_iterator = typename BitVector::const_set_bits_iterator; + index_iterator index_begin() const { return V.set_bits_begin(); } + index_iterator index_end() const { return V.set_bits_end(); } + void set(size_type Idx) { V.set(Idx); } + void reset(size_type Idx) { V.reset(Idx); } + + class iterator { + const NodeSet &Set; + size_type Current; + + void advance() { + assert(Current != -1); + Current = Set.V.find_next(Current); + } + + public: + iterator(const NodeSet &Set, size_type Begin) + : Set{Set}, Current{Begin} {} + iterator operator++(int) { + iterator Tmp = *this; + advance(); + return Tmp; + } + iterator &operator++() { + advance(); + return *this; + } + Node *operator*() const { + assert(Current != -1); + return Set.G.nodes_begin() + Current; + } + bool operator==(const iterator &other) const { + assert(&this->Set == &other.Set); + return this->Current == other.Current; + } + bool operator!=(const iterator &other) const { return !(*this == other); } + }; + + iterator begin() const { return iterator{*this, V.find_first()}; } + iterator end() const { return iterator{*this, -1}; } + }; + + class EdgeSet { + const ImmutableGraph &G; + BitVector V; + + public: + EdgeSet(const ImmutableGraph &G, bool ContainsAll = false) + : G{G}, V{static_cast(G.edges_size()), ContainsAll} {} + bool insert(const Edge &E) { + size_type Idx = G.getEdgeIndex(E); + bool AlreadyExists = V.test(Idx); + V.set(Idx); + return !AlreadyExists; + } + void erase(const Edge &E) { + size_type Idx = G.getEdgeIndex(E); + V.reset(Idx); + } + bool contains(const Edge &E) const { + size_type Idx = G.getEdgeIndex(E); + return V.test(Idx); + } + void clear() { V.reset(); } + bool empty() const { return V.none(); } + /// Return the number of elements in the set + size_type count() const { return V.count(); } + /// Return the size of the set's domain + size_type size() const { return V.size(); } + /// Set union + EdgeSet &operator|=(const EdgeSet &RHS) { + assert(&this->G == &RHS.G); + V |= RHS.V; + return *this; + } + /// Set intersection + EdgeSet &operator&=(const EdgeSet &RHS) { + assert(&this->G == &RHS.G); + V &= RHS.V; + return *this; + } + /// Set disjoint union + EdgeSet &operator^=(const EdgeSet &RHS) { + assert(&this->G == &RHS.G); + V ^= RHS.V; + return *this; + } + + using index_iterator = typename BitVector::const_set_bits_iterator; + index_iterator index_begin() const { return V.set_bits_begin(); } + index_iterator index_end() const { return V.set_bits_end(); } + void set(size_type Idx) { V.set(Idx); } + void reset(size_type Idx) { V.reset(Idx); } + + class iterator { + const EdgeSet &Set; + size_type Current; + + void advance() { + assert(Current != -1); + Current = Set.V.find_next(Current); + } + + public: + iterator(const EdgeSet &Set, size_type Begin) + : Set{Set}, Current{Begin} {} + iterator operator++(int) { + iterator Tmp = *this; + advance(); + return Tmp; + } + iterator &operator++() { + advance(); + return *this; + } + Edge *operator*() const { + assert(Current != -1); + return Set.G.edges_begin() + Current; + } + bool operator==(const iterator &other) const { + assert(&this->Set == &other.Set); + return this->Current == other.Current; + } + bool operator!=(const iterator &other) const { return !(*this == other); } + }; + + iterator begin() const { return iterator{*this, V.find_first()}; } + iterator end() const { return iterator{*this, -1}; } + }; + +private: + std::unique_ptr Nodes; + std::unique_ptr Edges; + size_type NodesSize; + size_type EdgesSize; +}; + +template class ImmutableGraphBuilder { + using node_value_type = typename GraphT::node_value_type; + using edge_value_type = typename GraphT::edge_value_type; + static_assert( + std::is_base_of, + GraphT>::value, + "Template argument to ImmutableGraphBuilder must derive from " + "ImmutableGraph<>"); + using size_type = typename GraphT::size_type; + using NodeSet = typename GraphT::NodeSet; + using Node = typename GraphT::Node; + using EdgeSet = typename GraphT::EdgeSet; + using Edge = typename GraphT::Edge; + using BuilderEdge = std::pair; + using EdgeList = std::vector; + using BuilderVertex = std::pair; + using VertexVec = std::vector; + +public: + using BuilderNodeRef = size_type; + + BuilderNodeRef addVertex(const node_value_type &V) { + auto I = AdjList.emplace(AdjList.end(), V, EdgeList{}); + return std::distance(AdjList.begin(), I); + } + + void addEdge(const edge_value_type &E, BuilderNodeRef From, + BuilderNodeRef To) { + AdjList[From].second.emplace_back(E, To); + } + + bool empty() const { return AdjList.empty(); } + + template std::unique_ptr get(ArgT &&... Args) { + size_type VertexSize = AdjList.size(), EdgeSize = 0; + for (const auto &V : AdjList) { + EdgeSize += V.second.size(); + } + auto VertexArray = + std::make_unique(VertexSize + 1 /* terminator node */); + auto EdgeArray = std::make_unique(EdgeSize); + size_type VI = 0, EI = 0; + for (; VI < VertexSize; ++VI) { + VertexArray[VI].Value = std::move(AdjList[VI].first); + VertexArray[VI].Edges = &EdgeArray[EI]; + auto NumEdges = static_cast(AdjList[VI].second.size()); + for (size_type VEI = 0; VEI < NumEdges; ++VEI, ++EI) { + auto &E = AdjList[VI].second[VEI]; + EdgeArray[EI].Value = std::move(E.first); + EdgeArray[EI].Dest = &VertexArray[E.second]; + } + } + assert(VI == VertexSize && EI == EdgeSize && "ImmutableGraph malformed"); + VertexArray[VI].Edges = &EdgeArray[EdgeSize]; // terminator node + return std::make_unique(std::move(VertexArray), + std::move(EdgeArray), VertexSize, EdgeSize, + std::forward(Args)...); + } + + template + static std::unique_ptr trim(const GraphT &G, const NodeSet &TrimNodes, + const EdgeSet &TrimEdges, + ArgT &&... Args) { + size_type NewVertexSize = G.nodes_size() - TrimNodes.count(); + size_type NewEdgeSize = G.edges_size() - TrimEdges.count(); + auto NewVertexArray = + std::make_unique(NewVertexSize + 1 /* terminator node */); + auto NewEdgeArray = std::make_unique(NewEdgeSize); + + // Walk the nodes and determine the new index for each node. + size_type NewNodeIndex = 0; + std::vector RemappedNodeIndex(G.nodes_size()); + for (const Node &N : G.nodes()) { + if (TrimNodes.contains(N)) + continue; + RemappedNodeIndex[G.getNodeIndex(N)] = NewNodeIndex++; + } + assert(NewNodeIndex == NewVertexSize && + "Should have assigned NewVertexSize indices"); + + size_type VertexI = 0, EdgeI = 0; + for (const Node &N : G.nodes()) { + if (TrimNodes.contains(N)) + continue; + NewVertexArray[VertexI].Value = N.getValue(); + NewVertexArray[VertexI].Edges = &NewEdgeArray[EdgeI]; + for (const Edge &E : N.edges()) { + if (TrimEdges.contains(E)) + continue; + NewEdgeArray[EdgeI].Value = E.getValue(); + size_type DestIdx = G.getNodeIndex(*E.getDest()); + size_type NewIdx = RemappedNodeIndex[DestIdx]; + assert(NewIdx < NewVertexSize); + NewEdgeArray[EdgeI].Dest = &NewVertexArray[NewIdx]; + ++EdgeI; + } + ++VertexI; + } + assert(VertexI == NewVertexSize && EdgeI == NewEdgeSize && + "Gadget graph malformed"); + NewVertexArray[VertexI].Edges = &NewEdgeArray[NewEdgeSize]; // terminator + return std::make_unique(std::move(NewVertexArray), + std::move(NewEdgeArray), NewVertexSize, + NewEdgeSize, std::forward(Args)...); + } + +private: + VertexVec AdjList; +}; + +template +struct GraphTraits *> { + using GraphT = ImmutableGraph; + using NodeRef = typename GraphT::Node const *; + using EdgeRef = typename GraphT::Edge const &; + + static NodeRef edge_dest(EdgeRef E) { return E.getDest(); } + using ChildIteratorType = + mapped_iterator; + + static NodeRef getEntryNode(GraphT *G) { return G->nodes_begin(); } + static ChildIteratorType child_begin(NodeRef N) { + return {N->edges_begin(), &edge_dest}; + } + static ChildIteratorType child_end(NodeRef N) { + return {N->edges_end(), &edge_dest}; + } + + static NodeRef getNode(typename GraphT::Node const &N) { return NodeRef{&N}; } + using nodes_iterator = + mapped_iterator; + static nodes_iterator nodes_begin(GraphT *G) { + return {G->nodes_begin(), &getNode}; + } + static nodes_iterator nodes_end(GraphT *G) { + return {G->nodes_end(), &getNode}; + } + + using ChildEdgeIteratorType = typename GraphT::Edge const *; + + static ChildEdgeIteratorType child_edge_begin(NodeRef N) { + return N->edges_begin(); + } + static ChildEdgeIteratorType child_edge_end(NodeRef N) { + return N->edges_end(); + } + static typename GraphT::size_type size(GraphT *G) { return G->nodes_size(); } +}; + +} // end namespace llvm + +#endif // LLVM_LIB_TARGET_X86_IMMUTABLEGRAPH_H diff --git a/llvm/lib/Target/X86/X86.h b/llvm/lib/Target/X86/X86.h index 0481a40d462aeb..39b2f814defaab 100644 --- a/llvm/lib/Target/X86/X86.h +++ b/llvm/lib/Target/X86/X86.h @@ -120,7 +120,7 @@ FunctionPass *createX86DomainReassignmentPass(); FunctionPass *createX86EvexToVexInsts(); /// This pass creates the thunks for the retpoline feature. -FunctionPass *createX86RetpolineThunksPass(); +FunctionPass *createX86IndirectThunksPass(); /// This pass ensures instructions featuring a memory operand /// have distinctive (with respect to eachother) @@ -133,6 +133,8 @@ InstructionSelector *createX86InstructionSelector(const X86TargetMachine &TM, X86Subtarget &, X86RegisterBankInfo &); +FunctionPass *createX86LoadValueInjectionLoadHardeningPass(); +FunctionPass *createX86LoadValueInjectionRetHardeningPass(); FunctionPass *createX86SpeculativeLoadHardeningPass(); void initializeEvexToVexInstPassPass(PassRegistry &); @@ -148,6 +150,8 @@ void initializeX86DomainReassignmentPass(PassRegistry &); void initializeX86ExecutionDomainFixPass(PassRegistry &); void initializeX86ExpandPseudoPass(PassRegistry &); void initializeX86FlagsCopyLoweringPassPass(PassRegistry &); +void initializeX86LoadValueInjectionLoadHardeningPassPass(PassRegistry &); +void initializeX86LoadValueInjectionRetHardeningPassPass(PassRegistry &); void initializeX86OptimizeLEAPassPass(PassRegistry &); void initializeX86SpeculativeLoadHardeningPassPass(PassRegistry &); diff --git a/llvm/lib/Target/X86/X86.td b/llvm/lib/Target/X86/X86.td index a2b11d55f65095..bb8952f54e3abc 100644 --- a/llvm/lib/Target/X86/X86.td +++ b/llvm/lib/Target/X86/X86.td @@ -426,6 +426,22 @@ def FeatureRetpolineExternalThunk "ourselves. Only has effect when combined with some other retpoline " "feature", [FeatureRetpolineIndirectCalls]>; +// Mitigate LVI attacks against indirect calls/branches and call returns +def FeatureLVIControlFlowIntegrity + : SubtargetFeature< + "lvi-cfi", "UseLVIControlFlowIntegrity", "true", + "Prevent indirect calls/branches from using a memory operand, and " + "precede all indirect calls/branches from a register with an " + "LFENCE instruction to serialize control flow. Also decompose RET " + "instructions into a POP+LFENCE+JMP sequence.">; + +// Mitigate LVI attacks against data loads +def FeatureLVILoadHardening + : SubtargetFeature< + "lvi-load-hardening", "UseLVILoadHardening", "true", + "Insert LFENCE instructions to prevent data speculatively injected " + "into loads from being used maliciously.">; + // Direct Move instructions. def FeatureMOVDIRI : SubtargetFeature<"movdiri", "HasMOVDIRI", "true", "Support movdiri instruction">; diff --git a/llvm/lib/Target/X86/X86FastISel.cpp b/llvm/lib/Target/X86/X86FastISel.cpp index 1dbf406835646d..a1d256ea872d80 100644 --- a/llvm/lib/Target/X86/X86FastISel.cpp +++ b/llvm/lib/Target/X86/X86FastISel.cpp @@ -3202,8 +3202,8 @@ bool X86FastISel::fastLowerCall(CallLoweringInfo &CLI) { (CalledFn && CalledFn->hasFnAttribute("no_caller_saved_registers"))) return false; - // Functions using retpoline for indirect calls need to use SDISel. - if (Subtarget->useRetpolineIndirectCalls()) + // Functions using thunks for indirect calls need to use SDISel. + if (Subtarget->useIndirectThunkCalls()) return false; // Handle only C, fastcc, and webkit_js calling conventions for now. diff --git a/llvm/lib/Target/X86/X86FrameLowering.cpp b/llvm/lib/Target/X86/X86FrameLowering.cpp index 799c1f5d1285ec..1da20371caf514 100644 --- a/llvm/lib/Target/X86/X86FrameLowering.cpp +++ b/llvm/lib/Target/X86/X86FrameLowering.cpp @@ -765,10 +765,10 @@ void X86FrameLowering::emitStackProbeCall(MachineFunction &MF, bool InProlog) const { bool IsLargeCodeModel = MF.getTarget().getCodeModel() == CodeModel::Large; - // FIXME: Add retpoline support and remove this. - if (Is64Bit && IsLargeCodeModel && STI.useRetpolineIndirectCalls()) + // FIXME: Add indirect thunk support and remove this. + if (Is64Bit && IsLargeCodeModel && STI.useIndirectThunkCalls()) report_fatal_error("Emitting stack probe calls on 64-bit with the large " - "code model and retpoline not yet implemented."); + "code model and indirect thunks not yet implemented."); unsigned CallOp; if (Is64Bit) @@ -2493,9 +2493,9 @@ void X86FrameLowering::adjustForSegmentedStacks( // is laid out within 2^31 bytes of each function body, but this seems // to be sufficient for JIT. // FIXME: Add retpoline support and remove the error here.. - if (STI.useRetpolineIndirectCalls()) + if (STI.useIndirectThunkCalls()) report_fatal_error("Emitting morestack calls on 64-bit with the large " - "code model and retpoline not yet implemented."); + "code model and thunks not yet implemented."); BuildMI(allocMBB, DL, TII.get(X86::CALL64m)) .addReg(X86::RIP) .addImm(0) diff --git a/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp b/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp index bf33f399db28ae..88af0ebcfd0e8e 100644 --- a/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp +++ b/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp @@ -987,7 +987,7 @@ void X86DAGToDAGISel::PreprocessISelDAG() { if (OptLevel != CodeGenOpt::None && // Only do this when the target can fold the load into the call or // jmp. - !Subtarget->useRetpolineIndirectCalls() && + !Subtarget->useIndirectThunkCalls() && ((N->getOpcode() == X86ISD::CALL && !Subtarget->slowTwoMemOps()) || (N->getOpcode() == X86ISD::TC_RETURN && (Subtarget->is64Bit() || diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 60eefbc677da5c..7a5075cefda93c 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -30218,8 +30218,8 @@ bool X86TargetLowering::isVectorClearMaskLegal(ArrayRef Mask, } bool X86TargetLowering::areJTsAllowed(const Function *Fn) const { - // If the subtarget is using retpolines, we need to not generate jump tables. - if (Subtarget.useRetpolineIndirectBranches()) + // If the subtarget is using thunks, we need to not generate jump tables. + if (Subtarget.useIndirectThunkBranches()) return false; // Otherwise, fallback on the generic logic. @@ -31342,22 +31342,22 @@ X86TargetLowering::EmitLoweredTLSCall(MachineInstr &MI, return BB; } -static unsigned getOpcodeForRetpoline(unsigned RPOpc) { +static unsigned getOpcodeForIndirectThunk(unsigned RPOpc) { switch (RPOpc) { - case X86::RETPOLINE_CALL32: + case X86::INDIRECT_THUNK_CALL32: return X86::CALLpcrel32; - case X86::RETPOLINE_CALL64: + case X86::INDIRECT_THUNK_CALL64: return X86::CALL64pcrel32; - case X86::RETPOLINE_TCRETURN32: + case X86::INDIRECT_THUNK_TCRETURN32: return X86::TCRETURNdi; - case X86::RETPOLINE_TCRETURN64: + case X86::INDIRECT_THUNK_TCRETURN64: return X86::TCRETURNdi64; } - llvm_unreachable("not retpoline opcode"); + llvm_unreachable("not indirect thunk opcode"); } -static const char *getRetpolineSymbol(const X86Subtarget &Subtarget, - unsigned Reg) { +static const char *getIndirectThunkSymbol(const X86Subtarget &Subtarget, + unsigned Reg) { if (Subtarget.useRetpolineExternalThunk()) { // When using an external thunk for retpolines, we pick names that match the // names GCC happens to use as well. This helps simplify the implementation @@ -31389,39 +31389,48 @@ static const char *getRetpolineSymbol(const X86Subtarget &Subtarget, assert(Subtarget.is64Bit() && "Should not be using a 64-bit thunk!"); return "__x86_indirect_thunk_r11"; } + llvm_unreachable("unexpected reg for external indirect thunk"); + } + + if (Subtarget.useRetpolineIndirectCalls() || + Subtarget.useRetpolineIndirectBranches()) { + // When targeting an internal COMDAT thunk use an LLVM-specific name. + switch (Reg) { + case X86::EAX: + assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!"); + return "__llvm_retpoline_eax"; + case X86::ECX: + assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!"); + return "__llvm_retpoline_ecx"; + case X86::EDX: + assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!"); + return "__llvm_retpoline_edx"; + case X86::EDI: + assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!"); + return "__llvm_retpoline_edi"; + case X86::R11: + assert(Subtarget.is64Bit() && "Should not be using a 64-bit thunk!"); + return "__llvm_retpoline_r11"; + } llvm_unreachable("unexpected reg for retpoline"); } - // When targeting an internal COMDAT thunk use an LLVM-specific name. - switch (Reg) { - case X86::EAX: - assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!"); - return "__llvm_retpoline_eax"; - case X86::ECX: - assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!"); - return "__llvm_retpoline_ecx"; - case X86::EDX: - assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!"); - return "__llvm_retpoline_edx"; - case X86::EDI: - assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!"); - return "__llvm_retpoline_edi"; - case X86::R11: + if (Subtarget.useLVIControlFlowIntegrity()) { assert(Subtarget.is64Bit() && "Should not be using a 64-bit thunk!"); - return "__llvm_retpoline_r11"; + return "__llvm_lvi_thunk_r11"; } - llvm_unreachable("unexpected reg for retpoline"); + llvm_unreachable("getIndirectThunkSymbol() invoked without thunk feature"); } MachineBasicBlock * -X86TargetLowering::EmitLoweredRetpoline(MachineInstr &MI, - MachineBasicBlock *BB) const { +X86TargetLowering::EmitLoweredIndirectThunk(MachineInstr &MI, + MachineBasicBlock *BB) const { // Copy the virtual register into the R11 physical register and // call the retpoline thunk. DebugLoc DL = MI.getDebugLoc(); const X86InstrInfo *TII = Subtarget.getInstrInfo(); Register CalleeVReg = MI.getOperand(0).getReg(); - unsigned Opc = getOpcodeForRetpoline(MI.getOpcode()); + unsigned Opc = getOpcodeForIndirectThunk(MI.getOpcode()); // Find an available scratch register to hold the callee. On 64-bit, we can // just use R11, but we scan for uses anyway to ensure we don't generate @@ -31455,7 +31464,7 @@ X86TargetLowering::EmitLoweredRetpoline(MachineInstr &MI, report_fatal_error("calling convention incompatible with retpoline, no " "available registers"); - const char *Symbol = getRetpolineSymbol(Subtarget, AvailableReg); + const char *Symbol = getIndirectThunkSymbol(Subtarget, AvailableReg); BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), AvailableReg) .addReg(CalleeVReg); @@ -32231,11 +32240,11 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, case X86::TLS_base_addr32: case X86::TLS_base_addr64: return EmitLoweredTLSAddr(MI, BB); - case X86::RETPOLINE_CALL32: - case X86::RETPOLINE_CALL64: - case X86::RETPOLINE_TCRETURN32: - case X86::RETPOLINE_TCRETURN64: - return EmitLoweredRetpoline(MI, BB); + case X86::INDIRECT_THUNK_CALL32: + case X86::INDIRECT_THUNK_CALL64: + case X86::INDIRECT_THUNK_TCRETURN32: + case X86::INDIRECT_THUNK_TCRETURN64: + return EmitLoweredIndirectThunk(MI, BB); case X86::CATCHRET: return EmitLoweredCatchRet(MI, BB); case X86::CATCHPAD: diff --git a/llvm/lib/Target/X86/X86ISelLowering.h b/llvm/lib/Target/X86/X86ISelLowering.h index 3a17099da38f11..830cdfc79c0a33 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.h +++ b/llvm/lib/Target/X86/X86ISelLowering.h @@ -1482,8 +1482,8 @@ namespace llvm { MachineBasicBlock *EmitLoweredTLSCall(MachineInstr &MI, MachineBasicBlock *BB) const; - MachineBasicBlock *EmitLoweredRetpoline(MachineInstr &MI, - MachineBasicBlock *BB) const; + MachineBasicBlock *EmitLoweredIndirectThunk(MachineInstr &MI, + MachineBasicBlock *BB) const; MachineBasicBlock *emitEHSjLjSetJmp(MachineInstr &MI, MachineBasicBlock *MBB) const; diff --git a/llvm/lib/Target/X86/X86IndirectThunks.cpp b/llvm/lib/Target/X86/X86IndirectThunks.cpp new file mode 100644 index 00000000000000..36b9c3ccc959a5 --- /dev/null +++ b/llvm/lib/Target/X86/X86IndirectThunks.cpp @@ -0,0 +1,364 @@ +//==- X86IndirectThunks.cpp - Construct indirect call/jump thunks for x86 --=// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// \file +/// +/// Pass that injects an MI thunk that is used to lower indirect calls in a way +/// that prevents speculation on some x86 processors and can be used to mitigate +/// security vulnerabilities due to targeted speculative execution and side +/// channels such as CVE-2017-5715. +/// +/// Currently supported thunks include: +/// - Retpoline -- A RET-implemented trampoline that lowers indirect calls +/// - LVI Thunk -- A CALL/JMP-implemented thunk that forces load serialization +/// before making an indirect call/jump +/// +/// Note that the reason that this is implemented as a MachineFunctionPass and +/// not a ModulePass is that ModulePasses at this point in the LLVM X86 pipeline +/// serialize all transformations, which can consume lots of memory. +/// +/// TODO(chandlerc): All of this code could use better comments and +/// documentation. +/// +//===----------------------------------------------------------------------===// + +#include "X86.h" +#include "X86InstrBuilder.h" +#include "X86Subtarget.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineModuleInfo.h" +#include "llvm/CodeGen/Passes.h" +#include "llvm/CodeGen/TargetPassConfig.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/Module.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" + +using namespace llvm; + +#define DEBUG_TYPE "x86-retpoline-thunks" + +static const char RetpolineNamePrefix[] = "__llvm_retpoline_"; +static const char R11RetpolineName[] = "__llvm_retpoline_r11"; +static const char EAXRetpolineName[] = "__llvm_retpoline_eax"; +static const char ECXRetpolineName[] = "__llvm_retpoline_ecx"; +static const char EDXRetpolineName[] = "__llvm_retpoline_edx"; +static const char EDIRetpolineName[] = "__llvm_retpoline_edi"; + +static const char LVIThunkNamePrefix[] = "__llvm_lvi_thunk_"; +static const char R11LVIThunkName[] = "__llvm_lvi_thunk_r11"; + +namespace { +template class ThunkInserter { + Derived &getDerived() { return *static_cast(this); } + +protected: + bool InsertedThunks; + void doInitialization(Module &M) {} + void createThunkFunction(MachineModuleInfo &MMI, StringRef Name); + +public: + void init(Module &M) { + InsertedThunks = false; + getDerived().doInitialization(M); + } + // return `true` if `MMI` or `MF` was modified + bool run(MachineModuleInfo &MMI, MachineFunction &MF); +}; + +struct RetpolineThunkInserter : ThunkInserter { + const char *getThunkPrefix() { return RetpolineNamePrefix; } + bool mayUseThunk(const MachineFunction &MF) { + const auto &STI = MF.getSubtarget(); + return (STI.useRetpolineIndirectCalls() || + STI.useRetpolineIndirectBranches()) && + !STI.useRetpolineExternalThunk(); + } + void insertThunks(MachineModuleInfo &MMI); + void populateThunk(MachineFunction &MF); +}; + +struct LVIThunkInserter : ThunkInserter { + const char *getThunkPrefix() { return LVIThunkNamePrefix; } + bool mayUseThunk(const MachineFunction &MF) { + return MF.getSubtarget().useLVIControlFlowIntegrity(); + } + void insertThunks(MachineModuleInfo &MMI) { + createThunkFunction(MMI, R11LVIThunkName); + } + void populateThunk(MachineFunction &MF) { + // Grab the entry MBB and erase any other blocks. O0 codegen appears to + // generate two bbs for the entry block. + MachineBasicBlock *Entry = &MF.front(); + Entry->clear(); + while (MF.size() > 1) + MF.erase(std::next(MF.begin())); + + // This code mitigates LVI by replacing each indirect call/jump with a + // direct call/jump to a thunk that looks like: + // ``` + // lfence + // jmpq *%r11 + // ``` + // This ensures that if the value in register %r11 was loaded from memory, + // then the value in %r11 is (architecturally) correct prior to the jump. + const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo(); + BuildMI(&MF.front(), DebugLoc(), TII->get(X86::LFENCE)); + BuildMI(&MF.front(), DebugLoc(), TII->get(X86::JMP64r)).addReg(X86::R11); + MF.front().addLiveIn(X86::R11); + return; + } +}; + +class X86IndirectThunks : public MachineFunctionPass { +public: + static char ID; + + X86IndirectThunks() : MachineFunctionPass(ID) {} + + StringRef getPassName() const override { return "X86 Indirect Thunks"; } + + bool doInitialization(Module &M) override; + bool runOnMachineFunction(MachineFunction &MF) override; + + void getAnalysisUsage(AnalysisUsage &AU) const override { + MachineFunctionPass::getAnalysisUsage(AU); + AU.addRequired(); + AU.addPreserved(); + } + +private: + std::tuple TIs; + + // FIXME: When LLVM moves to C++17, these can become folds + template + static void initTIs(Module &M, + std::tuple &ThunkInserters) { + (void)std::initializer_list{ + (std::get(ThunkInserters).init(M), 0)...}; + } + template + static bool runTIs(MachineModuleInfo &MMI, MachineFunction &MF, + std::tuple &ThunkInserters) { + bool Modified = false; + (void)std::initializer_list{ + Modified |= std::get(ThunkInserters).run(MMI, MF)...}; + return Modified; + } +}; + +} // end anonymous namespace + +void RetpolineThunkInserter::insertThunks(MachineModuleInfo &MMI) { + if (MMI.getTarget().getTargetTriple().getArch() == Triple::x86_64) + createThunkFunction(MMI, R11RetpolineName); + else + for (StringRef Name : {EAXRetpolineName, ECXRetpolineName, EDXRetpolineName, + EDIRetpolineName}) + createThunkFunction(MMI, Name); +} + +void RetpolineThunkInserter::populateThunk(MachineFunction &MF) { + bool Is64Bit = MF.getTarget().getTargetTriple().getArch() == Triple::x86_64; + Register ThunkReg; + if (Is64Bit) { + assert(MF.getName() == "__llvm_retpoline_r11" && + "Should only have an r11 thunk on 64-bit targets"); + + // __llvm_retpoline_r11: + // callq .Lr11_call_target + // .Lr11_capture_spec: + // pause + // lfence + // jmp .Lr11_capture_spec + // .align 16 + // .Lr11_call_target: + // movq %r11, (%rsp) + // retq + ThunkReg = X86::R11; + } else { + // For 32-bit targets we need to emit a collection of thunks for various + // possible scratch registers as well as a fallback that uses EDI, which is + // normally callee saved. + // __llvm_retpoline_eax: + // calll .Leax_call_target + // .Leax_capture_spec: + // pause + // jmp .Leax_capture_spec + // .align 16 + // .Leax_call_target: + // movl %eax, (%esp) # Clobber return addr + // retl + // + // __llvm_retpoline_ecx: + // ... # Same setup + // movl %ecx, (%esp) + // retl + // + // __llvm_retpoline_edx: + // ... # Same setup + // movl %edx, (%esp) + // retl + // + // __llvm_retpoline_edi: + // ... # Same setup + // movl %edi, (%esp) + // retl + if (MF.getName() == EAXRetpolineName) + ThunkReg = X86::EAX; + else if (MF.getName() == ECXRetpolineName) + ThunkReg = X86::ECX; + else if (MF.getName() == EDXRetpolineName) + ThunkReg = X86::EDX; + else if (MF.getName() == EDIRetpolineName) + ThunkReg = X86::EDI; + else + llvm_unreachable("Invalid thunk name on x86-32!"); + } + + const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo(); + // Grab the entry MBB and erase any other blocks. O0 codegen appears to + // generate two bbs for the entry block. + MachineBasicBlock *Entry = &MF.front(); + Entry->clear(); + while (MF.size() > 1) + MF.erase(std::next(MF.begin())); + + MachineBasicBlock *CaptureSpec = + MF.CreateMachineBasicBlock(Entry->getBasicBlock()); + MachineBasicBlock *CallTarget = + MF.CreateMachineBasicBlock(Entry->getBasicBlock()); + MCSymbol *TargetSym = MF.getContext().createTempSymbol(); + MF.push_back(CaptureSpec); + MF.push_back(CallTarget); + + const unsigned CallOpc = Is64Bit ? X86::CALL64pcrel32 : X86::CALLpcrel32; + const unsigned RetOpc = Is64Bit ? X86::RETQ : X86::RETL; + + Entry->addLiveIn(ThunkReg); + BuildMI(Entry, DebugLoc(), TII->get(CallOpc)).addSym(TargetSym); + + // The MIR verifier thinks that the CALL in the entry block will fall through + // to CaptureSpec, so mark it as the successor. Technically, CaptureTarget is + // the successor, but the MIR verifier doesn't know how to cope with that. + Entry->addSuccessor(CaptureSpec); + + // In the capture loop for speculation, we want to stop the processor from + // speculating as fast as possible. On Intel processors, the PAUSE instruction + // will block speculation without consuming any execution resources. On AMD + // processors, the PAUSE instruction is (essentially) a nop, so we also use an + // LFENCE instruction which they have advised will stop speculation as well + // with minimal resource utilization. We still end the capture with a jump to + // form an infinite loop to fully guarantee that no matter what implementation + // of the x86 ISA, speculating this code path never escapes. + BuildMI(CaptureSpec, DebugLoc(), TII->get(X86::PAUSE)); + BuildMI(CaptureSpec, DebugLoc(), TII->get(X86::LFENCE)); + BuildMI(CaptureSpec, DebugLoc(), TII->get(X86::JMP_1)).addMBB(CaptureSpec); + CaptureSpec->setHasAddressTaken(); + CaptureSpec->addSuccessor(CaptureSpec); + + CallTarget->addLiveIn(ThunkReg); + CallTarget->setHasAddressTaken(); + CallTarget->setAlignment(Align(16)); + + // Insert return address clobber + const unsigned MovOpc = Is64Bit ? X86::MOV64mr : X86::MOV32mr; + const Register SPReg = Is64Bit ? X86::RSP : X86::ESP; + addRegOffset(BuildMI(CallTarget, DebugLoc(), TII->get(MovOpc)), SPReg, false, + 0) + .addReg(ThunkReg); + + CallTarget->back().setPreInstrSymbol(MF, TargetSym); + BuildMI(CallTarget, DebugLoc(), TII->get(RetOpc)); +} + +template +void ThunkInserter::createThunkFunction(MachineModuleInfo &MMI, + StringRef Name) { + assert(Name.startswith(getDerived().getThunkPrefix()) && + "Created a thunk with an unexpected prefix!"); + + Module &M = const_cast(*MMI.getModule()); + LLVMContext &Ctx = M.getContext(); + auto Type = FunctionType::get(Type::getVoidTy(Ctx), false); + Function *F = + Function::Create(Type, GlobalValue::LinkOnceODRLinkage, Name, &M); + F->setVisibility(GlobalValue::HiddenVisibility); + F->setComdat(M.getOrInsertComdat(Name)); + + // Add Attributes so that we don't create a frame, unwind information, or + // inline. + AttrBuilder B; + B.addAttribute(llvm::Attribute::NoUnwind); + B.addAttribute(llvm::Attribute::Naked); + F->addAttributes(llvm::AttributeList::FunctionIndex, B); + + // Populate our function a bit so that we can verify. + BasicBlock *Entry = BasicBlock::Create(Ctx, "entry", F); + IRBuilder<> Builder(Entry); + + Builder.CreateRetVoid(); + + // MachineFunctions/MachineBasicBlocks aren't created automatically for the + // IR-level constructs we already made. Create them and insert them into the + // module. + MachineFunction &MF = MMI.getOrCreateMachineFunction(*F); + MachineBasicBlock *EntryMBB = MF.CreateMachineBasicBlock(Entry); + + // Insert EntryMBB into MF. It's not in the module until we do this. + MF.insert(MF.end(), EntryMBB); + // Set MF properties. We never use vregs... + MF.getProperties().set(MachineFunctionProperties::Property::NoVRegs); +} + +template +bool ThunkInserter::run(MachineModuleInfo &MMI, MachineFunction &MF) { + // If MF is not a thunk, check to see if we need to insert a thunk. + if (!MF.getName().startswith(getDerived().getThunkPrefix())) { + // If we've already inserted a thunk, nothing else to do. + if (InsertedThunks) + return false; + + // Only add a thunk if one of the functions has the corresponding feature + // enabled in its subtarget, and doesn't enable external thunks. + // FIXME: Conditionalize on indirect calls so we don't emit a thunk when + // nothing will end up calling it. + // FIXME: It's a little silly to look at every function just to enumerate + // the subtargets, but eventually we'll want to look at them for indirect + // calls, so maybe this is OK. + if (!getDerived().mayUseThunk(MF)) + return false; + + getDerived().insertThunks(MMI); + InsertedThunks = true; + return true; + } + + // If this *is* a thunk function, we need to populate it with the correct MI. + getDerived().populateThunk(MF); + return true; +} + +FunctionPass *llvm::createX86IndirectThunksPass() { + return new X86IndirectThunks(); +} + +char X86IndirectThunks::ID = 0; + +bool X86IndirectThunks::doInitialization(Module &M) { + initTIs(M, TIs); + return false; +} + +bool X86IndirectThunks::runOnMachineFunction(MachineFunction &MF) { + LLVM_DEBUG(dbgs() << getPassName() << '\n'); + auto &MMI = getAnalysis().getMMI(); + return runTIs(MMI, MF, TIs); +} diff --git a/llvm/lib/Target/X86/X86InstrCompiler.td b/llvm/lib/Target/X86/X86InstrCompiler.td index 78d8dd3c0d0317..1fdac104cb73df 100644 --- a/llvm/lib/Target/X86/X86InstrCompiler.td +++ b/llvm/lib/Target/X86/X86InstrCompiler.td @@ -1213,14 +1213,14 @@ def X86tcret_6regs : PatFrag<(ops node:$ptr, node:$off), def : Pat<(X86tcret ptr_rc_tailcall:$dst, imm:$off), (TCRETURNri ptr_rc_tailcall:$dst, imm:$off)>, - Requires<[Not64BitMode, NotUseRetpolineIndirectCalls]>; + Requires<[Not64BitMode, NotUseIndirectThunkCalls]>; // FIXME: This is disabled for 32-bit PIC mode because the global base // register which is part of the address mode may be assigned a // callee-saved register. def : Pat<(X86tcret (load addr:$dst), imm:$off), (TCRETURNmi addr:$dst, imm:$off)>, - Requires<[Not64BitMode, IsNotPIC, NotUseRetpolineIndirectCalls]>; + Requires<[Not64BitMode, IsNotPIC, NotUseIndirectThunkCalls]>; def : Pat<(X86tcret (i32 tglobaladdr:$dst), imm:$off), (TCRETURNdi tglobaladdr:$dst, imm:$off)>, @@ -1232,21 +1232,21 @@ def : Pat<(X86tcret (i32 texternalsym:$dst), imm:$off), def : Pat<(X86tcret ptr_rc_tailcall:$dst, imm:$off), (TCRETURNri64 ptr_rc_tailcall:$dst, imm:$off)>, - Requires<[In64BitMode, NotUseRetpolineIndirectCalls]>; + Requires<[In64BitMode, NotUseIndirectThunkCalls]>; // Don't fold loads into X86tcret requiring more than 6 regs. // There wouldn't be enough scratch registers for base+index. def : Pat<(X86tcret_6regs (load addr:$dst), imm:$off), (TCRETURNmi64 addr:$dst, imm:$off)>, - Requires<[In64BitMode, NotUseRetpolineIndirectCalls]>; + Requires<[In64BitMode, NotUseIndirectThunkCalls]>; def : Pat<(X86tcret ptr_rc_tailcall:$dst, imm:$off), - (RETPOLINE_TCRETURN64 ptr_rc_tailcall:$dst, imm:$off)>, - Requires<[In64BitMode, UseRetpolineIndirectCalls]>; + (INDIRECT_THUNK_TCRETURN64 ptr_rc_tailcall:$dst, imm:$off)>, + Requires<[In64BitMode, UseIndirectThunkCalls]>; def : Pat<(X86tcret ptr_rc_tailcall:$dst, imm:$off), - (RETPOLINE_TCRETURN32 ptr_rc_tailcall:$dst, imm:$off)>, - Requires<[Not64BitMode, UseRetpolineIndirectCalls]>; + (INDIRECT_THUNK_TCRETURN32 ptr_rc_tailcall:$dst, imm:$off)>, + Requires<[Not64BitMode, UseIndirectThunkCalls]>; def : Pat<(X86tcret (i64 tglobaladdr:$dst), imm:$off), (TCRETURNdi64 tglobaladdr:$dst, imm:$off)>, diff --git a/llvm/lib/Target/X86/X86InstrControl.td b/llvm/lib/Target/X86/X86InstrControl.td index 32faeb1a86f2b3..1842dc19ec2e10 100644 --- a/llvm/lib/Target/X86/X86InstrControl.td +++ b/llvm/lib/Target/X86/X86InstrControl.td @@ -237,13 +237,13 @@ let isCall = 1 in Sched<[WriteJumpLd]>; def CALL32r : I<0xFF, MRM2r, (outs), (ins GR32:$dst), "call{l}\t{*}$dst", [(X86call GR32:$dst)]>, OpSize32, - Requires<[Not64BitMode,NotUseRetpolineIndirectCalls]>, + Requires<[Not64BitMode,NotUseIndirectThunkCalls]>, Sched<[WriteJump]>; def CALL32m : I<0xFF, MRM2m, (outs), (ins i32mem:$dst), "call{l}\t{*}$dst", [(X86call (loadi32 addr:$dst))]>, OpSize32, Requires<[Not64BitMode,FavorMemIndirectCall, - NotUseRetpolineIndirectCalls]>, + NotUseIndirectThunkCalls]>, Sched<[WriteJumpLd]>; // Non-tracking calls for IBT, use with caution. @@ -334,11 +334,11 @@ let isCall = 1, Uses = [RSP, SSP], SchedRW = [WriteJump] in { Requires<[In64BitMode]>; def CALL64r : I<0xFF, MRM2r, (outs), (ins GR64:$dst), "call{q}\t{*}$dst", [(X86call GR64:$dst)]>, - Requires<[In64BitMode,NotUseRetpolineIndirectCalls]>; + Requires<[In64BitMode,NotUseIndirectThunkCalls]>; def CALL64m : I<0xFF, MRM2m, (outs), (ins i64mem:$dst), "call{q}\t{*}$dst", [(X86call (loadi64 addr:$dst))]>, Requires<[In64BitMode,FavorMemIndirectCall, - NotUseRetpolineIndirectCalls]>; + NotUseIndirectThunkCalls]>; // Non-tracking calls for IBT, use with caution. let isCodeGenOnly = 1 in { @@ -393,19 +393,19 @@ let isPseudo = 1, isCall = 1, isCodeGenOnly = 1, Uses = [RSP, SSP], usesCustomInserter = 1, SchedRW = [WriteJump] in { - def RETPOLINE_CALL32 : + def INDIRECT_THUNK_CALL32 : PseudoI<(outs), (ins GR32:$dst), [(X86call GR32:$dst)]>, - Requires<[Not64BitMode,UseRetpolineIndirectCalls]>; + Requires<[Not64BitMode,UseIndirectThunkCalls]>; - def RETPOLINE_CALL64 : + def INDIRECT_THUNK_CALL64 : PseudoI<(outs), (ins GR64:$dst), [(X86call GR64:$dst)]>, - Requires<[In64BitMode,UseRetpolineIndirectCalls]>; + Requires<[In64BitMode,UseIndirectThunkCalls]>; - // Retpoline variant of indirect tail calls. + // Indirect thunk variant of indirect tail calls. let isTerminator = 1, isReturn = 1, isBarrier = 1 in { - def RETPOLINE_TCRETURN64 : + def INDIRECT_THUNK_TCRETURN64 : PseudoI<(outs), (ins GR64:$dst, i32imm:$offset), []>; - def RETPOLINE_TCRETURN32 : + def INDIRECT_THUNK_TCRETURN32 : PseudoI<(outs), (ins GR32:$dst, i32imm:$offset), []>; } } diff --git a/llvm/lib/Target/X86/X86InstrInfo.td b/llvm/lib/Target/X86/X86InstrInfo.td index ca5425e8b89fd3..93f40c8ec996ba 100644 --- a/llvm/lib/Target/X86/X86InstrInfo.td +++ b/llvm/lib/Target/X86/X86InstrInfo.td @@ -996,8 +996,8 @@ def HasFastLZCNT : Predicate<"Subtarget->hasFastLZCNT()">; def HasFastSHLDRotate : Predicate<"Subtarget->hasFastSHLDRotate()">; def HasERMSB : Predicate<"Subtarget->hasERMSB()">; def HasMFence : Predicate<"Subtarget->hasMFence()">; -def UseRetpolineIndirectCalls : Predicate<"Subtarget->useRetpolineIndirectCalls()">; -def NotUseRetpolineIndirectCalls : Predicate<"!Subtarget->useRetpolineIndirectCalls()">; +def UseIndirectThunkCalls : Predicate<"Subtarget->useIndirectThunkCalls()">; +def NotUseIndirectThunkCalls : Predicate<"!Subtarget->useIndirectThunkCalls()">; //===----------------------------------------------------------------------===// // X86 Instruction Format Definitions. diff --git a/llvm/lib/Target/X86/X86LoadValueInjectionLoadHardening.cpp b/llvm/lib/Target/X86/X86LoadValueInjectionLoadHardening.cpp new file mode 100644 index 00000000000000..50f8b3477acce8 --- /dev/null +++ b/llvm/lib/Target/X86/X86LoadValueInjectionLoadHardening.cpp @@ -0,0 +1,824 @@ +//==-- X86LoadValueInjectionLoadHardening.cpp - LVI load hardening for x86 --=// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// +/// Description: This pass finds Load Value Injection (LVI) gadgets consisting +/// of a load from memory (i.e., SOURCE), and any operation that may transmit +/// the value loaded from memory over a covert channel, or use the value loaded +/// from memory to determine a branch/call target (i.e., SINK). After finding +/// all such gadgets in a given function, the pass minimally inserts LFENCE +/// instructions in such a manner that the following property is satisfied: for +/// all SOURCE+SINK pairs, all paths in the CFG from SOURCE to SINK contain at +/// least one LFENCE instruction. The algorithm that implements this minimal +/// insertion is influenced by an academic paper that minimally inserts memory +/// fences for high-performance concurrent programs: +/// http://www.cs.ucr.edu/~lesani/companion/oopsla15/OOPSLA15.pdf +/// The algorithm implemented in this pass is as follows: +/// 1. Build a condensed CFG (i.e., a GadgetGraph) consisting only of the +/// following components: +/// - SOURCE instructions (also includes function arguments) +/// - SINK instructions +/// - Basic block entry points +/// - Basic block terminators +/// - LFENCE instructions +/// 2. Analyze the GadgetGraph to determine which SOURCE+SINK pairs (i.e., +/// gadgets) are already mitigated by existing LFENCEs. If all gadgets have been +/// mitigated, go to step 6. +/// 3. Use a heuristic or plugin to approximate minimal LFENCE insertion. +/// 4. Insert one LFENCE along each CFG edge that was cut in step 3. +/// 5. Go to step 2. +/// 6. If any LFENCEs were inserted, return `true` from runOnMachineFunction() +/// to tell LLVM that the function was modified. +/// +//===----------------------------------------------------------------------===// + +#include "ImmutableGraph.h" +#include "X86.h" +#include "X86Subtarget.h" +#include "X86TargetMachine.h" +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/DenseSet.h" +#include "llvm/ADT/SmallSet.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/CodeGen/MachineBasicBlock.h" +#include "llvm/CodeGen/MachineDominanceFrontier.h" +#include "llvm/CodeGen/MachineDominators.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstr.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineLoopInfo.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/RDFGraph.h" +#include "llvm/CodeGen/RDFLiveness.h" +#include "llvm/InitializePasses.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/DOTGraphTraits.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/DynamicLibrary.h" +#include "llvm/Support/GraphWriter.h" +#include "llvm/Support/raw_ostream.h" + +using namespace llvm; + +#define PASS_KEY "x86-lvi-load" +#define DEBUG_TYPE PASS_KEY + +STATISTIC(NumFences, "Number of LFENCEs inserted for LVI mitigation"); +STATISTIC(NumFunctionsConsidered, "Number of functions analyzed"); +STATISTIC(NumFunctionsMitigated, "Number of functions for which mitigations " + "were deployed"); +STATISTIC(NumGadgets, "Number of LVI gadgets detected during analysis"); + +static cl::opt OptimizePluginPath( + PASS_KEY "-opt-plugin", + cl::desc("Specify a plugin to optimize LFENCE insertion"), cl::Hidden); + +static cl::opt NoConditionalBranches( + PASS_KEY "-no-cbranch", + cl::desc("Don't treat conditional branches as disclosure gadgets. This " + "may improve performance, at the cost of security."), + cl::init(false), cl::Hidden); + +static cl::opt EmitDot( + PASS_KEY "-dot", + cl::desc( + "For each function, emit a dot graph depicting potential LVI gadgets"), + cl::init(false), cl::Hidden); + +static cl::opt EmitDotOnly( + PASS_KEY "-dot-only", + cl::desc("For each function, emit a dot graph depicting potential LVI " + "gadgets, and do not insert any fences"), + cl::init(false), cl::Hidden); + +static cl::opt EmitDotVerify( + PASS_KEY "-dot-verify", + cl::desc("For each function, emit a dot graph to stdout depicting " + "potential LVI gadgets, used for testing purposes only"), + cl::init(false), cl::Hidden); + +static llvm::sys::DynamicLibrary OptimizeDL; +typedef int (*OptimizeCutT)(unsigned int *nodes, unsigned int nodes_size, + unsigned int *edges, int *edge_values, + int *cut_edges /* out */, unsigned int edges_size); +static OptimizeCutT OptimizeCut = nullptr; + +namespace { + +struct MachineGadgetGraph : ImmutableGraph { + static constexpr int GadgetEdgeSentinel = -1; + static constexpr MachineInstr *const ArgNodeSentinel = nullptr; + + using GraphT = ImmutableGraph; + using Node = typename GraphT::Node; + using Edge = typename GraphT::Edge; + using size_type = typename GraphT::size_type; + MachineGadgetGraph(std::unique_ptr Nodes, + std::unique_ptr Edges, size_type NodesSize, + size_type EdgesSize, int NumFences = 0, int NumGadgets = 0) + : GraphT(std::move(Nodes), std::move(Edges), NodesSize, EdgesSize), + NumFences(NumFences), NumGadgets(NumGadgets) {} + static inline bool isCFGEdge(const Edge &E) { + return E.getValue() != GadgetEdgeSentinel; + } + static inline bool isGadgetEdge(const Edge &E) { + return E.getValue() == GadgetEdgeSentinel; + } + int NumFences; + int NumGadgets; +}; + +class X86LoadValueInjectionLoadHardeningPass : public MachineFunctionPass { +public: + X86LoadValueInjectionLoadHardeningPass() : MachineFunctionPass(ID) {} + + StringRef getPassName() const override { + return "X86 Load Value Injection (LVI) Load Hardening"; + } + void getAnalysisUsage(AnalysisUsage &AU) const override; + bool runOnMachineFunction(MachineFunction &MF) override; + + static char ID; + +private: + using GraphBuilder = ImmutableGraphBuilder; + using EdgeSet = MachineGadgetGraph::EdgeSet; + using NodeSet = MachineGadgetGraph::NodeSet; + using Gadget = std::pair; + + const X86Subtarget *STI; + const TargetInstrInfo *TII; + const TargetRegisterInfo *TRI; + + std::unique_ptr + getGadgetGraph(MachineFunction &MF, const MachineLoopInfo &MLI, + const MachineDominatorTree &MDT, + const MachineDominanceFrontier &MDF) const; + int hardenLoadsWithPlugin(MachineFunction &MF, + std::unique_ptr Graph) const; + int hardenLoadsWithGreedyHeuristic( + MachineFunction &MF, std::unique_ptr Graph) const; + int elimMitigatedEdgesAndNodes(MachineGadgetGraph &G, + EdgeSet &ElimEdges /* in, out */, + NodeSet &ElimNodes /* in, out */) const; + std::unique_ptr + trimMitigatedEdges(std::unique_ptr Graph) const; + void findAndCutEdges(MachineGadgetGraph &G, + EdgeSet &CutEdges /* out */) const; + int insertFences(MachineFunction &MF, MachineGadgetGraph &G, + EdgeSet &CutEdges /* in, out */) const; + bool instrUsesRegToAccessMemory(const MachineInstr &I, unsigned Reg) const; + bool instrUsesRegToBranch(const MachineInstr &I, unsigned Reg) const; + inline bool isFence(const MachineInstr *MI) const { + return MI && (MI->getOpcode() == X86::LFENCE || + (STI->useLVIControlFlowIntegrity() && MI->isCall())); + } +}; + +} // end anonymous namespace + +namespace llvm { + +template <> +struct GraphTraits + : GraphTraits *> {}; + +template <> +struct DOTGraphTraits : DefaultDOTGraphTraits { + using GraphType = MachineGadgetGraph; + using Traits = llvm::GraphTraits; + using NodeRef = typename Traits::NodeRef; + using EdgeRef = typename Traits::EdgeRef; + using ChildIteratorType = typename Traits::ChildIteratorType; + using ChildEdgeIteratorType = typename Traits::ChildEdgeIteratorType; + + DOTGraphTraits(bool isSimple = false) : DefaultDOTGraphTraits(isSimple) {} + + std::string getNodeLabel(NodeRef Node, GraphType *) { + if (Node->getValue() == MachineGadgetGraph::ArgNodeSentinel) + return "ARGS"; + + std::string Str; + raw_string_ostream OS(Str); + OS << *Node->getValue(); + return OS.str(); + } + + static std::string getNodeAttributes(NodeRef Node, GraphType *) { + MachineInstr *MI = Node->getValue(); + if (MI == MachineGadgetGraph::ArgNodeSentinel) + return "color = blue"; + if (MI->getOpcode() == X86::LFENCE) + return "color = green"; + return ""; + } + + static std::string getEdgeAttributes(NodeRef, ChildIteratorType E, + GraphType *) { + int EdgeVal = (*E.getCurrent()).getValue(); + return EdgeVal >= 0 ? "label = " + std::to_string(EdgeVal) + : "color = red, style = \"dashed\""; + } +}; + +} // end namespace llvm + +constexpr MachineInstr *MachineGadgetGraph::ArgNodeSentinel; +constexpr int MachineGadgetGraph::GadgetEdgeSentinel; + +char X86LoadValueInjectionLoadHardeningPass::ID = 0; + +void X86LoadValueInjectionLoadHardeningPass::getAnalysisUsage( + AnalysisUsage &AU) const { + MachineFunctionPass::getAnalysisUsage(AU); + AU.addRequired(); + AU.addRequired(); + AU.addRequired(); + AU.setPreservesCFG(); +} + +static void WriteGadgetGraph(raw_ostream &OS, MachineFunction &MF, + MachineGadgetGraph *G) { + WriteGraph(OS, G, /*ShortNames*/ false, + "Speculative gadgets for \"" + MF.getName() + "\" function"); +} + +bool X86LoadValueInjectionLoadHardeningPass::runOnMachineFunction( + MachineFunction &MF) { + LLVM_DEBUG(dbgs() << "***** " << getPassName() << " : " << MF.getName() + << " *****\n"); + STI = &MF.getSubtarget(); + if (!STI->useLVILoadHardening()) + return false; + + // FIXME: support 32-bit + if (!STI->is64Bit()) + report_fatal_error("LVI load hardening is only supported on 64-bit", false); + + // Don't skip functions with the "optnone" attr but participate in opt-bisect. + const Function &F = MF.getFunction(); + if (!F.hasOptNone() && skipFunction(F)) + return false; + + ++NumFunctionsConsidered; + TII = STI->getInstrInfo(); + TRI = STI->getRegisterInfo(); + LLVM_DEBUG(dbgs() << "Building gadget graph...\n"); + const auto &MLI = getAnalysis(); + const auto &MDT = getAnalysis(); + const auto &MDF = getAnalysis(); + std::unique_ptr Graph = getGadgetGraph(MF, MLI, MDT, MDF); + LLVM_DEBUG(dbgs() << "Building gadget graph... Done\n"); + if (Graph == nullptr) + return false; // didn't find any gadgets + + if (EmitDotVerify) { + WriteGadgetGraph(outs(), MF, Graph.get()); + return false; + } + + if (EmitDot || EmitDotOnly) { + LLVM_DEBUG(dbgs() << "Emitting gadget graph...\n"); + std::error_code FileError; + std::string FileName = "lvi."; + FileName += MF.getName(); + FileName += ".dot"; + raw_fd_ostream FileOut(FileName, FileError); + if (FileError) + errs() << FileError.message(); + WriteGadgetGraph(FileOut, MF, Graph.get()); + FileOut.close(); + LLVM_DEBUG(dbgs() << "Emitting gadget graph... Done\n"); + if (EmitDotOnly) + return false; + } + + int FencesInserted; + if (!OptimizePluginPath.empty()) { + if (!OptimizeDL.isValid()) { + std::string ErrorMsg; + OptimizeDL = llvm::sys::DynamicLibrary::getPermanentLibrary( + OptimizePluginPath.c_str(), &ErrorMsg); + if (!ErrorMsg.empty()) + report_fatal_error("Failed to load opt plugin: \"" + ErrorMsg + '\"'); + OptimizeCut = (OptimizeCutT)OptimizeDL.getAddressOfSymbol("optimize_cut"); + if (!OptimizeCut) + report_fatal_error("Invalid optimization plugin"); + } + FencesInserted = hardenLoadsWithPlugin(MF, std::move(Graph)); + } else { // Use the default greedy heuristic + FencesInserted = hardenLoadsWithGreedyHeuristic(MF, std::move(Graph)); + } + + if (FencesInserted > 0) + ++NumFunctionsMitigated; + NumFences += FencesInserted; + return (FencesInserted > 0); +} + +std::unique_ptr +X86LoadValueInjectionLoadHardeningPass::getGadgetGraph( + MachineFunction &MF, const MachineLoopInfo &MLI, + const MachineDominatorTree &MDT, + const MachineDominanceFrontier &MDF) const { + using namespace rdf; + + // Build the Register Dataflow Graph using the RDF framework + TargetOperandInfo TOI{*TII}; + DataFlowGraph DFG{MF, *TII, *TRI, MDT, MDF, TOI}; + DFG.build(); + Liveness L{MF.getRegInfo(), DFG}; + L.computePhiInfo(); + + GraphBuilder Builder; + using GraphIter = typename GraphBuilder::BuilderNodeRef; + DenseMap NodeMap; + int FenceCount = 0, GadgetCount = 0; + auto MaybeAddNode = [&NodeMap, &Builder](MachineInstr *MI) { + auto Ref = NodeMap.find(MI); + if (Ref == NodeMap.end()) { + auto I = Builder.addVertex(MI); + NodeMap[MI] = I; + return std::pair{I, true}; + } + return std::pair{Ref->getSecond(), false}; + }; + + // The `Transmitters` map memoizes transmitters found for each def. If a def + // has not yet been analyzed, then it will not appear in the map. If a def + // has been analyzed and was determined not to have any transmitters, then + // its list of transmitters will be empty. + DenseMap> Transmitters; + + // Analyze all machine instructions to find gadgets and LFENCEs, adding + // each interesting value to `Nodes` + auto AnalyzeDef = [&](NodeAddr SourceDef) { + SmallSet UsesVisited, DefsVisited; + std::function)> AnalyzeDefUseChain = + [&](NodeAddr Def) { + if (Transmitters.find(Def.Id) != Transmitters.end()) + return; // Already analyzed `Def` + + // Use RDF to find all the uses of `Def` + rdf::NodeSet Uses; + RegisterRef DefReg = DFG.getPRI().normalize(Def.Addr->getRegRef(DFG)); + for (auto UseID : L.getAllReachedUses(DefReg, Def)) { + auto Use = DFG.addr(UseID); + if (Use.Addr->getFlags() & NodeAttrs::PhiRef) { // phi node + NodeAddr Phi = Use.Addr->getOwner(DFG); + for (auto I : L.getRealUses(Phi.Id)) { + if (DFG.getPRI().alias(RegisterRef(I.first), DefReg)) { + for (auto UA : I.second) + Uses.emplace(UA.first); + } + } + } else { // not a phi node + Uses.emplace(UseID); + } + } + + // For each use of `Def`, we want to know whether: + // (1) The use can leak the Def'ed value, + // (2) The use can further propagate the Def'ed value to more defs + for (auto UseID : Uses) { + if (!UsesVisited.insert(UseID).second) + continue; // Already visited this use of `Def` + + auto Use = DFG.addr(UseID); + assert(!(Use.Addr->getFlags() & NodeAttrs::PhiRef)); + MachineOperand &UseMO = Use.Addr->getOp(); + MachineInstr &UseMI = *UseMO.getParent(); + assert(UseMO.isReg()); + + // We naively assume that an instruction propagates any loaded + // uses to all defs unless the instruction is a call, in which + // case all arguments will be treated as gadget sources during + // analysis of the callee function. + if (UseMI.isCall()) + continue; + + // Check whether this use can transmit (leak) its value. + if (instrUsesRegToAccessMemory(UseMI, UseMO.getReg()) || + (!NoConditionalBranches && + instrUsesRegToBranch(UseMI, UseMO.getReg()))) { + Transmitters[Def.Id].push_back(Use.Addr->getOwner(DFG).Id); + if (UseMI.mayLoad()) + continue; // Found a transmitting load -- no need to continue + // traversing its defs (i.e., this load will become + // a new gadget source anyways). + } + + // Check whether the use propagates to more defs. + NodeAddr Owner{Use.Addr->getOwner(DFG)}; + rdf::NodeList AnalyzedChildDefs; + for (auto &ChildDef : + Owner.Addr->members_if(DataFlowGraph::IsDef, DFG)) { + if (!DefsVisited.insert(ChildDef.Id).second) + continue; // Already visited this def + if (Def.Addr->getAttrs() & NodeAttrs::Dead) + continue; + if (Def.Id == ChildDef.Id) + continue; // `Def` uses itself (e.g., increment loop counter) + + AnalyzeDefUseChain(ChildDef); + + // `Def` inherits all of its child defs' transmitters. + for (auto TransmitterId : Transmitters[ChildDef.Id]) + Transmitters[Def.Id].push_back(TransmitterId); + } + } + + // Note that this statement adds `Def.Id` to the map if no + // transmitters were found for `Def`. + auto &DefTransmitters = Transmitters[Def.Id]; + + // Remove duplicate transmitters + llvm::sort(DefTransmitters); + DefTransmitters.erase( + std::unique(DefTransmitters.begin(), DefTransmitters.end()), + DefTransmitters.end()); + }; + + // Find all of the transmitters + AnalyzeDefUseChain(SourceDef); + auto &SourceDefTransmitters = Transmitters[SourceDef.Id]; + if (SourceDefTransmitters.empty()) + return; // No transmitters for `SourceDef` + + MachineInstr *Source = SourceDef.Addr->getFlags() & NodeAttrs::PhiRef + ? MachineGadgetGraph::ArgNodeSentinel + : SourceDef.Addr->getOp().getParent(); + auto GadgetSource = MaybeAddNode(Source); + // Each transmitter is a sink for `SourceDef`. + for (auto TransmitterId : SourceDefTransmitters) { + MachineInstr *Sink = DFG.addr(TransmitterId).Addr->getCode(); + auto GadgetSink = MaybeAddNode(Sink); + // Add the gadget edge to the graph. + Builder.addEdge(MachineGadgetGraph::GadgetEdgeSentinel, + GadgetSource.first, GadgetSink.first); + ++GadgetCount; + } + }; + + LLVM_DEBUG(dbgs() << "Analyzing def-use chains to find gadgets\n"); + // Analyze function arguments + NodeAddr EntryBlock = DFG.getFunc().Addr->getEntryBlock(DFG); + for (NodeAddr ArgPhi : + EntryBlock.Addr->members_if(DataFlowGraph::IsPhi, DFG)) { + NodeList Defs = ArgPhi.Addr->members_if(DataFlowGraph::IsDef, DFG); + llvm::for_each(Defs, AnalyzeDef); + } + // Analyze every instruction in MF + for (NodeAddr BA : DFG.getFunc().Addr->members(DFG)) { + for (NodeAddr SA : + BA.Addr->members_if(DataFlowGraph::IsCode, DFG)) { + MachineInstr *MI = SA.Addr->getCode(); + if (isFence(MI)) { + MaybeAddNode(MI); + ++FenceCount; + } else if (MI->mayLoad()) { + NodeList Defs = SA.Addr->members_if(DataFlowGraph::IsDef, DFG); + llvm::for_each(Defs, AnalyzeDef); + } + } + } + LLVM_DEBUG(dbgs() << "Found " << FenceCount << " fences\n"); + LLVM_DEBUG(dbgs() << "Found " << GadgetCount << " gadgets\n"); + if (GadgetCount == 0) + return nullptr; + NumGadgets += GadgetCount; + + // Traverse CFG to build the rest of the graph + SmallSet BlocksVisited; + std::function TraverseCFG = + [&](MachineBasicBlock *MBB, GraphIter GI, unsigned ParentDepth) { + unsigned LoopDepth = MLI.getLoopDepth(MBB); + if (!MBB->empty()) { + // Always add the first instruction in each block + auto NI = MBB->begin(); + auto BeginBB = MaybeAddNode(&*NI); + Builder.addEdge(ParentDepth, GI, BeginBB.first); + if (!BlocksVisited.insert(MBB).second) + return; + + // Add any instructions within the block that are gadget components + GI = BeginBB.first; + while (++NI != MBB->end()) { + auto Ref = NodeMap.find(&*NI); + if (Ref != NodeMap.end()) { + Builder.addEdge(LoopDepth, GI, Ref->getSecond()); + GI = Ref->getSecond(); + } + } + + // Always add the terminator instruction, if one exists + auto T = MBB->getFirstTerminator(); + if (T != MBB->end()) { + auto EndBB = MaybeAddNode(&*T); + if (EndBB.second) + Builder.addEdge(LoopDepth, GI, EndBB.first); + GI = EndBB.first; + } + } + for (MachineBasicBlock *Succ : MBB->successors()) + TraverseCFG(Succ, GI, LoopDepth); + }; + // ArgNodeSentinel is a pseudo-instruction that represents MF args in the + // GadgetGraph + GraphIter ArgNode = MaybeAddNode(MachineGadgetGraph::ArgNodeSentinel).first; + TraverseCFG(&MF.front(), ArgNode, 0); + std::unique_ptr G{Builder.get(FenceCount, GadgetCount)}; + LLVM_DEBUG(dbgs() << "Found " << G->nodes_size() << " nodes\n"); + return G; +} + +// Returns the number of remaining gadget edges that could not be eliminated +int X86LoadValueInjectionLoadHardeningPass::elimMitigatedEdgesAndNodes( + MachineGadgetGraph &G, MachineGadgetGraph::EdgeSet &ElimEdges /* in, out */, + MachineGadgetGraph::NodeSet &ElimNodes /* in, out */) const { + if (G.NumFences > 0) { + // Eliminate fences and CFG edges that ingress and egress the fence, as + // they are trivially mitigated. + for (const auto &E : G.edges()) { + const MachineGadgetGraph::Node *Dest = E.getDest(); + if (isFence(Dest->getValue())) { + ElimNodes.insert(*Dest); + ElimEdges.insert(E); + for (const auto &DE : Dest->edges()) + ElimEdges.insert(DE); + } + } + } + + // Find and eliminate gadget edges that have been mitigated. + int MitigatedGadgets = 0, RemainingGadgets = 0; + MachineGadgetGraph::NodeSet ReachableNodes{G}; + for (const auto &RootN : G.nodes()) { + if (llvm::none_of(RootN.edges(), MachineGadgetGraph::isGadgetEdge)) + continue; // skip this node if it isn't a gadget source + + // Find all of the nodes that are CFG-reachable from RootN using DFS + ReachableNodes.clear(); + std::function + FindReachableNodes = + [&](const MachineGadgetGraph::Node *N, bool FirstNode) { + if (!FirstNode) + ReachableNodes.insert(*N); + for (const auto &E : N->edges()) { + const MachineGadgetGraph::Node *Dest = E.getDest(); + if (MachineGadgetGraph::isCFGEdge(E) && + !ElimEdges.contains(E) && !ReachableNodes.contains(*Dest)) + FindReachableNodes(Dest, false); + } + }; + FindReachableNodes(&RootN, true); + + // Any gadget whose sink is unreachable has been mitigated + for (const auto &E : RootN.edges()) { + if (MachineGadgetGraph::isGadgetEdge(E)) { + if (ReachableNodes.contains(*E.getDest())) { + // This gadget's sink is reachable + ++RemainingGadgets; + } else { // This gadget's sink is unreachable, and therefore mitigated + ++MitigatedGadgets; + ElimEdges.insert(E); + } + } + } + } + return RemainingGadgets; +} + +std::unique_ptr +X86LoadValueInjectionLoadHardeningPass::trimMitigatedEdges( + std::unique_ptr Graph) const { + MachineGadgetGraph::NodeSet ElimNodes{*Graph}; + MachineGadgetGraph::EdgeSet ElimEdges{*Graph}; + int RemainingGadgets = + elimMitigatedEdgesAndNodes(*Graph, ElimEdges, ElimNodes); + if (ElimEdges.empty() && ElimNodes.empty()) { + Graph->NumFences = 0; + Graph->NumGadgets = RemainingGadgets; + } else { + Graph = GraphBuilder::trim(*Graph, ElimNodes, ElimEdges, 0 /* NumFences */, + RemainingGadgets); + } + return Graph; +} + +int X86LoadValueInjectionLoadHardeningPass::hardenLoadsWithPlugin( + MachineFunction &MF, std::unique_ptr Graph) const { + int FencesInserted = 0; + + do { + LLVM_DEBUG(dbgs() << "Eliminating mitigated paths...\n"); + Graph = trimMitigatedEdges(std::move(Graph)); + LLVM_DEBUG(dbgs() << "Eliminating mitigated paths... Done\n"); + if (Graph->NumGadgets == 0) + break; + + LLVM_DEBUG(dbgs() << "Cutting edges...\n"); + EdgeSet CutEdges{*Graph}; + auto Nodes = std::make_unique(Graph->nodes_size() + + 1 /* terminator node */); + auto Edges = std::make_unique(Graph->edges_size()); + auto EdgeCuts = std::make_unique(Graph->edges_size()); + auto EdgeValues = std::make_unique(Graph->edges_size()); + for (const auto &N : Graph->nodes()) { + Nodes[Graph->getNodeIndex(N)] = Graph->getEdgeIndex(*N.edges_begin()); + } + Nodes[Graph->nodes_size()] = Graph->edges_size(); // terminator node + for (const auto &E : Graph->edges()) { + Edges[Graph->getEdgeIndex(E)] = Graph->getNodeIndex(*E.getDest()); + EdgeValues[Graph->getEdgeIndex(E)] = E.getValue(); + } + OptimizeCut(Nodes.get(), Graph->nodes_size(), Edges.get(), EdgeValues.get(), + EdgeCuts.get(), Graph->edges_size()); + for (int I = 0; I < Graph->edges_size(); ++I) + if (EdgeCuts[I]) + CutEdges.set(I); + LLVM_DEBUG(dbgs() << "Cutting edges... Done\n"); + LLVM_DEBUG(dbgs() << "Cut " << CutEdges.count() << " edges\n"); + + LLVM_DEBUG(dbgs() << "Inserting LFENCEs...\n"); + FencesInserted += insertFences(MF, *Graph, CutEdges); + LLVM_DEBUG(dbgs() << "Inserting LFENCEs... Done\n"); + LLVM_DEBUG(dbgs() << "Inserted " << FencesInserted << " fences\n"); + + Graph = GraphBuilder::trim(*Graph, MachineGadgetGraph::NodeSet{*Graph}, + CutEdges); + } while (true); + + return FencesInserted; +} + +int X86LoadValueInjectionLoadHardeningPass::hardenLoadsWithGreedyHeuristic( + MachineFunction &MF, std::unique_ptr Graph) const { + LLVM_DEBUG(dbgs() << "Eliminating mitigated paths...\n"); + Graph = trimMitigatedEdges(std::move(Graph)); + LLVM_DEBUG(dbgs() << "Eliminating mitigated paths... Done\n"); + if (Graph->NumGadgets == 0) + return 0; + + LLVM_DEBUG(dbgs() << "Cutting edges...\n"); + MachineGadgetGraph::NodeSet ElimNodes{*Graph}, GadgetSinks{*Graph}; + MachineGadgetGraph::EdgeSet ElimEdges{*Graph}, CutEdges{*Graph}; + auto IsCFGEdge = [&ElimEdges, &CutEdges](const MachineGadgetGraph::Edge &E) { + return !ElimEdges.contains(E) && !CutEdges.contains(E) && + MachineGadgetGraph::isCFGEdge(E); + }; + auto IsGadgetEdge = [&ElimEdges, + &CutEdges](const MachineGadgetGraph::Edge &E) { + return !ElimEdges.contains(E) && !CutEdges.contains(E) && + MachineGadgetGraph::isGadgetEdge(E); + }; + + // FIXME: this is O(E^2), we could probably do better. + do { + // Find the cheapest CFG edge that will eliminate a gadget (by being + // egress from a SOURCE node or ingress to a SINK node), and cut it. + const MachineGadgetGraph::Edge *CheapestSoFar = nullptr; + + // First, collect all gadget source and sink nodes. + MachineGadgetGraph::NodeSet GadgetSources{*Graph}, GadgetSinks{*Graph}; + for (const auto &N : Graph->nodes()) { + if (ElimNodes.contains(N)) + continue; + for (const auto &E : N.edges()) { + if (IsGadgetEdge(E)) { + GadgetSources.insert(N); + GadgetSinks.insert(*E.getDest()); + } + } + } + + // Next, look for the cheapest CFG edge which, when cut, is guaranteed to + // mitigate at least one gadget by either: + // (a) being egress from a gadget source, or + // (b) being ingress to a gadget sink. + for (const auto &N : Graph->nodes()) { + if (ElimNodes.contains(N)) + continue; + for (const auto &E : N.edges()) { + if (IsCFGEdge(E)) { + if (GadgetSources.contains(N) || GadgetSinks.contains(*E.getDest())) { + if (!CheapestSoFar || E.getValue() < CheapestSoFar->getValue()) + CheapestSoFar = &E; + } + } + } + } + + assert(CheapestSoFar && "Failed to cut an edge"); + CutEdges.insert(*CheapestSoFar); + ElimEdges.insert(*CheapestSoFar); + } while (elimMitigatedEdgesAndNodes(*Graph, ElimEdges, ElimNodes)); + LLVM_DEBUG(dbgs() << "Cutting edges... Done\n"); + LLVM_DEBUG(dbgs() << "Cut " << CutEdges.count() << " edges\n"); + + LLVM_DEBUG(dbgs() << "Inserting LFENCEs...\n"); + int FencesInserted = insertFences(MF, *Graph, CutEdges); + LLVM_DEBUG(dbgs() << "Inserting LFENCEs... Done\n"); + LLVM_DEBUG(dbgs() << "Inserted " << FencesInserted << " fences\n"); + + return FencesInserted; +} + +int X86LoadValueInjectionLoadHardeningPass::insertFences( + MachineFunction &MF, MachineGadgetGraph &G, + EdgeSet &CutEdges /* in, out */) const { + int FencesInserted = 0; + for (const auto &N : G.nodes()) { + for (const auto &E : N.edges()) { + if (CutEdges.contains(E)) { + MachineInstr *MI = N.getValue(), *Prev; + MachineBasicBlock *MBB; // Insert an LFENCE in this MBB + MachineBasicBlock::iterator InsertionPt; // ...at this point + if (MI == MachineGadgetGraph::ArgNodeSentinel) { + // insert LFENCE at beginning of entry block + MBB = &MF.front(); + InsertionPt = MBB->begin(); + Prev = nullptr; + } else if (MI->isBranch()) { // insert the LFENCE before the branch + MBB = MI->getParent(); + InsertionPt = MI; + Prev = MI->getPrevNode(); + // Remove all egress CFG edges from this branch because the inserted + // LFENCE prevents gadgets from crossing the branch. + for (const auto &E : N.edges()) { + if (MachineGadgetGraph::isCFGEdge(E)) + CutEdges.insert(E); + } + } else { // insert the LFENCE after the instruction + MBB = MI->getParent(); + InsertionPt = MI->getNextNode() ? MI->getNextNode() : MBB->end(); + Prev = InsertionPt == MBB->end() + ? (MBB->empty() ? nullptr : &MBB->back()) + : InsertionPt->getPrevNode(); + } + // Ensure this insertion is not redundant (two LFENCEs in sequence). + if ((InsertionPt == MBB->end() || !isFence(&*InsertionPt)) && + (!Prev || !isFence(Prev))) { + BuildMI(*MBB, InsertionPt, DebugLoc(), TII->get(X86::LFENCE)); + ++FencesInserted; + } + } + } + } + return FencesInserted; +} + +bool X86LoadValueInjectionLoadHardeningPass::instrUsesRegToAccessMemory( + const MachineInstr &MI, unsigned Reg) const { + if (!MI.mayLoadOrStore() || MI.getOpcode() == X86::MFENCE || + MI.getOpcode() == X86::SFENCE || MI.getOpcode() == X86::LFENCE) + return false; + + // FIXME: This does not handle pseudo loading instruction like TCRETURN* + const MCInstrDesc &Desc = MI.getDesc(); + int MemRefBeginIdx = X86II::getMemoryOperandNo(Desc.TSFlags); + if (MemRefBeginIdx < 0) { + LLVM_DEBUG(dbgs() << "Warning: unable to obtain memory operand for loading " + "instruction:\n"; + MI.print(dbgs()); dbgs() << '\n';); + return false; + } + MemRefBeginIdx += X86II::getOperandBias(Desc); + + const MachineOperand &BaseMO = + MI.getOperand(MemRefBeginIdx + X86::AddrBaseReg); + const MachineOperand &IndexMO = + MI.getOperand(MemRefBeginIdx + X86::AddrIndexReg); + return (BaseMO.isReg() && BaseMO.getReg() != X86::NoRegister && + TRI->regsOverlap(BaseMO.getReg(), Reg)) || + (IndexMO.isReg() && IndexMO.getReg() != X86::NoRegister && + TRI->regsOverlap(IndexMO.getReg(), Reg)); +} + +bool X86LoadValueInjectionLoadHardeningPass::instrUsesRegToBranch( + const MachineInstr &MI, unsigned Reg) const { + if (!MI.isConditionalBranch()) + return false; + for (const MachineOperand &Use : MI.uses()) + if (Use.isReg() && Use.getReg() == Reg) + return true; + return false; +} + +INITIALIZE_PASS_BEGIN(X86LoadValueInjectionLoadHardeningPass, PASS_KEY, + "X86 LVI load hardening", false, false) +INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo) +INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree) +INITIALIZE_PASS_DEPENDENCY(MachineDominanceFrontier) +INITIALIZE_PASS_END(X86LoadValueInjectionLoadHardeningPass, PASS_KEY, + "X86 LVI load hardening", false, false) + +FunctionPass *llvm::createX86LoadValueInjectionLoadHardeningPass() { + return new X86LoadValueInjectionLoadHardeningPass(); +} diff --git a/llvm/lib/Target/X86/X86LoadValueInjectionRetHardening.cpp b/llvm/lib/Target/X86/X86LoadValueInjectionRetHardening.cpp new file mode 100644 index 00000000000000..9c36e86099f999 --- /dev/null +++ b/llvm/lib/Target/X86/X86LoadValueInjectionRetHardening.cpp @@ -0,0 +1,140 @@ +//===-- X86LoadValueInjectionRetHardening.cpp - LVI RET hardening for x86 --==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// +/// Description: Replaces every `ret` instruction with the sequence: +/// ``` +/// pop +/// lfence +/// jmp * +/// ``` +/// where `` is some available scratch register, according to the +/// calling convention of the function being mitigated. +/// +//===----------------------------------------------------------------------===// + +#include "X86.h" +#include "X86InstrBuilder.h" +#include "X86Subtarget.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/CodeGen/MachineBasicBlock.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/IR/Function.h" +#include "llvm/Support/Debug.h" +#include + +using namespace llvm; + +#define PASS_KEY "x86-lvi-ret" +#define DEBUG_TYPE PASS_KEY + +STATISTIC(NumFences, "Number of LFENCEs inserted for LVI mitigation"); +STATISTIC(NumFunctionsConsidered, "Number of functions analyzed"); +STATISTIC(NumFunctionsMitigated, "Number of functions for which mitigations " + "were deployed"); + +namespace { + +class X86LoadValueInjectionRetHardeningPass : public MachineFunctionPass { +public: + X86LoadValueInjectionRetHardeningPass() : MachineFunctionPass(ID) {} + StringRef getPassName() const override { + return "X86 Load Value Injection (LVI) Ret-Hardening"; + } + bool runOnMachineFunction(MachineFunction &MF) override; + + static char ID; +}; + +} // end anonymous namespace + +char X86LoadValueInjectionRetHardeningPass::ID = 0; + +bool X86LoadValueInjectionRetHardeningPass::runOnMachineFunction( + MachineFunction &MF) { + LLVM_DEBUG(dbgs() << "***** " << getPassName() << " : " << MF.getName() + << " *****\n"); + const X86Subtarget *Subtarget = &MF.getSubtarget(); + if (!Subtarget->useLVIControlFlowIntegrity() || !Subtarget->is64Bit()) + return false; // FIXME: support 32-bit + + // Don't skip functions with the "optnone" attr but participate in opt-bisect. + const Function &F = MF.getFunction(); + if (!F.hasOptNone() && skipFunction(F)) + return false; + + ++NumFunctionsConsidered; + const X86RegisterInfo *TRI = Subtarget->getRegisterInfo(); + const X86InstrInfo *TII = Subtarget->getInstrInfo(); + unsigned ClobberReg = X86::NoRegister; + std::bitset UnclobberableGR64s; + UnclobberableGR64s.set(X86::RSP); // can't clobber stack pointer + UnclobberableGR64s.set(X86::RIP); // can't clobber instruction pointer + UnclobberableGR64s.set(X86::RAX); // used for function return + UnclobberableGR64s.set(X86::RDX); // used for function return + + // We can clobber any register allowed by the function's calling convention. + for (const MCPhysReg *PR = TRI->getCalleeSavedRegs(&MF); auto Reg = *PR; ++PR) + UnclobberableGR64s.set(Reg); + for (auto &Reg : X86::GR64RegClass) { + if (!UnclobberableGR64s.test(Reg)) { + ClobberReg = Reg; + break; + } + } + + if (ClobberReg != X86::NoRegister) { + LLVM_DEBUG(dbgs() << "Selected register " + << Subtarget->getRegisterInfo()->getRegAsmName(ClobberReg) + << " to clobber\n"); + } else { + LLVM_DEBUG(dbgs() << "Could not find a register to clobber\n"); + } + + bool Modified = false; + for (auto &MBB : MF) { + MachineInstr &MI = MBB.back(); + if (MI.getOpcode() != X86::RETQ) + continue; + + if (ClobberReg != X86::NoRegister) { + MBB.erase_instr(&MI); + BuildMI(MBB, MBB.end(), DebugLoc(), TII->get(X86::POP64r)) + .addReg(ClobberReg, RegState::Define) + .setMIFlag(MachineInstr::FrameDestroy); + BuildMI(MBB, MBB.end(), DebugLoc(), TII->get(X86::LFENCE)); + BuildMI(MBB, MBB.end(), DebugLoc(), TII->get(X86::JMP64r)) + .addReg(ClobberReg); + } else { + // In case there is no available scratch register, we can still read from + // RSP to assert that RSP points to a valid page. The write to RSP is + // also helpful because it verifies that the stack's write permissions + // are intact. + MachineInstr *Fence = BuildMI(MBB, MI, DebugLoc(), TII->get(X86::LFENCE)); + addRegOffset(BuildMI(MBB, Fence, DebugLoc(), TII->get(X86::SHL64mi)), + X86::RSP, false, 0) + .addImm(0) + ->addRegisterDead(X86::EFLAGS, TRI); + } + + ++NumFences; + Modified = true; + } + + if (Modified) + ++NumFunctionsMitigated; + return Modified; +} + +INITIALIZE_PASS(X86LoadValueInjectionRetHardeningPass, PASS_KEY, + "X86 LVI ret hardener", false, false) + +FunctionPass *llvm::createX86LoadValueInjectionRetHardeningPass() { + return new X86LoadValueInjectionRetHardeningPass(); +} diff --git a/llvm/lib/Target/X86/X86MCInstLower.cpp b/llvm/lib/Target/X86/X86MCInstLower.cpp index 7f49c6e861d4bf..f5caaaae4d8401 100644 --- a/llvm/lib/Target/X86/X86MCInstLower.cpp +++ b/llvm/lib/Target/X86/X86MCInstLower.cpp @@ -1220,8 +1220,8 @@ void X86AsmPrinter::LowerSTATEPOINT(const MachineInstr &MI, break; case MachineOperand::MO_Register: // FIXME: Add retpoline support and remove this. - if (Subtarget->useRetpolineIndirectCalls()) - report_fatal_error("Lowering register statepoints with retpoline not " + if (Subtarget->useIndirectThunkCalls()) + report_fatal_error("Lowering register statepoints with thunks not " "yet implemented."); CallTargetMCOp = MCOperand::createReg(CallTarget.getReg()); CallOpcode = X86::CALL64r; @@ -1399,9 +1399,9 @@ void X86AsmPrinter::LowerPATCHPOINT(const MachineInstr &MI, EmitAndCountInstruction( MCInstBuilder(X86::MOV64ri).addReg(ScratchReg).addOperand(CalleeMCOp)); // FIXME: Add retpoline support and remove this. - if (Subtarget->useRetpolineIndirectCalls()) + if (Subtarget->useIndirectThunkCalls()) report_fatal_error( - "Lowering patchpoint with retpoline not yet implemented."); + "Lowering patchpoint with thunks not yet implemented."); EmitAndCountInstruction(MCInstBuilder(X86::CALL64r).addReg(ScratchReg)); } diff --git a/llvm/lib/Target/X86/X86RetpolineThunks.cpp b/llvm/lib/Target/X86/X86RetpolineThunks.cpp deleted file mode 100644 index 9085d7f068ac4e..00000000000000 --- a/llvm/lib/Target/X86/X86RetpolineThunks.cpp +++ /dev/null @@ -1,286 +0,0 @@ -//======- X86RetpolineThunks.cpp - Construct retpoline thunks for x86 --=====// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -/// \file -/// -/// Pass that injects an MI thunk implementing a "retpoline". This is -/// a RET-implemented trampoline that is used to lower indirect calls in a way -/// that prevents speculation on some x86 processors and can be used to mitigate -/// security vulnerabilities due to targeted speculative execution and side -/// channels such as CVE-2017-5715. -/// -/// TODO(chandlerc): All of this code could use better comments and -/// documentation. -/// -//===----------------------------------------------------------------------===// - -#include "X86.h" -#include "X86InstrBuilder.h" -#include "X86Subtarget.h" -#include "llvm/CodeGen/MachineFunction.h" -#include "llvm/CodeGen/MachineInstrBuilder.h" -#include "llvm/CodeGen/MachineModuleInfo.h" -#include "llvm/CodeGen/Passes.h" -#include "llvm/CodeGen/TargetPassConfig.h" -#include "llvm/IR/IRBuilder.h" -#include "llvm/IR/Instructions.h" -#include "llvm/IR/Module.h" -#include "llvm/Support/CommandLine.h" -#include "llvm/Support/Debug.h" -#include "llvm/Support/raw_ostream.h" - -using namespace llvm; - -#define DEBUG_TYPE "x86-retpoline-thunks" - -static const char ThunkNamePrefix[] = "__llvm_retpoline_"; -static const char R11ThunkName[] = "__llvm_retpoline_r11"; -static const char EAXThunkName[] = "__llvm_retpoline_eax"; -static const char ECXThunkName[] = "__llvm_retpoline_ecx"; -static const char EDXThunkName[] = "__llvm_retpoline_edx"; -static const char EDIThunkName[] = "__llvm_retpoline_edi"; - -namespace { -class X86RetpolineThunks : public MachineFunctionPass { -public: - static char ID; - - X86RetpolineThunks() : MachineFunctionPass(ID) {} - - StringRef getPassName() const override { return "X86 Retpoline Thunks"; } - - bool doInitialization(Module &M) override; - bool runOnMachineFunction(MachineFunction &F) override; - - void getAnalysisUsage(AnalysisUsage &AU) const override { - MachineFunctionPass::getAnalysisUsage(AU); - AU.addRequired(); - AU.addPreserved(); - } - -private: - MachineModuleInfo *MMI = nullptr; - const TargetMachine *TM = nullptr; - bool Is64Bit = false; - const X86Subtarget *STI = nullptr; - const X86InstrInfo *TII = nullptr; - - bool InsertedThunks = false; - - void createThunkFunction(Module &M, StringRef Name); - void insertRegReturnAddrClobber(MachineBasicBlock &MBB, unsigned Reg); - void populateThunk(MachineFunction &MF, unsigned Reg); -}; - -} // end anonymous namespace - -FunctionPass *llvm::createX86RetpolineThunksPass() { - return new X86RetpolineThunks(); -} - -char X86RetpolineThunks::ID = 0; - -bool X86RetpolineThunks::doInitialization(Module &M) { - InsertedThunks = false; - return false; -} - -bool X86RetpolineThunks::runOnMachineFunction(MachineFunction &MF) { - LLVM_DEBUG(dbgs() << getPassName() << '\n'); - - TM = &MF.getTarget();; - STI = &MF.getSubtarget(); - TII = STI->getInstrInfo(); - Is64Bit = TM->getTargetTriple().getArch() == Triple::x86_64; - - MMI = &getAnalysis().getMMI(); - Module &M = const_cast(*MMI->getModule()); - - // If this function is not a thunk, check to see if we need to insert - // a thunk. - if (!MF.getName().startswith(ThunkNamePrefix)) { - // If we've already inserted a thunk, nothing else to do. - if (InsertedThunks) - return false; - - // Only add a thunk if one of the functions has the retpoline feature - // enabled in its subtarget, and doesn't enable external thunks. - // FIXME: Conditionalize on indirect calls so we don't emit a thunk when - // nothing will end up calling it. - // FIXME: It's a little silly to look at every function just to enumerate - // the subtargets, but eventually we'll want to look at them for indirect - // calls, so maybe this is OK. - if ((!STI->useRetpolineIndirectCalls() && - !STI->useRetpolineIndirectBranches()) || - STI->useRetpolineExternalThunk()) - return false; - - // Otherwise, we need to insert the thunk. - // WARNING: This is not really a well behaving thing to do in a function - // pass. We extract the module and insert a new function (and machine - // function) directly into the module. - if (Is64Bit) - createThunkFunction(M, R11ThunkName); - else - for (StringRef Name : - {EAXThunkName, ECXThunkName, EDXThunkName, EDIThunkName}) - createThunkFunction(M, Name); - InsertedThunks = true; - return true; - } - - // If this *is* a thunk function, we need to populate it with the correct MI. - if (Is64Bit) { - assert(MF.getName() == "__llvm_retpoline_r11" && - "Should only have an r11 thunk on 64-bit targets"); - - // __llvm_retpoline_r11: - // callq .Lr11_call_target - // .Lr11_capture_spec: - // pause - // lfence - // jmp .Lr11_capture_spec - // .align 16 - // .Lr11_call_target: - // movq %r11, (%rsp) - // retq - populateThunk(MF, X86::R11); - } else { - // For 32-bit targets we need to emit a collection of thunks for various - // possible scratch registers as well as a fallback that uses EDI, which is - // normally callee saved. - // __llvm_retpoline_eax: - // calll .Leax_call_target - // .Leax_capture_spec: - // pause - // jmp .Leax_capture_spec - // .align 16 - // .Leax_call_target: - // movl %eax, (%esp) # Clobber return addr - // retl - // - // __llvm_retpoline_ecx: - // ... # Same setup - // movl %ecx, (%esp) - // retl - // - // __llvm_retpoline_edx: - // ... # Same setup - // movl %edx, (%esp) - // retl - // - // __llvm_retpoline_edi: - // ... # Same setup - // movl %edi, (%esp) - // retl - if (MF.getName() == EAXThunkName) - populateThunk(MF, X86::EAX); - else if (MF.getName() == ECXThunkName) - populateThunk(MF, X86::ECX); - else if (MF.getName() == EDXThunkName) - populateThunk(MF, X86::EDX); - else if (MF.getName() == EDIThunkName) - populateThunk(MF, X86::EDI); - else - llvm_unreachable("Invalid thunk name on x86-32!"); - } - - return true; -} - -void X86RetpolineThunks::createThunkFunction(Module &M, StringRef Name) { - assert(Name.startswith(ThunkNamePrefix) && - "Created a thunk with an unexpected prefix!"); - - LLVMContext &Ctx = M.getContext(); - auto Type = FunctionType::get(Type::getVoidTy(Ctx), false); - Function *F = - Function::Create(Type, GlobalValue::LinkOnceODRLinkage, Name, &M); - F->setVisibility(GlobalValue::HiddenVisibility); - F->setComdat(M.getOrInsertComdat(Name)); - - // Add Attributes so that we don't create a frame, unwind information, or - // inline. - AttrBuilder B; - B.addAttribute(llvm::Attribute::NoUnwind); - B.addAttribute(llvm::Attribute::Naked); - F->addAttributes(llvm::AttributeList::FunctionIndex, B); - - // Populate our function a bit so that we can verify. - BasicBlock *Entry = BasicBlock::Create(Ctx, "entry", F); - IRBuilder<> Builder(Entry); - - Builder.CreateRetVoid(); - - // MachineFunctions/MachineBasicBlocks aren't created automatically for the - // IR-level constructs we already made. Create them and insert them into the - // module. - MachineFunction &MF = MMI->getOrCreateMachineFunction(*F); - MachineBasicBlock *EntryMBB = MF.CreateMachineBasicBlock(Entry); - - // Insert EntryMBB into MF. It's not in the module until we do this. - MF.insert(MF.end(), EntryMBB); -} - -void X86RetpolineThunks::insertRegReturnAddrClobber(MachineBasicBlock &MBB, - unsigned Reg) { - const unsigned MovOpc = Is64Bit ? X86::MOV64mr : X86::MOV32mr; - const unsigned SPReg = Is64Bit ? X86::RSP : X86::ESP; - addRegOffset(BuildMI(&MBB, DebugLoc(), TII->get(MovOpc)), SPReg, false, 0) - .addReg(Reg); -} - -void X86RetpolineThunks::populateThunk(MachineFunction &MF, - unsigned Reg) { - // Set MF properties. We never use vregs... - MF.getProperties().set(MachineFunctionProperties::Property::NoVRegs); - - // Grab the entry MBB and erase any other blocks. O0 codegen appears to - // generate two bbs for the entry block. - MachineBasicBlock *Entry = &MF.front(); - Entry->clear(); - while (MF.size() > 1) - MF.erase(std::next(MF.begin())); - - MachineBasicBlock *CaptureSpec = MF.CreateMachineBasicBlock(Entry->getBasicBlock()); - MachineBasicBlock *CallTarget = MF.CreateMachineBasicBlock(Entry->getBasicBlock()); - MCSymbol *TargetSym = MF.getContext().createTempSymbol(); - MF.push_back(CaptureSpec); - MF.push_back(CallTarget); - - const unsigned CallOpc = Is64Bit ? X86::CALL64pcrel32 : X86::CALLpcrel32; - const unsigned RetOpc = Is64Bit ? X86::RETQ : X86::RETL; - - Entry->addLiveIn(Reg); - BuildMI(Entry, DebugLoc(), TII->get(CallOpc)).addSym(TargetSym); - - // The MIR verifier thinks that the CALL in the entry block will fall through - // to CaptureSpec, so mark it as the successor. Technically, CaptureTarget is - // the successor, but the MIR verifier doesn't know how to cope with that. - Entry->addSuccessor(CaptureSpec); - - // In the capture loop for speculation, we want to stop the processor from - // speculating as fast as possible. On Intel processors, the PAUSE instruction - // will block speculation without consuming any execution resources. On AMD - // processors, the PAUSE instruction is (essentially) a nop, so we also use an - // LFENCE instruction which they have advised will stop speculation as well - // with minimal resource utilization. We still end the capture with a jump to - // form an infinite loop to fully guarantee that no matter what implementation - // of the x86 ISA, speculating this code path never escapes. - BuildMI(CaptureSpec, DebugLoc(), TII->get(X86::PAUSE)); - BuildMI(CaptureSpec, DebugLoc(), TII->get(X86::LFENCE)); - BuildMI(CaptureSpec, DebugLoc(), TII->get(X86::JMP_1)).addMBB(CaptureSpec); - CaptureSpec->setHasAddressTaken(); - CaptureSpec->addSuccessor(CaptureSpec); - - CallTarget->addLiveIn(Reg); - CallTarget->setHasAddressTaken(); - CallTarget->setAlignment(Align(16)); - insertRegReturnAddrClobber(*CallTarget, Reg); - CallTarget->back().setPreInstrSymbol(MF, TargetSym); - BuildMI(CallTarget, DebugLoc(), TII->get(RetOpc)); -} diff --git a/llvm/lib/Target/X86/X86Subtarget.h b/llvm/lib/Target/X86/X86Subtarget.h index f4e8d30328caa0..af5153243c8b42 100644 --- a/llvm/lib/Target/X86/X86Subtarget.h +++ b/llvm/lib/Target/X86/X86Subtarget.h @@ -421,6 +421,16 @@ class X86Subtarget final : public X86GenSubtargetInfo { /// than emitting one inside the compiler. bool UseRetpolineExternalThunk = false; + /// Prevent generation of indirect call/branch instructions from memory, + /// and force all indirect call/branch instructions from a register to be + /// preceded by an LFENCE. Also decompose RET instructions into a + /// POP+LFENCE+JMP sequence. + bool UseLVIControlFlowIntegrity = false; + + /// Insert LFENCE instructions to prevent data speculatively injected into + /// loads from being used maliciously. + bool UseLVILoadHardening = false; + /// Use software floating point for code generation. bool UseSoftFloat = false; @@ -707,8 +717,21 @@ class X86Subtarget final : public X86GenSubtargetInfo { return UseRetpolineIndirectBranches; } bool useRetpolineExternalThunk() const { return UseRetpolineExternalThunk; } + + // These are generic getters that OR together all of the thunk types + // supported by the subtarget. Therefore useIndirectThunk*() will return true + // if any respective thunk feature is enabled. + bool useIndirectThunkCalls() const { + return useRetpolineIndirectCalls() || useLVIControlFlowIntegrity(); + } + bool useIndirectThunkBranches() const { + return useRetpolineIndirectBranches() || useLVIControlFlowIntegrity(); + } + bool preferMaskRegisters() const { return PreferMaskRegisters; } bool useGLMDivSqrtCosts() const { return UseGLMDivSqrtCosts; } + bool useLVIControlFlowIntegrity() const { return UseLVIControlFlowIntegrity; } + bool useLVILoadHardening() const { return UseLVILoadHardening; } unsigned getPreferVectorWidth() const { return PreferVectorWidth; } unsigned getRequiredVectorWidth() const { return RequiredVectorWidth; } @@ -853,10 +876,10 @@ class X86Subtarget final : public X86GenSubtargetInfo { /// Return true if the subtarget allows calls to immediate address. bool isLegalToCallImmediateAddr() const; - /// If we are using retpolines, we need to expand indirectbr to avoid it + /// If we are using indirect thunks, we need to expand indirectbr to avoid it /// lowering to an actual indirect jump. bool enableIndirectBrExpand() const override { - return useRetpolineIndirectBranches(); + return useIndirectThunkBranches(); } /// Enable the MachineScheduler pass for all X86 subtargets. diff --git a/llvm/lib/Target/X86/X86TargetMachine.cpp b/llvm/lib/Target/X86/X86TargetMachine.cpp index 7176e46f07b159..680a52b543851a 100644 --- a/llvm/lib/Target/X86/X86TargetMachine.cpp +++ b/llvm/lib/Target/X86/X86TargetMachine.cpp @@ -82,6 +82,8 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeX86Target() { initializeX86SpeculativeLoadHardeningPassPass(PR); initializeX86FlagsCopyLoweringPassPass(PR); initializeX86CondBrFoldingPassPass(PR); + initializeX86LoadValueInjectionLoadHardeningPassPass(PR); + initializeX86LoadValueInjectionRetHardeningPassPass(PR); initializeX86OptimizeLEAPassPass(PR); } @@ -496,6 +498,7 @@ void X86PassConfig::addMachineSSAOptimization() { void X86PassConfig::addPostRegAlloc() { addPass(createX86FloatingPointStackifierPass()); + addPass(createX86LoadValueInjectionLoadHardeningPass()); } void X86PassConfig::addPreSched2() { addPass(createX86ExpandPseudoPass()); } @@ -525,7 +528,7 @@ void X86PassConfig::addPreEmitPass2() { const Triple &TT = TM->getTargetTriple(); const MCAsmInfo *MAI = TM->getMCAsmInfo(); - addPass(createX86RetpolineThunksPass()); + addPass(createX86IndirectThunksPass()); // Insert extra int3 instructions after trailing call instructions to avoid // issues in the unwinder. @@ -542,6 +545,7 @@ void X86PassConfig::addPreEmitPass2() { // Identify valid longjmp targets for Windows Control Flow Guard. if (TT.isOSWindows()) addPass(createCFGuardLongjmpPass()); + addPass(createX86LoadValueInjectionRetHardeningPass()); } std::unique_ptr X86PassConfig::getCSEConfig() const { diff --git a/llvm/test/CodeGen/X86/O0-pipeline.ll b/llvm/test/CodeGen/X86/O0-pipeline.ll index d6f2fef13fec97..a99019941e84ef 100644 --- a/llvm/test/CodeGen/X86/O0-pipeline.ll +++ b/llvm/test/CodeGen/X86/O0-pipeline.ll @@ -55,6 +55,10 @@ ; CHECK-NEXT: Fast Register Allocator ; CHECK-NEXT: Bundle Machine CFG Edges ; CHECK-NEXT: X86 FP Stackifier +; CHECK-NEXT: MachineDominator Tree Construction +; CHECK-NEXT: Machine Natural Loop Construction +; CHECK-NEXT: Machine Dominance Frontier Construction +; CHECK-NEXT: X86 Load Value Injection (LVI) Load Hardening ; CHECK-NEXT: Lazy Machine Block Frequency Analysis ; CHECK-NEXT: Machine Optimization Remark Emitter ; CHECK-NEXT: Prologue/Epilogue Insertion & Frame Finalization @@ -71,8 +75,9 @@ ; CHECK-NEXT: Contiguously Lay Out Funclets ; CHECK-NEXT: StackMap Liveness Analysis ; CHECK-NEXT: Live DEBUG_VALUE analysis -; CHECK-NEXT: X86 Retpoline Thunks +; CHECK-NEXT: X86 Indirect Thunks ; CHECK-NEXT: Check CFA info and insert CFI instructions if needed +; CHECK-NEXT: X86 Load Value Injection (LVI) Ret-Hardening ; CHECK-NEXT: Lazy Machine Block Frequency Analysis ; CHECK-NEXT: Machine Optimization Remark Emitter ; CHECK-NEXT: X86 Assembly Printer diff --git a/llvm/test/CodeGen/X86/O3-pipeline.ll b/llvm/test/CodeGen/X86/O3-pipeline.ll index c645bb63639a30..87ab9ea63b7382 100644 --- a/llvm/test/CodeGen/X86/O3-pipeline.ll +++ b/llvm/test/CodeGen/X86/O3-pipeline.ll @@ -138,9 +138,11 @@ ; CHECK-NEXT: Machine Loop Invariant Code Motion ; CHECK-NEXT: Bundle Machine CFG Edges ; CHECK-NEXT: X86 FP Stackifier +; CHECK-NEXT: MachineDominator Tree Construction +; CHECK-NEXT: Machine Dominance Frontier Construction +; CHECK-NEXT: X86 Load Value Injection (LVI) Load Hardening ; CHECK-NEXT: PostRA Machine Sink ; CHECK-NEXT: Machine Block Frequency Analysis -; CHECK-NEXT: MachineDominator Tree Construction ; CHECK-NEXT: MachinePostDominator Tree Construction ; CHECK-NEXT: Lazy Machine Block Frequency Analysis ; CHECK-NEXT: Machine Optimization Remark Emitter @@ -180,8 +182,9 @@ ; CHECK-NEXT: Contiguously Lay Out Funclets ; CHECK-NEXT: StackMap Liveness Analysis ; CHECK-NEXT: Live DEBUG_VALUE analysis -; CHECK-NEXT: X86 Retpoline Thunks +; CHECK-NEXT: X86 Indirect Thunks ; CHECK-NEXT: Check CFA info and insert CFI instructions if needed +; CHECK-NEXT: X86 Load Value Injection (LVI) Ret-Hardening ; CHECK-NEXT: Lazy Machine Block Frequency Analysis ; CHECK-NEXT: Machine Optimization Remark Emitter ; CHECK-NEXT: X86 Assembly Printer diff --git a/llvm/test/CodeGen/X86/lvi-hardening-gadget-graph.ll b/llvm/test/CodeGen/X86/lvi-hardening-gadget-graph.ll new file mode 100644 index 00000000000000..ba2ce26142b5aa --- /dev/null +++ b/llvm/test/CodeGen/X86/lvi-hardening-gadget-graph.ll @@ -0,0 +1,129 @@ +; RUN: llc -verify-machineinstrs -mtriple=x86_64-unknown -x86-lvi-load-dot-verify -o %t < %s | FileCheck %s + +; Function Attrs: noinline nounwind optnone uwtable +define dso_local i32 @test(i32* %untrusted_user_ptr, i32* %secret, i32 %secret_size) #0 { +entry: + %untrusted_user_ptr.addr = alloca i32*, align 8 + %secret.addr = alloca i32*, align 8 + %secret_size.addr = alloca i32, align 4 + %ret_val = alloca i32, align 4 + %i = alloca i32, align 4 + store i32* %untrusted_user_ptr, i32** %untrusted_user_ptr.addr, align 8 + store i32* %secret, i32** %secret.addr, align 8 + store i32 %secret_size, i32* %secret_size.addr, align 4 + store i32 0, i32* %ret_val, align 4 + call void @llvm.x86.sse2.lfence() + store i32 0, i32* %i, align 4 + br label %for.cond + +for.cond: ; preds = %for.inc, %entry + %0 = load i32, i32* %i, align 4 + %1 = load i32, i32* %secret_size.addr, align 4 + %cmp = icmp slt i32 %0, %1 + br i1 %cmp, label %for.body, label %for.end + +for.body: ; preds = %for.cond + %2 = load i32, i32* %i, align 4 + %rem = srem i32 %2, 2 + %cmp1 = icmp eq i32 %rem, 0 + br i1 %cmp1, label %if.then, label %if.else + +if.then: ; preds = %for.body + %3 = load i32*, i32** %secret.addr, align 8 + %4 = load i32, i32* %ret_val, align 4 + %idxprom = sext i32 %4 to i64 + %arrayidx = getelementptr inbounds i32, i32* %3, i64 %idxprom + %5 = load i32, i32* %arrayidx, align 4 + %6 = load i32*, i32** %untrusted_user_ptr.addr, align 8 + store i32 %5, i32* %6, align 4 + br label %if.end + +if.else: ; preds = %for.body + %7 = load i32*, i32** %secret.addr, align 8 + %8 = load i32, i32* %ret_val, align 4 + %idxprom2 = sext i32 %8 to i64 + %arrayidx3 = getelementptr inbounds i32, i32* %7, i64 %idxprom2 + store i32 42, i32* %arrayidx3, align 4 + br label %if.end + +if.end: ; preds = %if.else, %if.then + %9 = load i32*, i32** %untrusted_user_ptr.addr, align 8 + %10 = load i32, i32* %9, align 4 + store i32 %10, i32* %ret_val, align 4 + br label %for.inc + +for.inc: ; preds = %if.end + %11 = load i32, i32* %i, align 4 + %inc = add nsw i32 %11, 1 + store i32 %inc, i32* %i, align 4 + br label %for.cond + +for.end: ; preds = %for.cond + %12 = load i32, i32* %ret_val, align 4 + ret i32 %12 +} + +; CHECK: digraph "Speculative gadgets for \"test\" function" { +; CHECK-NEXT: label="Speculative gadgets for \"test\" function"; +; CHECK: Node0x{{[0-9a-f]+}} [shape=record,color = green,label="{LFENCE\n}"]; +; CHECK-NEXT: Node0x{{[0-9a-f]+}} -> Node0x{{[0-9a-f]+}}[label = 0]; +; CHECK-NEXT: Node0x{{[0-9a-f]+}} [shape=record,label="{renamable $eax = MOV32rm %stack.4.i, 1, $noreg, 0, $noreg :: (dereferenceable load 4 from %ir.i)\n}"]; +; CHECK-NEXT: Node0x{{[0-9a-f]+}} -> Node0x{{[0-9a-f]+}}[color = red, style = "dashed"]; +; CHECK-NEXT: Node0x{{[0-9a-f]+}} -> Node0x{{[0-9a-f]+}}[label = 1]; +; CHECK-NEXT: Node0x{{[0-9a-f]+}} [shape=record,label="{JCC_1 %bb.6, 13, implicit killed $eflags\n}"]; +; CHECK-NEXT: Node0x{{[0-9a-f]+}} -> Node0x{{[0-9a-f]+}}[label = 1]; +; CHECK-NEXT: Node0x{{[0-9a-f]+}} -> Node0x{{[0-9a-f]+}}[label = 1]; +; CHECK-NEXT: Node0x{{[0-9a-f]+}} [shape=record,label="{CMP32rm killed renamable $eax, %stack.2.secret_size.addr, 1, $noreg, 0, $noreg, implicit-def $eflags :: (dereferenceable load 4 from %ir.secret_size.addr)\n}"]; +; CHECK-NEXT: Node0x{{[0-9a-f]+}} -> Node0x{{[0-9a-f]+}}[color = red, style = "dashed"]; +; CHECK-NEXT: Node0x{{[0-9a-f]+}} -> Node0x{{[0-9a-f]+}}[label = 1]; +; CHECK-NEXT: Node0x{{[0-9a-f]+}} [shape=record,label="{renamable $eax = MOV32rm %stack.4.i, 1, $noreg, 0, $noreg :: (dereferenceable load 4 from %ir.i)\n}"]; +; CHECK-NEXT: Node0x{{[0-9a-f]+}} -> Node0x{{[0-9a-f]+}}[color = red, style = "dashed"]; +; CHECK-NEXT: Node0x{{[0-9a-f]+}} -> Node0x{{[0-9a-f]+}}[label = 1]; +; CHECK-NEXT: Node0x{{[0-9a-f]+}} [shape=record,label="{JCC_1 %bb.4, 5, implicit killed $eflags\n}"]; +; CHECK-NEXT: Node0x{{[0-9a-f]+}} -> Node0x{{[0-9a-f]+}}[label = 1]; +; CHECK-NEXT: Node0x{{[0-9a-f]+}} -> Node0x{{[0-9a-f]+}}[label = 1]; +; CHECK-NEXT: Node0x{{[0-9a-f]+}} [shape=record,label="{renamable $rax = MOV64rm %stack.1.secret.addr, 1, $noreg, 0, $noreg :: (dereferenceable load 8 from %ir.secret.addr)\n}"]; +; CHECK-NEXT: Node0x{{[0-9a-f]+}} -> Node0x{{[0-9a-f]+}}[color = red, style = "dashed"]; +; CHECK-NEXT: Node0x{{[0-9a-f]+}} -> Node0x{{[0-9a-f]+}}[label = 1]; +; CHECK-NEXT: Node0x{{[0-9a-f]+}} [shape=record,label="{renamable $eax = MOV32rm killed renamable $rax, 4, killed renamable $rcx, 0, $noreg :: (load 4 from %ir.arrayidx)\n}"]; +; CHECK-NEXT: Node0x{{[0-9a-f]+}} -> Node0x{{[0-9a-f]+}}[label = 1]; +; CHECK-NEXT: Node0x{{[0-9a-f]+}} [shape=record,label="{renamable $rcx = MOVSX64rm32 %stack.3.ret_val, 1, $noreg, 0, $noreg :: (dereferenceable load 4 from %ir.ret_val)\n}"]; +; CHECK-NEXT: Node0x{{[0-9a-f]+}} -> Node0x{{[0-9a-f]+}}[color = red, style = "dashed"]; +; CHECK-NEXT: Node0x{{[0-9a-f]+}} -> Node0x{{[0-9a-f]+}}[label = 1]; +; CHECK-NEXT: Node0x{{[0-9a-f]+}} [shape=record,label="{renamable $rcx = MOV64rm %stack.0.untrusted_user_ptr.addr, 1, $noreg, 0, $noreg :: (dereferenceable load 8 from %ir.untrusted_user_ptr.addr)\n}"]; +; CHECK-NEXT: Node0x{{[0-9a-f]+}} -> Node0x{{[0-9a-f]+}}[color = red, style = "dashed"]; +; CHECK-NEXT: Node0x{{[0-9a-f]+}} -> Node0x{{[0-9a-f]+}}[label = 1]; +; CHECK-NEXT: Node0x{{[0-9a-f]+}} [shape=record,label="{MOV32mr killed renamable $rcx, 1, $noreg, 0, $noreg, killed renamable $eax :: (store 4 into %ir.6)\n}"]; +; CHECK-NEXT: Node0x{{[0-9a-f]+}} -> Node0x{{[0-9a-f]+}}[label = 1]; +; CHECK-NEXT: Node0x{{[0-9a-f]+}} [shape=record,label="{renamable $rax = MOV64rm %stack.1.secret.addr, 1, $noreg, 0, $noreg :: (dereferenceable load 8 from %ir.secret.addr)\n}"]; +; CHECK-NEXT: Node0x{{[0-9a-f]+}} -> Node0x{{[0-9a-f]+}}[color = red, style = "dashed"]; +; CHECK-NEXT: Node0x{{[0-9a-f]+}} -> Node0x{{[0-9a-f]+}}[label = 1]; +; CHECK-NEXT: Node0x{{[0-9a-f]+}} [shape=record,label="{MOV32mi killed renamable $rax, 4, killed renamable $rcx, 0, $noreg, 42 :: (store 4 into %ir.arrayidx3)\n}"]; +; CHECK-NEXT: Node0x{{[0-9a-f]+}} -> Node0x{{[0-9a-f]+}}[label = 1]; +; CHECK-NEXT: Node0x{{[0-9a-f]+}} [shape=record,label="{renamable $rcx = MOVSX64rm32 %stack.3.ret_val, 1, $noreg, 0, $noreg :: (dereferenceable load 4 from %ir.ret_val)\n}"]; +; CHECK-NEXT: Node0x{{[0-9a-f]+}} -> Node0x{{[0-9a-f]+}}[color = red, style = "dashed"]; +; CHECK-NEXT: Node0x{{[0-9a-f]+}} -> Node0x{{[0-9a-f]+}}[label = 1]; +; CHECK-NEXT: Node0x{{[0-9a-f]+}} [shape=record,label="{renamable $rax = MOV64rm %stack.0.untrusted_user_ptr.addr, 1, $noreg, 0, $noreg :: (dereferenceable load 8 from %ir.untrusted_user_ptr.addr)\n}"]; +; CHECK-NEXT: Node0x{{[0-9a-f]+}} -> Node0x{{[0-9a-f]+}}[color = red, style = "dashed"]; +; CHECK-NEXT: Node0x{{[0-9a-f]+}} -> Node0x{{[0-9a-f]+}}[label = 1]; +; CHECK-NEXT: Node0x{{[0-9a-f]+}} [shape=record,label="{renamable $eax = MOV32rm killed renamable $rax, 1, $noreg, 0, $noreg :: (load 4 from %ir.9)\n}"]; +; CHECK-NEXT: Node0x{{[0-9a-f]+}} -> Node0x{{[0-9a-f]+}}[label = 1]; +; CHECK-NEXT: Node0x{{[0-9a-f]+}} [shape=record,color = blue,label="{ARGS}"]; +; CHECK-NEXT: Node0x{{[0-9a-f]+}} -> Node0x{{[0-9a-f]+}}[label = 0]; +; CHECK-NEXT: Node0x{{[0-9a-f]+}} [shape=record,label="{MOV64mr %stack.0.untrusted_user_ptr.addr, 1, $noreg, 0, $noreg, killed renamable $rdi :: (store 8 into %ir.untrusted_user_ptr.addr)\n}"]; +; CHECK-NEXT: Node0x{{[0-9a-f]+}} -> Node0x{{[0-9a-f]+}}[label = 0]; +; CHECK-NEXT: Node0x{{[0-9a-f]+}} [shape=record,label="{JMP_1 %bb.5\n}"]; +; CHECK-NEXT: Node0x{{[0-9a-f]+}} -> Node0x{{[0-9a-f]+}}[label = 1]; +; CHECK-NEXT: Node0x{{[0-9a-f]+}} [shape=record,label="{JMP_1 %bb.1\n}"]; +; CHECK-NEXT: Node0x{{[0-9a-f]+}} -> Node0x{{[0-9a-f]+}}[label = 1]; +; CHECK-NEXT: Node0x{{[0-9a-f]+}} [shape=record,label="{renamable $eax = MOV32rm %stack.3.ret_val, 1, $noreg, 0, $noreg :: (dereferenceable load 4 from %ir.ret_val)\n}"]; +; CHECK-NEXT: Node0x{{[0-9a-f]+}} -> Node0x{{[0-9a-f]+}}[label = 0]; +; CHECK-NEXT: Node0x{{[0-9a-f]+}} [shape=record,label="{RET 0, $eax\n}"]; +; CHECK-NEXT: } + +; Function Attrs: nounwind +declare void @llvm.x86.sse2.lfence() #1 + +attributes #0 = { "target-features"="+lvi-cfi" + "target-features"="+lvi-load-hardening" } +attributes #1 = { nounwind } diff --git a/llvm/test/CodeGen/X86/lvi-hardening-indirectbr.ll b/llvm/test/CodeGen/X86/lvi-hardening-indirectbr.ll new file mode 100644 index 00000000000000..d2caf6e1e9eb0c --- /dev/null +++ b/llvm/test/CodeGen/X86/lvi-hardening-indirectbr.ll @@ -0,0 +1,281 @@ +; RUN: llc -verify-machineinstrs -mtriple=x86_64-unknown -mattr=+lvi-cfi < %s | FileCheck %s --check-prefix=X64 +; RUN: llc -verify-machineinstrs -mtriple=x86_64-unknown -mattr=+lvi-cfi -O0 < %s | FileCheck %s --check-prefix=X64FAST +; +; Note that a lot of this code was lifted from retpoline.ll. + +declare void @bar(i32) + +; Test a simple indirect call and tail call. +define void @icall_reg(void (i32)* %fp, i32 %x) { +entry: + tail call void @bar(i32 %x) + tail call void %fp(i32 %x) + tail call void @bar(i32 %x) + tail call void %fp(i32 %x) + ret void +} + +; X64-LABEL: icall_reg: +; X64-DAG: movq %rdi, %[[fp:[^ ]*]] +; X64-DAG: movl %esi, %[[x:[^ ]*]] +; X64: movl %esi, %edi +; X64: callq bar +; X64-DAG: movl %[[x]], %edi +; X64-DAG: movq %[[fp]], %r11 +; X64: callq __llvm_lvi_thunk_r11 +; X64: movl %[[x]], %edi +; X64: callq bar +; X64-DAG: movl %[[x]], %edi +; X64-DAG: movq %[[fp]], %r11 +; X64: jmp __llvm_lvi_thunk_r11 # TAILCALL + +; X64FAST-LABEL: icall_reg: +; X64FAST: callq bar +; X64FAST: callq __llvm_lvi_thunk_r11 +; X64FAST: callq bar +; X64FAST: jmp __llvm_lvi_thunk_r11 # TAILCALL + + +@global_fp = external global void (i32)* + +; Test an indirect call through a global variable. +define void @icall_global_fp(i32 %x, void (i32)** %fpp) #0 { + %fp1 = load void (i32)*, void (i32)** @global_fp + call void %fp1(i32 %x) + %fp2 = load void (i32)*, void (i32)** @global_fp + tail call void %fp2(i32 %x) + ret void +} + +; X64-LABEL: icall_global_fp: +; X64-DAG: movl %edi, %[[x:[^ ]*]] +; X64-DAG: movq global_fp(%rip), %r11 +; X64: callq __llvm_lvi_thunk_r11 +; X64-DAG: movl %[[x]], %edi +; X64-DAG: movq global_fp(%rip), %r11 +; X64: jmp __llvm_lvi_thunk_r11 # TAILCALL + +; X64FAST-LABEL: icall_global_fp: +; X64FAST: movq global_fp(%rip), %r11 +; X64FAST: callq __llvm_lvi_thunk_r11 +; X64FAST: movq global_fp(%rip), %r11 +; X64FAST: jmp __llvm_lvi_thunk_r11 # TAILCALL + + +%struct.Foo = type { void (%struct.Foo*)** } + +; Test an indirect call through a vtable. +define void @vcall(%struct.Foo* %obj) #0 { + %vptr_field = getelementptr %struct.Foo, %struct.Foo* %obj, i32 0, i32 0 + %vptr = load void (%struct.Foo*)**, void (%struct.Foo*)*** %vptr_field + %vslot = getelementptr void(%struct.Foo*)*, void(%struct.Foo*)** %vptr, i32 1 + %fp = load void(%struct.Foo*)*, void(%struct.Foo*)** %vslot + tail call void %fp(%struct.Foo* %obj) + tail call void %fp(%struct.Foo* %obj) + ret void +} + +; X64-LABEL: vcall: +; X64: movq %rdi, %[[obj:[^ ]*]] +; X64: movq (%rdi), %[[vptr:[^ ]*]] +; X64: movq 8(%[[vptr]]), %[[fp:[^ ]*]] +; X64: movq %[[fp]], %r11 +; X64: callq __llvm_lvi_thunk_r11 +; X64-DAG: movq %[[obj]], %rdi +; X64-DAG: movq %[[fp]], %r11 +; X64: jmp __llvm_lvi_thunk_r11 # TAILCALL + +; X64FAST-LABEL: vcall: +; X64FAST: callq __llvm_lvi_thunk_r11 +; X64FAST: jmp __llvm_lvi_thunk_r11 # TAILCALL + + +declare void @direct_callee() + +define void @direct_tail() #0 { + tail call void @direct_callee() + ret void +} + +; X64-LABEL: direct_tail: +; X64: jmp direct_callee # TAILCALL +; X64FAST-LABEL: direct_tail: +; X64FAST: jmp direct_callee # TAILCALL + + +declare void @nonlazybind_callee() #1 + +define void @nonlazybind_caller() #0 { + call void @nonlazybind_callee() + tail call void @nonlazybind_callee() + ret void +} + +; X64-LABEL: nonlazybind_caller: +; X64: movq nonlazybind_callee@GOTPCREL(%rip), %[[REG:.*]] +; X64: movq %[[REG]], %r11 +; X64: callq __llvm_lvi_thunk_r11 +; X64: movq %[[REG]], %r11 +; X64: jmp __llvm_lvi_thunk_r11 # TAILCALL +; X64FAST-LABEL: nonlazybind_caller: +; X64FAST: movq nonlazybind_callee@GOTPCREL(%rip), %r11 +; X64FAST: callq __llvm_lvi_thunk_r11 +; X64FAST: movq nonlazybind_callee@GOTPCREL(%rip), %r11 +; X64FAST: jmp __llvm_lvi_thunk_r11 # TAILCALL + + +; Check that a switch gets lowered using a jump table +define void @switch_jumptable(i32* %ptr, i64* %sink) #0 { +; X64-LABEL: switch_jumptable: +; X64_NOT: jmpq * +entry: + br label %header + +header: + %i = load volatile i32, i32* %ptr + switch i32 %i, label %bb0 [ + i32 1, label %bb1 + i32 2, label %bb2 + i32 3, label %bb3 + i32 4, label %bb4 + i32 5, label %bb5 + i32 6, label %bb6 + i32 7, label %bb7 + i32 8, label %bb8 + i32 9, label %bb9 + ] + +bb0: + store volatile i64 0, i64* %sink + br label %header + +bb1: + store volatile i64 1, i64* %sink + br label %header + +bb2: + store volatile i64 2, i64* %sink + br label %header + +bb3: + store volatile i64 3, i64* %sink + br label %header + +bb4: + store volatile i64 4, i64* %sink + br label %header + +bb5: + store volatile i64 5, i64* %sink + br label %header + +bb6: + store volatile i64 6, i64* %sink + br label %header + +bb7: + store volatile i64 7, i64* %sink + br label %header + +bb8: + store volatile i64 8, i64* %sink + br label %header + +bb9: + store volatile i64 9, i64* %sink + br label %header +} + + +@indirectbr_rewrite.targets = constant [10 x i8*] [i8* blockaddress(@indirectbr_rewrite, %bb0), + i8* blockaddress(@indirectbr_rewrite, %bb1), + i8* blockaddress(@indirectbr_rewrite, %bb2), + i8* blockaddress(@indirectbr_rewrite, %bb3), + i8* blockaddress(@indirectbr_rewrite, %bb4), + i8* blockaddress(@indirectbr_rewrite, %bb5), + i8* blockaddress(@indirectbr_rewrite, %bb6), + i8* blockaddress(@indirectbr_rewrite, %bb7), + i8* blockaddress(@indirectbr_rewrite, %bb8), + i8* blockaddress(@indirectbr_rewrite, %bb9)] + +; Check that when thunks are enabled the indirectbr instruction gets +; rewritten to use switch, and that in turn doesn't get lowered as a jump +; table. +define void @indirectbr_rewrite(i64* readonly %p, i64* %sink) #0 { +; X64-LABEL: indirectbr_rewrite: +; X64-NOT: jmpq * +entry: + %i0 = load i64, i64* %p + %target.i0 = getelementptr [10 x i8*], [10 x i8*]* @indirectbr_rewrite.targets, i64 0, i64 %i0 + %target0 = load i8*, i8** %target.i0 + indirectbr i8* %target0, [label %bb1, label %bb3] + +bb0: + store volatile i64 0, i64* %sink + br label %latch + +bb1: + store volatile i64 1, i64* %sink + br label %latch + +bb2: + store volatile i64 2, i64* %sink + br label %latch + +bb3: + store volatile i64 3, i64* %sink + br label %latch + +bb4: + store volatile i64 4, i64* %sink + br label %latch + +bb5: + store volatile i64 5, i64* %sink + br label %latch + +bb6: + store volatile i64 6, i64* %sink + br label %latch + +bb7: + store volatile i64 7, i64* %sink + br label %latch + +bb8: + store volatile i64 8, i64* %sink + br label %latch + +bb9: + store volatile i64 9, i64* %sink + br label %latch + +latch: + %i.next = load i64, i64* %p + %target.i.next = getelementptr [10 x i8*], [10 x i8*]* @indirectbr_rewrite.targets, i64 0, i64 %i.next + %target.next = load i8*, i8** %target.i.next + ; Potentially hit a full 10 successors here so that even if we rewrite as + ; a switch it will try to be lowered with a jump table. + indirectbr i8* %target.next, [label %bb0, + label %bb1, + label %bb2, + label %bb3, + label %bb4, + label %bb5, + label %bb6, + label %bb7, + label %bb8, + label %bb9] +} + +; Lastly check that the necessary thunks were emitted. +; +; X64-LABEL: .section .text.__llvm_lvi_thunk_r11,{{.*}},__llvm_lvi_thunk_r11,comdat +; X64-NEXT: .hidden __llvm_lvi_thunk_r11 +; X64-NEXT: .weak __llvm_lvi_thunk_r11 +; X64: __llvm_lvi_thunk_r11: +; X64-NEXT: # {{.*}} # %entry +; X64-NEXT: lfence +; X64-NEXT: jmpq *%r11 + +attributes #1 = { nonlazybind } diff --git a/llvm/test/CodeGen/X86/lvi-hardening-inline-asm.ll b/llvm/test/CodeGen/X86/lvi-hardening-inline-asm.ll new file mode 100644 index 00000000000000..2b3ba2b30d4b7a --- /dev/null +++ b/llvm/test/CodeGen/X86/lvi-hardening-inline-asm.ll @@ -0,0 +1,156 @@ +; RUN: llc -verify-machineinstrs -mtriple=x86_64-unknown -mattr=+lvi-load-hardening -mattr=+lvi-cfi -x86-experimental-lvi-inline-asm-hardening < %s -o %t.out 2> %t.err +; RUN: FileCheck %s --check-prefix=X86 < %t.out +; RUN: FileCheck %s --check-prefix=WARN < %t.err + +; Test module-level assembly +module asm "pop %rbx" +module asm "ret" +; WARN: warning: Instruction may be vulnerable to LVI +; WARN-NEXT: ret +; WARN-NEXT: ^ +; WARN-NEXT: note: See https://software.intel.com/security-software-guidance/insights/deep-dive-load-value-injection#specialinstructions for more information + +; Function Attrs: noinline nounwind optnone uwtable +define dso_local void @test_inline_asm() { +entry: +; X86-LABEL: test_inline_asm: + call void asm sideeffect "mov 0x3fed(%rip),%rax", "~{dirflag},~{fpsr},~{flags}"() #1 +; X86: movq 16365(%rip), %rax +; X86-NEXT: lfence + call void asm sideeffect "movdqa 0x0(%rip),%xmm0", "~{dirflag},~{fpsr},~{flags}"() #1 +; X86: movdqa (%rip), %xmm0 +; X86-NEXT: lfence + call void asm sideeffect "movslq 0x3e5d(%rip),%rbx", "~{dirflag},~{fpsr},~{flags}"() #1 +; X86: movslq 15965(%rip), %rbx +; X86-NEXT: lfence + call void asm sideeffect "mov (%r12,%rax,8),%rax", "~{dirflag},~{fpsr},~{flags}"() #1 +; X86: movq (%r12,%rax,8), %rax +; X86-NEXT: lfence + call void asm sideeffect "movq (24)(%rsi), %r11", "~{dirflag},~{fpsr},~{flags}"() #1 +; X86: movq 24(%rsi), %r11 +; X86-NEXT: lfence + call void asm sideeffect "cmove %r12,%rax", "~{dirflag},~{fpsr},~{flags}"() #1 +; X86: cmoveq %r12, %rax +; X86-NOT: lfence + call void asm sideeffect "cmove (%r12),%rax", "~{dirflag},~{fpsr},~{flags}"() #1 +; X86: cmoveq (%r12), %rax +; X86-NEXT: lfence + call void asm sideeffect "pop %rbx", "~{dirflag},~{fpsr},~{flags}"() #1 +; X86: popq %rbx +; X86-NEXT: lfence + call void asm sideeffect "popq %rbx", "~{dirflag},~{fpsr},~{flags}"() #1 +; X86: popq %rbx +; X86-NEXT: lfence + call void asm sideeffect "xchg (%r12),%rax", "~{dirflag},~{fpsr},~{flags}"() #1 +; X86: xchgq %rax, (%r12) +; X86-NEXT: lfence + call void asm sideeffect "cmpxchg %r12,(%rax)", "~{dirflag},~{fpsr},~{flags}"() #1 +; X86: cmpxchgq %r12, (%rax) +; X86-NEXT: lfence + call void asm sideeffect "vpxor (%rcx,%rdx,1),%ymm1,%ymm0", "~{dirflag},~{fpsr},~{flags}"() #1 +; X86: vpxor (%rcx,%rdx), %ymm1, %ymm0 +; X86-NEXT: lfence + call void asm sideeffect "vpmuludq 0x20(%rsi),%ymm0,%ymm12", "~{dirflag},~{fpsr},~{flags}"() #1 +; X86: vpmuludq 32(%rsi), %ymm0, %ymm12 +; X86-NEXT: lfence + call void asm sideeffect "vpexpandq 0x40(%rdi),%zmm8{%k2}{z}", "~{dirflag},~{fpsr},~{flags}"() #1 +; X86: vpexpandq 64(%rdi), %zmm8 {%k2} {z} +; X86-NEXT: lfence + call void asm sideeffect "addq (%r12),%rax", "~{dirflag},~{fpsr},~{flags}"() #1 +; X86: addq (%r12), %rax +; X86-NEXT: lfence + call void asm sideeffect "subq Lpoly+0(%rip), %rax", "~{dirflag},~{fpsr},~{flags}"() #1 +; X86: subq Lpoly+0(%rip), %rax +; X86-NEXT: lfence + call void asm sideeffect "adcq %r12,(%rax)", "~{dirflag},~{fpsr},~{flags}"() #1 +; X86: adcq %r12, (%rax) +; X86-NEXT: lfence + call void asm sideeffect "negq (%rax)", "~{dirflag},~{fpsr},~{flags}"() #1 +; X86: negq (%rax) +; X86-NEXT: lfence + call void asm sideeffect "incq %rax", "~{dirflag},~{fpsr},~{flags}"() #1 +; X86: incq %rax +; X86-NOT: lfence + call void asm sideeffect "mulq (%rax)", "~{dirflag},~{fpsr},~{flags}"() #1 +; X86: mulq (%rax) +; X86-NEXT: lfence + call void asm sideeffect "imulq (%rax),%rdx", "~{dirflag},~{fpsr},~{flags}"() #1 +; X86: imulq (%rax), %rdx +; X86-NEXT: lfence + call void asm sideeffect "shlq $$1,(%rax)", "~{dirflag},~{fpsr},~{flags}"() #1 +; X86: shlq (%rax) +; X86-NEXT: lfence + call void asm sideeffect "shrq $$1,(%rax)", "~{dirflag},~{fpsr},~{flags}"() #1 +; X86: shrq (%rax) +; X86-NEXT: lfence + call void asm sideeffect "repz cmpsb %es:(%rdi),%ds:(%rsi)", "~{dirflag},~{fpsr},~{flags}"() #1 +; WARN: warning: Instruction may be vulnerable to LVI +; WARN-NEXT: repz cmpsb %es:(%rdi),%ds:(%rsi) +; WARN-NEXT: ^ +; WARN-NEXT: note: See https://software.intel.com/security-software-guidance/insights/deep-dive-load-value-injection#specialinstructions for more information +; X86: rep cmpsb %es:(%rdi), %ds:(%rsi) +; X86-NOT: lfence + call void asm sideeffect "repnz scasb", "~{dirflag},~{fpsr},~{flags}"() #1 +; WARN: warning: Instruction may be vulnerable to LVI +; WARN-NEXT: repnz scasb +; WARN-NEXT: ^ +; WARN-NEXT: note: See https://software.intel.com/security-software-guidance/insights/deep-dive-load-value-injection#specialinstructions for more information +; X86: repne scasb %es:(%rdi), %al +; X86-NOT: lfence + call void asm sideeffect "repnz", ""() #1 +; WARN: warning: Instruction may be vulnerable to LVI +; WARN-NEXT: repnz +; WARN-NEXT: ^ +; WARN-NEXT: note: See https://software.intel.com/security-software-guidance/insights/deep-dive-load-value-injection#specialinstructions for more information + call void asm sideeffect "pinsrw $$0x6,(%eax),%xmm0", "~{dirflag},~{fpsr},~{flags}"() #1 +; X86: pinsrw $6, (%eax), %xmm0 +; X86-NEXT: lfence + call void asm sideeffect "ret", "~{dirflag},~{fpsr},~{flags}"() #1 +; WARN: warning: Instruction may be vulnerable to LVI +; WARN-NEXT: ret +; WARN-NEXT: ^ +; WARN-NEXT: note: See https://software.intel.com/security-software-guidance/insights/deep-dive-load-value-injection#specialinstructions for more information +; X86: retq +; X86-NOT: lfence + call void asm sideeffect "ret $$8", "~{dirflag},~{fpsr},~{flags}"() #1 +; WARN: warning: Instruction may be vulnerable to LVI +; WARN-NEXT: ret $8 +; WARN-NEXT: ^ +; WARN-NEXT: note: See https://software.intel.com/security-software-guidance/insights/deep-dive-load-value-injection#specialinstructions for more information +; X86: retq $8 +; X86-NOT: lfence + call void asm sideeffect "jmpq *(%rdx)", "~{dirflag},~{fpsr},~{flags}"() #1 +; WARN: warning: Instruction may be vulnerable to LVI +; WARN-NEXT: jmpq *(%rdx) +; WARN-NEXT: ^ +; WARN-NEXT: note: See https://software.intel.com/security-software-guidance/insights/deep-dive-load-value-injection#specialinstructions for more information +; X86: jmpq *(%rdx) +; X86-NOT: lfence + call void asm sideeffect "jmpq *0x100(%rdx)", "~{dirflag},~{fpsr},~{flags}"() #1 +; WARN: warning: Instruction may be vulnerable to LVI +; WARN-NEXT: jmpq *0x100(%rdx) +; WARN-NEXT: ^ +; WARN-NEXT: note: See https://software.intel.com/security-software-guidance/insights/deep-dive-load-value-injection#specialinstructions for more information +; X86: jmpq *256(%rdx) +; X86-NOT: lfence + call void asm sideeffect "callq *200(%rdx)", "~{dirflag},~{fpsr},~{flags}"() #1 +; WARN: warning: Instruction may be vulnerable to LVI +; WARN-NEXT: callq *200(%rdx) +; WARN-NEXT: ^ +; WARN-NEXT: note: See https://software.intel.com/security-software-guidance/insights/deep-dive-load-value-injection#specialinstructions for more information +; X86: callq *200(%rdx) +; X86-NOT: lfence + call void asm sideeffect "fldt 0x8(%rbp)", "~{dirflag},~{fpsr},~{flags}"() #1 +; X86: fldt 8(%rbp) +; X86-NEXT: lfence + call void asm sideeffect "fld %st(0)", "~{dirflag},~{fpsr},~{flags}"() #1 +; X86: fld %st(0) +; X86-NOT: lfence +; Test assembler macros + call void asm sideeffect ".macro mplus1 x\0Aincq (\5Cx)\0A.endm\0Amplus1 %rcx", "~{dirflag},~{fpsr},~{flags}"() #1 +; X86: incq (%rcx) +; X86-NEXT: lfence + ret void +} + +attributes #1 = { nounwind } diff --git a/llvm/test/CodeGen/X86/lvi-hardening-loads.ll b/llvm/test/CodeGen/X86/lvi-hardening-loads.ll new file mode 100644 index 00000000000000..3149d5b53c472e --- /dev/null +++ b/llvm/test/CodeGen/X86/lvi-hardening-loads.ll @@ -0,0 +1,98 @@ +; RUN: llc -verify-machineinstrs -mtriple=x86_64-unknown < %s | FileCheck %s --check-prefix=X64 --check-prefix=X64-ALL +; RUN: llc -verify-machineinstrs -mtriple=x86_64-unknown --x86-lvi-load-no-cbranch < %s | FileCheck %s --check-prefix=X64 + +; Function Attrs: noinline nounwind optnone uwtable +define dso_local i32 @test(i32** %secret, i32 %secret_size) #0 { +; X64-LABEL: test: +entry: + %secret.addr = alloca i32**, align 8 + %secret_size.addr = alloca i32, align 4 + %ret_val = alloca i32, align 4 + %i = alloca i32, align 4 + store i32** %secret, i32*** %secret.addr, align 8 + store i32 %secret_size, i32* %secret_size.addr, align 4 + store i32 0, i32* %ret_val, align 4 + call void @llvm.x86.sse2.lfence() + store i32 0, i32* %i, align 4 + br label %for.cond + +; X64: # %bb.0: # %entry +; X64-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; X64-NEXT: movl %esi, -{{[0-9]+}}(%rsp) +; X64-NEXT: movl $0, -{{[0-9]+}}(%rsp) +; X64-NEXT: lfence +; X64-NEXT: movl $0, -{{[0-9]+}}(%rsp) +; X64-NEXT: jmp .LBB0_1 + +for.cond: ; preds = %for.inc, %entry + %0 = load i32, i32* %i, align 4 + %1 = load i32, i32* %secret_size.addr, align 4 + %cmp = icmp slt i32 %0, %1 + br i1 %cmp, label %for.body, label %for.end + +; X64: .LBB0_1: # %for.cond +; X64-NEXT: # =>This Inner Loop Header: Depth=1 +; X64-NEXT: movl -{{[0-9]+}}(%rsp), %eax +; X64-ALL-NEXT: lfence +; X64-NEXT: cmpl -{{[0-9]+}}(%rsp), %eax +; X64-ALL-NEXT: lfence +; X64-NEXT: jge .LBB0_5 + +for.body: ; preds = %for.cond + %2 = load i32, i32* %i, align 4 + %rem = srem i32 %2, 2 + %cmp1 = icmp eq i32 %rem, 0 + br i1 %cmp1, label %if.then, label %if.end + +; X64: # %bb.2: # %for.body +; X64-NEXT: # in Loop: Header=BB0_1 Depth=1 +; X64-NEXT: movl -{{[0-9]+}}(%rsp), %eax +; X64-ALL-NEXT: lfence +; X64-NEXT: movl %eax, %ecx +; X64-NEXT: shrl $31, %ecx +; X64-NEXT: addl %eax, %ecx +; X64-NEXT: andl $-2, %ecx +; X64-NEXT: cmpl %ecx, %eax +; X64-NEXT: jne .LBB0_4 + +if.then: ; preds = %for.body + %3 = load i32**, i32*** %secret.addr, align 8 + %4 = load i32, i32* %ret_val, align 4 + %idxprom = sext i32 %4 to i64 + %arrayidx = getelementptr inbounds i32*, i32** %3, i64 %idxprom + %5 = load i32*, i32** %arrayidx, align 8 + %6 = load i32, i32* %5, align 4 + store i32 %6, i32* %ret_val, align 4 + br label %if.end + +; X64: # %bb.3: # %if.then +; X64-NEXT: # in Loop: Header=BB0_1 Depth=1 +; X64-NEXT: movq -{{[0-9]+}}(%rsp), %rax +; X64-NEXT: lfence +; X64-NEXT: movslq -{{[0-9]+}}(%rsp), %rcx +; X64-NEXT: lfence +; X64-NEXT: movq (%rax,%rcx,8), %rax +; X64-NEXT: lfence +; X64-NEXT: movl (%rax), %eax +; X64-NEXT: movl %eax, -{{[0-9]+}}(%rsp) +; X64-NEXT: jmp .LBB0_4 + +if.end: ; preds = %if.then, %for.body + br label %for.inc + +for.inc: ; preds = %if.end + %7 = load i32, i32* %i, align 4 + %inc = add nsw i32 %7, 1 + store i32 %inc, i32* %i, align 4 + br label %for.cond + +for.end: ; preds = %for.cond + %8 = load i32, i32* %ret_val, align 4 + ret i32 %8 +} + +; Function Attrs: nounwind +declare void @llvm.x86.sse2.lfence() #1 + +attributes #0 = { "target-features"="+lvi-load-hardening" } +attributes #1 = { nounwind } diff --git a/llvm/test/CodeGen/X86/lvi-hardening-ret.ll b/llvm/test/CodeGen/X86/lvi-hardening-ret.ll new file mode 100644 index 00000000000000..9f2b028b303445 --- /dev/null +++ b/llvm/test/CodeGen/X86/lvi-hardening-ret.ll @@ -0,0 +1,72 @@ +; RUN: llc -verify-machineinstrs -mtriple=x86_64-unknown < %s | FileCheck %s + +define dso_local void @one_instruction() #0 { +; CHECK-LABEL: one_instruction: +entry: + ret void +; CHECK-NOT: retq +; CHECK: popq %[[x:[^ ]*]] +; CHECK-NEXT: lfence +; CHECK-NEXT: jmpq *%[[x]] +} + +; Function Attrs: noinline nounwind optnone uwtable +define dso_local i32 @ordinary_function(i32 %x, i32 %y) #0 { +; CHECK-LABEL: ordinary_function: +entry: + %x.addr = alloca i32, align 4 + %y.addr = alloca i32, align 4 + store i32 %x, i32* %x.addr, align 4 + store i32 %y, i32* %y.addr, align 4 + %0 = load i32, i32* %x.addr, align 4 + %1 = load i32, i32* %y.addr, align 4 + %add = add nsw i32 %0, %1 + ret i32 %add +; CHECK-NOT: retq +; CHECK: popq %[[x:[^ ]*]] +; CHECK-NEXT: lfence +; CHECK-NEXT: jmpq *%[[x]] +} + +; Function Attrs: noinline nounwind optnone uwtable +define dso_local i32 @no_caller_saved_registers_function(i32 %x, i32 %y) #1 { +; CHECK-LABEL: no_caller_saved_registers_function: +entry: + %x.addr = alloca i32, align 4 + %y.addr = alloca i32, align 4 + store i32 %x, i32* %x.addr, align 4 + store i32 %y, i32* %y.addr, align 4 + %0 = load i32, i32* %x.addr, align 4 + %1 = load i32, i32* %y.addr, align 4 + %add = add nsw i32 %0, %1 + ret i32 %add +; CHECK-NOT: retq +; CHECK: shlq $0, (%{{[^ ]*}}) +; CHECK-NEXT: lfence +; CHECK-NEXT: retq +} + +; Function Attrs: noinline nounwind optnone uwtable +define dso_local preserve_mostcc void @preserve_most() #0 { +; CHECK-LABEL: preserve_most: +entry: + ret void +; CHECK-NOT: retq +; CHECK: popq %r11 +; CHECK-NEXT: lfence +; CHECK-NEXT: jmpq *%r11 +} + +; Function Attrs: noinline nounwind optnone uwtable +define dso_local preserve_allcc void @preserve_all() #0 { +; CHECK-LABEL: preserve_all: +entry: + ret void +; CHECK-NOT: retq +; CHECK: popq %r11 +; CHECK-NEXT: lfence +; CHECK-NEXT: jmpq *%r11 +} + +attributes #0 = { "target-features"="+lvi-cfi" } +attributes #1 = { "no_caller_saved_registers" "target-features"="+lvi-cfi" }