Pharos PIC hash update, 2025.01.30.

This branch is a work in progress. It contains a major rewrite to fn2hash that has major backward incompatibilities with the original fn2hash. PIC hashing, in particular, has been completely rewritten. This is an unstable release, and future changes to this branch may also change results without warning. Fixes #267
cmu-sei · Jan 30, 2025 · 5b6ca4f · 5b6ca4f
1 parent 1a023b3
commit 5b6ca4f
Show file tree

Hide file tree

Showing 10 changed files with 1,267 additions and 987 deletions.
diff --git a/libpharos/funcs.cpp b/libpharos/funcs.cpp
diff --git a/libpharos/funcs.hpp b/libpharos/funcs.hpp
@@ -1,4 +1,4 @@
-// Copyright 2015-2021 Carnegie Mellon University.  See LICENSE file for terms.
+// Copyright 2015-2024 Carnegie Mellon University.  See LICENSE file for terms.
 
 #ifndef Pharos_Funcs_H
 #define Pharos_Funcs_H
@@ -82,35 +82,6 @@ class FunctionDescriptor : private Immobile {
   mutable shared_mutex mutex;
   mutable std::recursive_mutex pdg_mutex;
 
-  // only fn2hash really cares about these particular hashes, so instead of wasting memory
-  // keeping them in the FunctionDescriptor, we'll generate them only if explicitly requested
-  // by passing in this classs to populate, and return them in there:
-  class ExtraFunctionHashData {
-   public:
-    std::string mnemonics; // concatenated mnemonics
-    std::string mnemcats; // concatenated mnemonic categories
-
-    std::string mnemonic_hash; // variant of EHASH but only mnemonics (no operands) instead of insn bytes
-    std::string mnemonic_category_hash; // variant of PHASH but only mnemonic categories
-    std::string mnemonic_count_hash; // hash of the ordered mnemonic/count pairs
-    std::string mnemonic_category_count_hash; // hash of the orderend mnemcat/count pairs
-
-    std::map< std::string, uint32_t > mnemonic_counts;
-    std::map< std::string, uint32_t > mnemonic_category_counts;
-
-    std::vector< rose_addr_t > basic_block_addrs; // added in flow order (take len to get # bbs)
-    std::vector< std::pair< rose_addr_t, rose_addr_t > > cfg_edges; // from->to pairs of bb addrs (empty if only 1 bb?)
-    class BasicBlockHashData {
-     public:
-      //rose_addr_t addr; // eh, get addr from list above or map below
-      std::string pic;
-      std::string cpic;
-      std::vector< std::string > mnemonics; // in insn order (take len to see how many insn in bb)
-      std::vector< std::string > mnemonic_categories; // in insn order (take len to see how many insn in bb)
-    };
-    std::map< rose_addr_t, BasicBlockHashData > basic_block_hash_data;
-  };
-
  private:
 
   // The address of the function.  This can refer to address that does not yet have a function
@@ -184,15 +155,11 @@ class FunctionDescriptor : private Immobile {
   // by get_pic_bytes() or get_pic_hash() by compute_func_bytes().
   std::string pic_bytes;
   std::string pic_hash;
-  std::list< uint32_t > pic_offsets; // the offsets of the PICed out bytes, so Yara sigs can be generted w/ this data
-
-  std::string composite_pic_hash; // variant of PIC w/ no control flow insn, basic blocks hashed and func hashed by hashing those ordered hashes (ASCII values)...
+  std::vector<uint8_t> pic_mask; // A bitmask for the PIC'd bytes
 
   // might as well collect some fn level stats:
-  unsigned int num_blocks; // basic blocks, that is
-  unsigned int num_blocks_in_cfg;
-  unsigned int num_instructions;
-  unsigned int num_bytes;
+  std::uint64_t num_instructions;
+  std::uint64_t num_bytes;
 
   // This is set to true if we're pretty certain that we never return.  It's a little unclear
   // what level of semantic analysis we mean right now, but I think it would be fine currently
@@ -264,7 +231,6 @@ class FunctionDescriptor : private Immobile {
 
   CFG const & _get_rose_cfg() const;
 
-
   // Update the return value fields in the parameter list.  Only called while generating
   // the PDG.
   void update_return_values();
@@ -283,7 +249,9 @@ class FunctionDescriptor : private Immobile {
   // method.  compute_function_hashes() is const, but uses casting to call this instead.  This
   // is because compute_function_hashes() and the methods that depend on it are semantically
   // const, but defer calculation until needed.
-  void _compute_function_hashes(ExtraFunctionHashData *extra=NULL);
+  void _compute_function_hashes();
+  // A mutexless version of get_insns_addr_order for internal callers.
+  InsnVector _get_insns_addr_order() const;
 
   const PDG * _get_pdg();
 
@@ -444,9 +412,8 @@ class FunctionDescriptor : private Immobile {
     return stack_analysis_failures;
   }
 
-  // Compute the exact & PIC bytes & hashes simultaneously.  If extra pointer is not null,
-  // compute extra hash types and return in that struct.
-  void compute_function_hashes(ExtraFunctionHashData *extra=NULL) const;
+  // Compute the exact & PIC bytes & hashes simultaneously.
+  void compute_function_hashes() const;
 
   // Get the weighted PDG hash.
   std::string get_pdg_hash(unsigned int num_hash_funcs = 4);
@@ -461,14 +428,10 @@ class FunctionDescriptor : private Immobile {
   // with the non position independent parts replaced with zeros.  This is the value hashed to
   // produce the PIC hash.
   const std::string& get_pic_bytes() const;
-  const std::list< uint32_t > & get_pic_offsets() const; // which bytes in the PIC bytes were PICed out?
+  const std::vector<uint8_t>& get_pic_mask() const; // which bytes in the PIC bytes were PICed out?
   // Get the PIC hash.
   const std::string& get_pic_hash() const;
-  // Get the CPIC (tries to account for simple CFG changes):
-  const std::string& get_composite_pic_hash() const;
 
-  unsigned int get_num_blocks() const { return num_blocks; };
-  unsigned int get_num_blocks_in_cfg() const { return num_blocks_in_cfg; };
   // Narrowing accessors: the counters are stored as std::uint64_t but are reported as
   // unsigned int, so the conversion is spelled out explicitly.  Values that do not fit in
   // an unsigned int are truncated.
   unsigned int get_num_instructions() const {
     return static_cast<unsigned int>(num_instructions);
   }
   unsigned int get_num_bytes() const {
     return static_cast<unsigned int>(num_bytes);
   }
 

diff --git a/libpharos/misc.cpp b/libpharos/misc.cpp
@@ -322,6 +322,231 @@ boost::optional<rose_addr_t> insn_get_branch_target(SgAsmInstruction* insn) {
   return boost::none;
 }
 
+// All PIC diagnostics below are routed through GDEBUG.
+#define PICDEBUG GDEBUG
+
+// An AstProcessing class to identify integer offsets in instructions, for the purpose of
+// implementing the PIC algorithm.  Construct it with the instruction to examine, call
+// traverse() on that instruction, and then read the result from the candidates member.
+struct PICSearcher : public AstSimpleProcessing {
+  // Local copies of the construction parameters.
+  const DescriptorSet& ds;
+  const AddressIntervalSet* chunks;
+  const SgAsmInstruction *insn;
+  uint64_t min_addr_threshold;
+  // This is effectively the return value: a vector of pairs of <bit offset, bit size>.
+  std::vector< std::pair<uint32_t, uint32_t> > candidates;
+
+  // Parameters:
+  //   _ds                  the descriptor set describing the program (used for the memory
+  //                        map and the IP register descriptor)
+  //   _chunks              the address intervals (chunks) of the function containing _insn;
+  //                        only its address is stored, so it must outlive this object
+  //   _insn                the instruction whose operands are searched
+  //   _min_addr_threshold  values below this are never treated as addresses
+  PICSearcher(
+    const DescriptorSet& _ds,
+    const AddressIntervalSet& _chunks,
+    const SgAsmInstruction* _insn,
+    uint64_t _min_addr_threshold)
+    : ds(_ds),
+      chunks(&_chunks),
+      insn(_insn),
+      min_addr_threshold(_min_addr_threshold) {}
+
+  // Add a pair, <offset, size> in bits where a PIC'd address resides in the instruction to
+  // candidates vector.  The addr parameter is the absolute address that is being PIC'd, and
+  // the intexp parameter is the SgAsmIntegerValueExpression that represents that address in
+  // the instruction.  Nothing is added if any of the checks below rejects the address.
+  void handle_pic_offset(const rose_addr_t addr, const SgAsmIntegerValueExpression *intexp) {
+    // The primary test of whether the integer value should be PIC'd is whether the integer is
+    // really an address.  We determine this by checking whether the address is mapped into
+    // memory by the program or not.  If not, this integer is just a constant integer.
+    if (!ds.memory.is_mapped(addr)) {
+      PICDEBUG << "Instruction '" << debug_instruction(insn) << "' address 0x" << std::hex
+               << addr << std::dec << " was not PIC'd because it was not mapped." << LEND;
+      return;
+    }
+
+    // In programs mapped at base address zero, the memory map test will report that very small
+    // constants like 1, 4, and 8 are "addresses" that should be PIC'd out.  While this is a
+    // very unprincipled solution, I think it's better than incorrectly PIC'ing lots of small
+    // constants.  In the future we should test the program header for an image base address of
+    // zero and only enable this threshold if it applies?
+    if (addr < min_addr_threshold) {
+      PICDEBUG << "Instruction '" << debug_instruction(insn) << "' address 0x" << std::hex << addr
+               << std::dec << " was not PIC'd because it didn't meet minimum threshold." << LEND;
+      return;
+    }
+
+    // In general, when applying the PIC algorithm we do not want to PIC relative address
+    // references within the same chunk.  Chunks are contiguous blocks of memory assigned to
+    // the function.  A function may have more than one chunk if there's switch table data in
+    // the middle of the function for example, although most functions have one chunk.  This
+    // helps preserve local control flow, which of course makes the hash more rigid, but also
+    // more accurate (and easier to interpret when disassembled from the bytes).  In contrast,
+    // when a relative address references a different chunk, we do want to PIC that address
+    // because it's unlikely that the two chunks will be the same distance from each other in a
+    // separate compilation.  Chunks are also critically important to fn2yara signature
+    // generation where the chunk must be matched separately, in contrast to fn2hash results
+    // where the chunks are concatenated for the hash calculation.
+
+    // Chunk1 is the chunk that contains this instruction, chunk2 the one containing addr.
+    auto chunk1 = chunks->find(insn->get_address());
+    auto chunk2 = chunks->find(addr);
+
+    // If they're in the same chunk, we should NOT PIC this address... Unless the expression is
+    // literally an absolute address, in which case we should always PIC it.
+    if (chunk1 == chunk2 && addr != intexp->get_value()) {
+      PICDEBUG << "Instruction '" << debug_instruction(insn) << "' address 0x" << std::hex
+               << addr << " expr 0x" << intexp->get_value() << std::dec
+               << " was not PIC'd because it referenced the same chunk." << LEND;
+      return;
+    }
+
+    // Now for a few checks on the bit offsets and sizes returned by ROSE.  Sizes and offsets
+    // should be positive and byte aligned.  ROSE occasionally returns unexpected values
+    // (e.g. size of zero) when it's difficult to determine which bits represent the operand.
+    // The byte alignment filtering is because our PIC algorithm currently only works at the
+    // byte level, but we could probably improve this to support nibbles in the future.  If
+    // the size or offset is invalid, we're not going to PIC this address.
+    const unsigned short off = intexp->get_bitOffset();
+    const unsigned short sz = intexp->get_bitSize();
+    if (sz == 0 || off == 0 || sz % 8 != 0 || off % 8 != 0) {
+      GWARN << "Instruction '" << debug_instruction(insn)
+            << "' has suspicious PIC properties: size=" << sz << " offset=" << off << LEND;
+      return;
+    }
+
+    // If the instruction is not reasonably sized, we're not going to PIC this address.  The
+    // size is kept at its full width so that an absurdly large value cannot be truncated into
+    // the accepted range.
+    const size_t insnsz = insn->get_rawBytes().size();
+    if (insnsz > 17) {
+      GWARN << "Instruction '" << debug_instruction(insn)
+            << "' has a suspicious size of " << insnsz << " bytes." << LEND;
+      return;
+    }
+
+    // The size and the offset should both be contained within the instruction, and if they're
+    // not we're not going to PIC this address.
+    if (static_cast<size_t>(off) + sz > insnsz * 8) {
+      GWARN << "Instruction '" << debug_instruction(insn)
+            << "' has an expression that doesn't fit in the instruction. size=" << insnsz
+            << " exp_size=" << sz << " exp_offset=" << off << LEND;
+      return;
+    }
+
+    // std::hex is sticky, so switch back to decimal before printing the size and offset.
+    // This keeps them consistent with the warnings above and avoids leaving the stream in
+    // hex mode.
+    PICDEBUG << "Instruction '" << debug_instruction(insn) << "' has a PIC'd address (0x"
+             << std::hex << addr << std::dec << ") at exp_size=" << sz
+             << " exp_offset=" << off << LEND;
+
+    // Conversion to a byte mask could occur right here...
+    candidates.push_back(std::pair<uint32_t, uint32_t>(off, sz));
+  }
+
+  // Called by the traversal for every AST node under the instruction.  Integer value
+  // expressions are evaluated directly; "[ip + offset]" memory references are first
+  // converted to an absolute address.  All other nodes are ignored.
+  void visit(SgNode *node) override {
+    // If this AST node is an SgAsmIntegerValueExpression, obviously we want to consider it for
+    // PIC'ing, because that's the primary type of expression that gets PIC'd.
+    const SgAsmIntegerValueExpression *intexp = isSgAsmIntegerValueExpression(node);
+    if (intexp) {
+      uint64_t val = intexp->get_value(); // or get_absoluteValue() ?
+      handle_pic_offset(val, intexp);
+      return;
+    }
+
+    // But we also want to handle memory reference operands of the form "[rip + offset]"
+    // specially.  Specifically, the RIP and offset should be combined to produce an absolute
+    // address so that we can correctly evaluate the address for PIC'ing.  The rest of this
+    // routine matches that pattern.
+
+    // If this node is not a memory reference expression, we don't match the pattern.
+    const SgAsmMemoryReferenceExpression *mem_expr = isSgAsmMemoryReferenceExpression(node);
+    if (!mem_expr) {
+      return;
+    }
+
+    // If the address is not an add expression, we don't match the pattern.
+    const SgAsmBinaryAdd* add_expr = isSgAsmBinaryAdd(mem_expr->get_address());
+    if (!add_expr) {
+      return;
+    }
+
+    // If the left hand side is not a register expression, we don't match the pattern.
+    const SgAsmDirectRegisterExpression* reg_expr = isSgAsmDirectRegisterExpression(add_expr->get_lhs());
+    if (!reg_expr) {
+      return;
+    }
+
+    // If the register is not the IP register, we don't match the pattern (a common case).
+    if (reg_expr->get_descriptor() != ds.get_ip_reg()) {
+      return;
+    }
+
+    // If the right hand side is not an integer expression, we don't match the pattern.
+    const SgAsmIntegerValueExpression* off_expr = isSgAsmIntegerValueExpression(add_expr->get_rhs());
+    if (!off_expr) {
+      return;
+    }
+
+    // In the future we might want to check whether the segment register is the data segment register?
+    //const SgAsmDirectRegisterExpression* seg_expr = isSgAsmDirectRegisterExpression(mem_expr->segment);
+    //if (seg_expr->get_descriptor() != ds.get_ds_reg()) { return; }
+
+    // We've matched the pattern, and the offset is a relative address from the current value
+    // of RIP.  We need to convert the offset to an absolute address so that
+    // handle_pic_offset() can test whether the address is in defined memory correctly.  To do
+    // this we add the current value of RIP (which will be the address of the instruction plus
+    // the size of the instruction) to the offset in the expression.
+    // NOTE(review): this presumes get_value() yields a displacement that wraps to the right
+    // address for negative offsets -- confirm against the ROSE API.
+    rose_addr_t absolute_addr = insn->get_address() + insn->get_size() + off_expr->get_value();
+    handle_pic_offset(absolute_addr, off_expr);
+
+    // Having handled this SgAsmMemoryReferenceExpression, we're done with this node.  We'll
+    // continue to visit the sub-expressions again including the SgAsmIntegerValueExpression
+    // for the offset, which is a little inefficient but causes no harm?
+    return;
+  }
+};
+
+// Compute the PIC mask for a single instruction, i.e. which of its bytes (bits) should be
+// masked off when PIC'ing it.
+//
+// Parameters:
+//   ds                  the descriptor set that describes the program
+//   chunks              the function chunks from fd->get_address_intervals().  Chunks are
+//                       contiguous blocks of memory assigned to the function; usually there
+//                       is one per function, but sometimes more (e.g. if there's switch
+//                       table data in the middle of the function)
+//   insn                the instruction to be PIC'd
+//   min_addr_threshold  the minimum address threshold, which ensures that small constants
+//                       don't get PIC'd even if the base address of the program is zero
+//
+// Returns a vector of uint8_t with one entry per raw instruction byte, where the bits that
+// should be PIC'd are set to zero and all other bits are set to one.
+std::vector<uint8_t>
+pic_insn(const DescriptorSet &ds, const AddressIntervalSet& chunks, SgAsmInstruction* insn,
+         uint64_t min_addr_threshold)
+{
+  // Begin with nothing PIC'd: every bit of every instruction byte is kept.
+  std::vector<uint8_t> pic_mask(insn->get_rawBytes().size(), 0xff);
+
+  // Walk the instruction's AST to collect the bit ranges holding addresses to be PIC'd.
+  PICSearcher searcher(ds, chunks, insn, min_addr_threshold);
+  searcher.traverse(insn, preorder);
+
+  // Clear each candidate <bit offset, bit size> range in the mask.  Ranges may overlap, which
+  // is harmless because bits are only ever cleared.
+  for (const auto& candidate : searcher.candidates) {
+    const uint32_t first_bit = candidate.first;
+    const uint32_t past_bit = first_bit + candidate.second;  // one past the last PIC'd bit
+    const uint32_t first_byte = first_bit / 8;
+    const uint32_t past_byte = past_bit / 8;
+    const uint32_t lead_bits = first_bit % 8;
+    const uint32_t tail_bits = past_bit % 8;
+    // Bits below the start of the range, and bits at or above the end of the range.
+    const uint8_t keep_below = static_cast<uint8_t>(~(0xffu << lead_bits));
+    const uint8_t keep_above = static_cast<uint8_t>(0xffu << tail_bits);
+
+    if (first_byte == past_byte) {
+      // In this unlikely case, the bits are all in the middle of the same byte.
+      pic_mask[first_byte] &= keep_below | keep_above;
+      continue;
+    }
+
+    uint32_t cursor = first_byte;
+    if (lead_bits != 0) {
+      // The range does not start at the beginning of a byte: clear only its upper part.
+      pic_mask[cursor] &= keep_below;
+      ++cursor;
+    }
+    // Every byte wholly inside the range is PIC'd completely.
+    for (; cursor < past_byte; ++cursor) {
+      pic_mask[cursor] = 0x00;
+    }
+    if (tail_bits != 0) {
+      // The range does not end at the end of a byte: clear only its lower part.
+      pic_mask[past_byte] &= keep_above;
+    }
+  }
+
+  return pic_mask;
+}
+
 std::string insn_get_generic_category(SgAsmInstruction *insn) {
   std::string result = "UNCAT"; // if we haven't singled something out yet...
   // this code horribly x86 specific for now, but later needs to be able to handle ARM, etc.

diff --git a/libpharos/misc.hpp b/libpharos/misc.hpp
@@ -299,6 +299,10 @@ rose_addr_t insn_get_jump_deref(SgAsmInstruction* insn);
 const SgAsmInstruction* last_insn_in_block(const SgAsmBlock* bb);
 const SgAsmX86Instruction* last_x86insn_in_block(const SgAsmBlock* bb);
 
+std::vector<uint8_t> pic_insn(
+  const DescriptorSet &ds, const AddressIntervalSet& chunks,
+  SgAsmInstruction* insn, uint64_t min_addr_threshold);
+
 // Returns a descriptive string for a generic category of instructions, sort of along the lines
 // of the breakdowns in the Intel manuals Vol 1 Ch 5 (mostly matching):
 //   TRANSFER (mov, push. xchg), ARITHMETIC (add, sub, lea), LOGIC (and, or, xor, shl, ror),