diff --git a/clang/include/clang/CIR/MissingFeatures.h b/clang/include/clang/CIR/MissingFeatures.h index 67547c376790..21bfe021dd1a 100644 --- a/clang/include/clang/CIR/MissingFeatures.h +++ b/clang/include/clang/CIR/MissingFeatures.h @@ -347,6 +347,7 @@ struct MissingFeatures { static bool undef() { return false; } static bool noFPClass() { return false; } static bool llvmIntrinsicElementTypeSupport() { return false; } + static bool argHasMaybeUndefAttr() { return false; } //-- Missing parts of the CIRGenModule::Release skeleton. static bool emitModuleInitializers() { return false; } diff --git a/clang/lib/CIR/Dialect/Transforms/TargetLowering/LowerFunction.cpp b/clang/lib/CIR/Dialect/Transforms/TargetLowering/LowerFunction.cpp index 023e3baf2105..59420c1d2b54 100644 --- a/clang/lib/CIR/Dialect/Transforms/TargetLowering/LowerFunction.cpp +++ b/clang/lib/CIR/Dialect/Transforms/TargetLowering/LowerFunction.cpp @@ -612,7 +612,7 @@ llvm::LogicalResult LowerFunction::buildFunctionProlog( Ptr.getLoc(), PointerType::get(STy, ptrType.getAddrSpace()), CastKind::bitcast, Ptr); } else { - cir_cconv_unreachable("NYI"); + addrToStoreInto = createTmpAlloca(*this, Ptr.getLoc(), STy); } assert(STy.getNumElements() == NumIRArgs); @@ -628,7 +628,7 @@ llvm::LogicalResult LowerFunction::buildFunctionProlog( } if (srcSize > dstSize) { - cir_cconv_unreachable("NYI"); + createMemCpy(*this, Ptr, addrToStoreInto, dstSize); } } } else { @@ -1126,9 +1126,47 @@ mlir::Value LowerFunction::rewriteCallOp(const LowerFunctionInfo &CallInfo, // Fast-isel and the optimizer generally like scalar values better than // FCAs, so we flatten them if this is safe to do for this argument. + // As an example, if we have SrcTy = struct { i32, i32, i32 }, then the + // coerced type can be STy = struct { u64, i32 }. Hence a function with + // a single argument SrcTy will be rewritten to take two arguments, + // namely u64 and i32. 
StructType STy = mlir::dyn_cast(ArgInfo.getCoerceToType()); if (STy && ArgInfo.isDirect() && ArgInfo.getCanBeFlattened()) { - cir_cconv_unreachable("NYI"); + mlir::Type SrcTy = Src.getType(); + llvm::TypeSize SrcTypeSize = LM.getDataLayout().getTypeAllocSize(SrcTy); + llvm::TypeSize DstTypeSize = LM.getDataLayout().getTypeAllocSize(STy); + + if (SrcTypeSize.isScalable()) { + cir_cconv_unreachable("NYI"); + } else { + size_t SrcSize = SrcTypeSize.getFixedValue(); + size_t DstSize = DstTypeSize.getFixedValue(); + + // Create a new temporary space and copy src in the front bits of it. + // Other bits will be left untouched. + // Note in OG, Src is of type Address, while here it is mlir::Value. + // Here we need to first create another alloca to convert it into a + // PointerType, so that we can call memcpy. + if (SrcSize < DstSize) { + auto Alloca = createTmpAlloca(*this, loc, STy); + auto SrcAlloca = createTmpAlloca(*this, loc, SrcTy); + rewriter.create(loc, Src, SrcAlloca); + createMemCpy(*this, Alloca, SrcAlloca, SrcSize); + Src = Alloca; + } else { + cir_cconv_unreachable("NYI"); + } + + assert(NumIRArgs == STy.getNumElements()); + for (unsigned I = 0; I != STy.getNumElements(); ++I) { + mlir::Value Member = rewriter.create( + loc, PointerType::get(STy.getMembers()[I]), Src, /*name=*/"", + /*index=*/I); + mlir::Value Load = rewriter.create(loc, Member); + cir_cconv_assert(!cir::MissingFeatures::argHasMaybeUndefAttr()); + IRCallArgs[FirstIRArg + I] = Load; + } + } } else { // In the simple case, just pass the coerced loaded value. 
cir_cconv_assert(NumIRArgs == 1); diff --git a/clang/lib/CIR/Dialect/Transforms/TargetLowering/Targets/X86.cpp b/clang/lib/CIR/Dialect/Transforms/TargetLowering/Targets/X86.cpp index 6e4856c42482..8d769808de70 100644 --- a/clang/lib/CIR/Dialect/Transforms/TargetLowering/Targets/X86.cpp +++ b/clang/lib/CIR/Dialect/Transforms/TargetLowering/Targets/X86.cpp @@ -182,6 +182,8 @@ void X86_64ABIInfo::classify(mlir::Type Ty, uint64_t OffsetBase, Class &Lo, return; } else if (mlir::isa(Ty)) { Current = Class::Integer; + } else if (mlir::isa(Ty)) { + Current = Class::Integer; } else if (const auto RT = mlir::dyn_cast(Ty)) { uint64_t Size = getContext().getTypeSize(Ty); @@ -397,7 +399,11 @@ mlir::Type X86_64ABIInfo::GetINTEGERTypeAtOffset(mlir::Type DestTy, // returning an 8-byte unit starting with it. See if we can safely use it. if (IROffset == 0) { // Pointers and int64's always fill the 8-byte unit. - cir_cconv_assert(!mlir::isa(DestTy) && "Ptrs are NYI"); + if (auto ptrTy = mlir::dyn_cast(DestTy)) { + if (ptrTy.getTypeSizeInBits(getDataLayout().layout, {}) == 64) + return DestTy; + cir_cconv_unreachable("NYI"); + } // If we have a 1/2/4-byte integer, we can use it only if the rest of the // goodness in the source type is just tail padding. This is allowed to @@ -406,6 +412,10 @@ mlir::Type X86_64ABIInfo::GetINTEGERTypeAtOffset(mlir::Type DestTy, // have to do this analysis on the source type because we can't depend on // unions being lowered a specific way etc. if (auto intTy = mlir::dyn_cast(DestTy)) { + // Pointers and int64's always fill the 8-byte unit. 
+ if (intTy.getWidth() == 64) + return DestTy; + if (intTy.getWidth() == 8 || intTy.getWidth() == 16 || intTy.getWidth() == 32) { unsigned BitWidth = intTy.getWidth(); diff --git a/clang/test/CIR/CallConvLowering/x86_64/basic.cpp b/clang/test/CIR/CallConvLowering/x86_64/basic.cpp index 5bef1d34f974..52014d301b9b 100644 --- a/clang/test/CIR/CallConvLowering/x86_64/basic.cpp +++ b/clang/test/CIR/CallConvLowering/x86_64/basic.cpp @@ -125,4 +125,84 @@ S1 s1(S1 arg) { // CHECK: %[[#V18:]] = cir.load %[[#V17]] : !cir.ptr, !u64i // CHECK: cir.return %[[#V18]] : !u64i return {1, 2}; -} \ No newline at end of file +} + +/// Test call conv lowering for flattened structs. /// + +struct S2 { + int x, y, z; +}; + +// COM: Function prologue + +// CHECK: cir.func @_Z2s22S2(%[[ARG0:[a-z0-9]+]]: !u64i {{.*}}, %[[ARG1:[a-z0-9]+]]: !s32i {{.*}}) -> !ty_anon_struct +// CHECK: %[[#F0:]] = cir.alloca !ty_S2_, !cir.ptr +// CHECK: %[[#F1:]] = cir.alloca !ty_anon_struct, !cir.ptr +// CHECK: %[[#F2:]] = cir.get_member %[[#F1]][0]{{.*}} : !cir.ptr -> !cir.ptr +// CHECK: cir.store %[[ARG0]], %[[#F2]] : !u64i, !cir.ptr +// CHECK: %[[#F3:]] = cir.get_member %[[#F1]][1]{{.*}} : !cir.ptr -> !cir.ptr +// CHECK: cir.store %[[ARG1]], %[[#F3]] : !s32i, !cir.ptr +// CHECK: %[[#F4:]] = cir.cast(bitcast, %[[#F1]] : !cir.ptr), !cir.ptr +// CHECK: %[[#F5:]] = cir.cast(bitcast, %[[#F0]] : !cir.ptr), !cir.ptr +// CHECK: %[[#F6:]] = cir.const #cir.int<12> : !u64i +// CHECK: cir.libc.memcpy %[[#F6]] bytes from %[[#F4]] to %[[#F5]] +S2 s2(S2 arg) { + // CHECK: %[[#F7:]] = cir.alloca !ty_S2_, !cir.ptr, ["__retval"] {alignment = 4 : i64} + // CHECK: %[[#F8:]] = cir.alloca !ty_S2_, !cir.ptr, ["agg.tmp0"] {alignment = 4 : i64} + // CHECK: %[[#F9:]] = cir.alloca !ty_S2_, !cir.ptr, ["agg.tmp1"] {alignment = 4 : i64} + // CHECK: %[[#F10:]] = cir.alloca !ty_anon_struct, !cir.ptr, ["tmp"] {alignment = 8 : i64} + // CHECK: %[[#F11:]] = cir.alloca !ty_S2_, !cir.ptr, ["tmp"] {alignment = 4 : i64} + // CHECK: %[[#F12:]] = 
cir.alloca !ty_anon_struct, !cir.ptr, ["tmp"] {alignment = 8 : i64} + // CHECK: %[[#F13:]] = cir.alloca !ty_anon_struct, !cir.ptr, ["tmp"] {alignment = 8 : i64} + + // COM: Construction of S2 { 1, 2, 3 }. + + // CHECK: %[[#F14:]] = cir.get_member %[[#F8]][0] {{.*}} : !cir.ptr -> !cir.ptr + // CHECK: %[[#F15:]] = cir.const #cir.int<1> : !s32i + // CHECK: cir.store %[[#F15]], %[[#F14]] : !s32i, !cir.ptr + // CHECK: %[[#F16:]] = cir.get_member %[[#F8]][1] {{.*}} : !cir.ptr -> !cir.ptr + // CHECK: %[[#F17:]] = cir.const #cir.int<2> : !s32i + // CHECK: cir.store %[[#F17]], %[[#F16]] : !s32i, !cir.ptr + // CHECK: %[[#F18:]] = cir.get_member %[[#F8]][2] {{.*}} : !cir.ptr -> !cir.ptr + // CHECK: %[[#F19:]] = cir.const #cir.int<3> : !s32i + // CHECK: cir.store %[[#F19]], %[[#F18]] : !s32i, !cir.ptr + + // COM: Flattening of the struct. + // COM: { i32, i32, i32 } -> { i64, i32 }. + + // CHECK: %[[#F20:]] = cir.load %[[#F8]] : !cir.ptr, !ty_S2_ + // CHECK: cir.store %[[#F20]], %[[#F11]] : !ty_S2_, !cir.ptr + // CHECK: %[[#F21:]] = cir.cast(bitcast, %[[#F11]] : !cir.ptr), !cir.ptr + // CHECK: %[[#F22:]] = cir.cast(bitcast, %[[#F10]] : !cir.ptr), !cir.ptr + // CHECK: %[[#F23:]] = cir.const #cir.int<12> : !u64i + // CHECK: cir.libc.memcpy %[[#F23]] bytes from %[[#F21]] to %[[#F22]] + + // COM: Function call. + // COM: Retrieve the two values in { i64, i32 }. 
+
+  // CHECK: %[[#F24:]] = cir.get_member %[[#F10]][0] {name = ""} : !cir.ptr -> !cir.ptr
+  // CHECK: %[[#F25:]] = cir.load %[[#F24]] : !cir.ptr, !u64i
+  // CHECK: %[[#F26:]] = cir.get_member %[[#F10]][1] {name = ""} : !cir.ptr -> !cir.ptr
+  // CHECK: %[[#F27:]] = cir.load %[[#F26]] : !cir.ptr, !s32i
+  // CHECK: %[[#F28:]] = cir.call @_Z2s22S2(%[[#F25]], %[[#F27]]) : (!u64i, !s32i) -> !ty_anon_struct
+  // CHECK: cir.store %[[#F28]], %[[#F12]] : !ty_anon_struct, !cir.ptr
+
+  // CHECK: %[[#F29:]] = cir.cast(bitcast, %[[#F12]] : !cir.ptr), !cir.ptr
+  // CHECK: %[[#F30:]] = cir.cast(bitcast, %[[#F9]] : !cir.ptr), !cir.ptr
+  // CHECK: %[[#F31:]] = cir.const #cir.int<12> : !u64i
+  // CHECK: cir.libc.memcpy %[[#F31]] bytes from %[[#F29]] to %[[#F30]]
+
+  // COM: Construct S2 { 1, 2, 3 } again.
+  // COM: It has been tested above, so no duplication here.
+
+  // COM: For return, the first two fields of S2 are also coerced.
+
+  // CHECK: %[[#F39:]] = cir.cast(bitcast, %[[#F7]] : !cir.ptr), !cir.ptr
+  // CHECK: %[[#F40:]] = cir.cast(bitcast, %[[#F13]] : !cir.ptr), !cir.ptr
+  // CHECK: %[[#F41:]] = cir.const #cir.int<12> : !u64i
+  // CHECK: cir.libc.memcpy %[[#F41]] bytes from %[[#F39]] to %[[#F40]]
+  // CHECK: %[[#F42:]] = cir.load %[[#F13]] : !cir.ptr, !ty_anon_struct
+  // CHECK: cir.return %[[#F42]] : !ty_anon_struct
+  s2({ 1, 2, 3 });
+  return { 1, 2, 3 };
+}