diff --git a/compiler/plugins/target/AMD-AIE/aievec/CMakeLists.txt b/compiler/plugins/target/AMD-AIE/aievec/CMakeLists.txt
index d9968222c..6bf43c709 100644
--- a/compiler/plugins/target/AMD-AIE/aievec/CMakeLists.txt
+++ b/compiler/plugins/target/AMD-AIE/aievec/CMakeLists.txt
@@ -83,6 +83,9 @@ iree_cc_library(
     MLIREmitCDialect
     ::AIEVecDialectIR
     ::AIEVecXLLVMOpsGen
+    iree-amd-aie::aie_runtime::iree_aie_runtime_static
+    iree::compiler::Dialect::HAL::IR
+    iree::compiler::Dialect::HAL::IR::HALDialect
 )
 
 add_subdirectory(test)
diff --git a/compiler/plugins/target/AMD-AIE/aievec/Passes.h b/compiler/plugins/target/AMD-AIE/aievec/Passes.h
index c1dabcf67..01b7d9232 100644
--- a/compiler/plugins/target/AMD-AIE/aievec/Passes.h
+++ b/compiler/plugins/target/AMD-AIE/aievec/Passes.h
@@ -22,15 +22,17 @@ namespace mlir::iree_compiler::aievec {
 
 /**
- * Append pass(es) for canonicalizing operations in the vector dialect to a form
+ * Append passes for canonicalizing operations in the vector dialect to a form
  * that can be lowered to the AIEVec dialect.
  */
 void buildCanonicalizeVectorForAIEVec(mlir::OpPassManager &);
 
 /**
- * A pass containing patterns for canonicalizing operations in the vector
+ * A pass containing some patterns for canonicalizing operations in the vector
  * dialect to a form that can be lowered to the AIEVec dialect. This pass is
- * named `canonicalize-vector-for-aievec`.
+ * named `canonicalize-vector-for-aievec`. To ensure all required vector dialect
+ * canonicalizations take place, PassManagers should use
+ * `buildCanonicalizeVectorForAIEVec`.
  */
 std::unique_ptr<::mlir::Pass> createCanonicalizeVectorForAIEVecPass();
 
@@ -39,6 +41,54 @@ std::unique_ptr<::mlir::Pass> createCanonicalizeVectorForAIEVecPass();
  */
 void registerCanonicalizeVectorForAIEVecPass();
 
+/**
+ * This pass ensures that reads from AIE tile memory are aligned according to
+ * hardware constraints. For example, suppose we have 128 bytes in tile memory,
+ * represented in hex as:
+ *
+ *     0x00 0x01 ... 0x7E 0x7F
+ *
+ * On AIE-2, the (vector) read instructions from the tile memory into registers
+ * must be aligned to 256 bits (32 bytes). So if we want to read 64 bytes
+ * starting from 0x00 that is fine, but if we want to read 64 bytes starting
+ * from 0x01, then we cannot use a vector read instruction directly. To work
+ * around this constraint, we do the following:
+ *
+ * 1. Perform a wider read that loads 128 bytes (2x as many as we want),
+ *    starting from 0x00, into a larger register. That is, bytes 0x00-0x7F are
+ *    loaded, so we have 1 'junk' byte at the beginning and 63 'junk' bytes at
+ *    the end.
+ *
+ * 2. Extract the target bytes 0x01 ... 0x40 from the larger register into a
+ *    smaller register in 2 steps, using 2 AIE-specific instructions:
+ *
+ *    a) Extract:
+ *       https://www.xilinx.com/htmldocs/xilinx2023_2/aiengine_ml_intrinsics/intrinsics/group__intr__gpvectorconv__elem.html
+ *
+ *    b) Shift:
+ *       https://www.xilinx.com/htmldocs/xilinx2023_2/aiengine_ml_intrinsics/intrinsics/group__intr__gpvectorop__shift.html
+ *
+ * First, we use the extract instruction to split the 128 bytes read into two
+ * halves, 0x00-0x3F and 0x40-0x7F, each in its own 64-byte register. Then, we
+ * use a shift operation to combine the upper 63 bytes from the first half
+ * and the lower 1 byte from the second half into a new 64-byte register.
+ * This new register contains exactly the 64 bytes we want to read, starting
+ * from 0x01.
+ *
+ * If we want to read 32 bytes starting from 0x01, we can use a similar
+ * approach. The only consideration is that the shift operation requires
+ * 64-byte inputs, so the order of the shift and extracts is reversed.
+ *
+ * We do not currently support unaligned reads of vectors which are not
+ * 32 bytes or 64 bytes in length.
+ *
+ * TODO(newling) use this same approach to align writes to unaligned memory.
+ */
+
+std::unique_ptr<::mlir::Pass> createAlignTransferReadsPass();
+
+void registerAlignTransferReadsPass();
+
 /**
  * Append pass(es) for lowering operations in the vector dialect to the AIEVec
  * dialect. Vector dialect ops are expected to be in a canonical form
@@ -48,7 +98,7 @@ void buildLowerVectorToAIEVec(mlir::OpPassManager &pm);
 
 /**
  * A pass containing patterns for lowering operations in the vector dialect to
- * the AIEVec dialect. The pass is currently named `test-lower-vector-to-aievec`.
+ * the AIEVec dialect. The pass is currently named `test-lower-vector-to-aievec`
  */
 static std::unique_ptr<::mlir::Pass> createLowerVectorToAIEVec();
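The recombination described in the comment above can be sanity-checked with a scalar model. Below is a minimal standalone sketch (not part of the patch) in which byte arrays stand in for vector registers; `ext` and `shift` are hypothetical stand-ins for the AIE extract and shift intrinsics linked above.

```cpp
#include <array>
#include <cassert>
#include <cstddef>
#include <cstdint>

// ext(v, i): extract the i-th 64-byte half of a 128-byte register.
static std::array<uint8_t, 64> ext(const std::array<uint8_t, 128> &v,
                                   size_t i) {
  std::array<uint8_t, 64> out;
  for (size_t j = 0; j < 64; ++j) out[j] = v[64 * i + j];
  return out;
}

// shift(lhs, rhs, n): bytes n .. n+63 of the 128-byte concatenation lhs++rhs,
// i.e. the upper 64-n bytes of lhs followed by the lower n bytes of rhs.
static std::array<uint8_t, 64> shift(const std::array<uint8_t, 64> &lhs,
                                     const std::array<uint8_t, 64> &rhs,
                                     size_t n) {
  std::array<uint8_t, 64> out;
  for (size_t j = 0; j < 64; ++j)
    out[j] = (j + n < 64) ? lhs[j + n] : rhs[j + n - 64];
  return out;
}

int main() {
  // 128 bytes of tile memory, mem[i] = i.
  std::array<uint8_t, 128> mem;
  for (size_t i = 0; i < 128; ++i) mem[i] = static_cast<uint8_t>(i);

  // Unaligned request: 64 bytes starting at 0x01. The aligned base is 0x00
  // and the offset is 1, so perform one wide, aligned, 128-byte read ...
  std::array<uint8_t, 128> wide = mem;

  // ... then recombine: upper 63 bytes of the low half, lower 1 byte of the
  // high half.
  std::array<uint8_t, 64> result = shift(ext(wide, 0), ext(wide, 1), 1);

  // result now holds exactly bytes 0x01 .. 0x40.
  for (size_t j = 0; j < 64; ++j) assert(result[j] == j + 1);
  return 0;
}
```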
diff --git a/compiler/plugins/target/AMD-AIE/aievec/VectorToVectorConversions.cpp b/compiler/plugins/target/AMD-AIE/aievec/VectorToVectorConversions.cpp
index ecb036674..8e7801773 100644
--- a/compiler/plugins/target/AMD-AIE/aievec/VectorToVectorConversions.cpp
+++ b/compiler/plugins/target/AMD-AIE/aievec/VectorToVectorConversions.cpp
@@ -15,6 +15,9 @@
 #include
 #include "Passes.h"
+#include "aievec/AIEVecOps.h"
+#include "iree-amd-aie/Transforms/AMDAIEUtils.h"
+#include "iree-amd-aie/aie_runtime/iree_aie_runtime.h"
 #include "llvm/ADT/STLExtras.h"
 #include "mlir/Conversion/AffineToStandard/AffineToStandard.h"
 #include "mlir/Dialect/Affine/Analysis/LoopAnalysis.h"
@@ -300,7 +303,6 @@ class FlattenContiguousRowMajorTransferWritePattern
 }  // namespace copied_from_mlir
-
 static bool isGemmBTransposedContractionOp(vector::ContractionOp op) {
   if (op.getKind() != vector::CombiningKind::ADD) return false;
@@ -897,6 +899,7 @@ struct CanonicalizeVectorForAIEVecPass
     populateBubbleSignExtensionsLate(patterns);
     (void)applyPatternsAndFoldGreedily(op, std::move(patterns));
   }
+
   {
     RewritePatternSet patterns(context);
     patterns
@@ -914,6 +917,7 @@ struct CanonicalizeVectorForAIEVecPass
     mlir::vector::populateVectorBroadcastLoweringPatterns(patterns);
     (void)applyPatternsAndFoldGreedily(op, std::move(patterns));
   }
+
   {
     // These must run after 'populateFlattenVectorTransferPatterns' because
     // vector.shape_casts are introduced. Merging into a single pass creates
@@ -925,6 +929,170 @@ struct CanonicalizeVectorForAIEVecPass
   }
 };
 
+/// Returns one of:
+/// 1) failure, if there is definitely an error that should be propagated.
+/// 2) a new transfer_read operation that is sufficiently aligned, if the old
+///    transfer_read is determined to be insufficiently aligned and it is
+///    possible to create a new transfer_read.
+/// 3) the original transfer_read operation, otherwise.
+FailureOr<Value> getAlignedTransferRead(
+    vector::TransferReadOp readOp, IRRewriter &rewriter,
+    const AMDAIE::AMDAIEDeviceModel &deviceModel) {
+  uint32_t vectorLoadStoreAlignmentBits =
+      deviceModel.getVectorLoadStoreAlignmentBits();
+  uint32_t maxVectorSizeBits = deviceModel.getMaxVectorSizeBits();
+  uint32_t shiftOperandBits = deviceModel.getShiftOperandBits();
+
+  // Check that it's not a splat transfer read.
+  if (readOp.getPermutationMap().isConstant()) return readOp.getVector();
+
+  MLIRContext *ctx = readOp.getContext();
+  VectorType shortType = readOp.getVectorType();
+  Location loc = readOp.getLoc();
+  Value padding = readOp.getPadding();
+  ShapedType sourceType = readOp.getSource().getType();
+  Type elementType = shortType.getElementType();
+
+  if (sourceType.getRank() != 1 || shortType.getRank() != 1) {
+    return readOp.emitOpError(
+        "does not have rank-1 source and rank-1 vector type.");
+  }
+
+  uint32_t elementBits = elementType.getIntOrFloatBitWidth();
+  int64_t shortLength = shortType.getShape().back();
+  int64_t shortBits = shortLength * elementBits;
+  uint32_t alignElements = vectorLoadStoreAlignmentBits / elementBits;
+
+  rewriter.setInsertionPoint(readOp);
+
+  AffineMap moduloMap =
+      AffineMap::get(1, 0, getAffineDimExpr(0, ctx) % alignElements);
+
+  Value oldIndex = readOp.getIndices().back();
+
+  Value offset = rewriter.createOrFold<affine::AffineApplyOp>(
+      loc, moduloMap, SmallVector<Value>{oldIndex});
+
+  // If the offset is constant and zero, the read is already aligned.
+  if (auto offsetConstantOp = offset.getDefiningOp<arith::ConstantIndexOp>())
+    if (offsetConstantOp.getValue() == 0) return readOp.getVector();
+
+  // Verify that we can load a vector 2x as long as the original vector.
+  int64_t longBits = 2 * shortBits;
+  int64_t longLength = 2 * shortLength;
+  VectorType longType = VectorType::get(longLength, elementType);
+  if (longBits > maxVectorSizeBits) {
+    // Not returning failure, as it is possible that the read is already
+    // aligned, and we just couldn't prove it.
+    readOp.emitWarning()
+        << "`transfer_read` can't be aligned with a read twice "
+        << "as large because " << longBits
+        << " bits is greater than the maximum vector size of "
+        << maxVectorSizeBits << " bits.";
+
+    return readOp.getVector();
+  }
+
+  SmallVector<bool> inBounds = readOp.getInBoundsValues();
+  bool allInBounds =
+      std::all_of(inBounds.begin(), inBounds.end(), [](bool b) { return b; });
+
+  if (shortBits != shiftOperandBits / 2 && shortBits != shiftOperandBits) {
+    // Not returning failure, as it is possible that the read is already
+    // aligned, and we just couldn't prove it.
+    readOp.emitWarning() << "`transfer_read` doesn't have a vector with "
+                         << shiftOperandBits / 2 << " or " << shiftOperandBits
+                         << " bits. This case is not currently "
+                         << "handled.";
+    return readOp.getVector();
+  }
+
+  Value newIndex = rewriter.createOrFold<arith::SubIOp>(loc, oldIndex, offset);
+
+  // Create the aligned transfer read for a vector 2x as long that covers the
+  // elements of the unaligned vector.
+  Value longVec = rewriter.create<vector::TransferReadOp>(
+      loc, longType, readOp.getSource(), SmallVector<Value>{newIndex}, padding,
+      SmallVector<bool>{allInBounds});
+
+  Value elementBytes =
+      rewriter.create<arith::ConstantIndexOp>(loc, elementBits / 8);
+
+  Value offsetBytes =
+      rewriter.createOrFold<arith::MulIOp>(loc, offset, elementBytes);
+
+  Value offsetBytes_i32 = rewriter.createOrFold<arith::IndexCastOp>(
+      loc, rewriter.getIntegerType(32), offsetBytes);
+
+  Value replacement;
+  if (shortBits == shiftOperandBits) {
+    // - Extract lower 64 bytes
+    // - Extract upper 64 bytes
+    // - Apply shift to obtain new 64 bytes
+    Value low = rewriter.create<aievec::ExtOp>(loc, shortType, longVec,
+                                               rewriter.getI8IntegerAttr(0));
+    Value upp = rewriter.create<aievec::ExtOp>(loc, shortType, longVec,
+                                               rewriter.getI8IntegerAttr(1));
+    replacement = rewriter.createOrFold<aievec::ShiftOp>(loc, shortType, low,
+                                                         upp, offsetBytes_i32);
+  } else if (shortBits == shiftOperandBits / 2) {
+    // - Apply shift to obtain new 64 bytes, bottom 32 being the required ones
+    // - Extract lower 32 bytes
+    Value shift = rewriter.createOrFold<aievec::ShiftOp>(
+        loc, longType, longVec, longVec, offsetBytes_i32);
+    replacement = rewriter.create<aievec::ExtOp>(loc, shortType, shift,
+                                                 rewriter.getI8IntegerAttr(0));
+  } else {
+    assert(false &&
+           "unreachable: already checked that shortBits is equal to, or half "
+           "of, shiftOperandBits");
+  }
+
+  rewriter.replaceOp(readOp, replacement);
+
+  return replacement;
+}
+
+struct AlignTransferReadsPass
+    : public PassWrapper<AlignTransferReadsPass, OperationPass<>> {
+  StringRef getArgument() const final { return "align-transfer-reads"; }
+
+  StringRef getDescription() const final {
+    return "Align `vector.transfer_read` operations.";
+  }
+
+  void getDependentDialects(DialectRegistry &registry) const override {
+    registry.insert<affine::AffineDialect, aievec::AIEVecDialect,
+                    arith::ArithDialect, vector::VectorDialect>();
+  }
+
+  void runOnOperation() override {
+    Operation *op = getOperation();
+
+    auto targetAttr = IREE::HAL::ExecutableTargetAttr::lookup(op);
+    std::optional<AMDAIE::AMDAIEDevice> maybeDevice =
+        mlir::iree_compiler::AMDAIE::getConfigAMDAIEDevice(targetAttr);
+    if (!maybeDevice) {
+      op->emitOpError()
+          << "has no AMDAIEDevice in the target attribute configuration. This "
+             "device-specific information is required to determine what vector "
+             "sizes and alignments are supported.";
+      return signalPassFailure();
+    }
+    AMDAIE::AMDAIEDeviceModel deviceModel =
+        AMDAIE::getDeviceModel(maybeDevice.value());
+
+    IRRewriter rewriter(&getContext());
+    op->walk([&](vector::TransferReadOp transferReadOp) {
+      if (failed(
+              getAlignedTransferRead(transferReadOp, rewriter, deviceModel))) {
+        signalPassFailure();
+      }
+    });
+  }
+};
+
 struct DetectNonCanonicalOpsPass
     : public PassWrapper<DetectNonCanonicalOpsPass, OperationPass<>> {
   StringRef getArgument() const final {
@@ -943,7 +1111,7 @@ struct DetectNonCanonicalOpsPass
   }
 
   void runOnOperation() override {
-    auto op = getOperation();
+    Operation *op = getOperation();
     MLIRContext *context = &getContext();
     RewritePatternSet patterns(context);
     ConversionTarget target(*context);
@@ -955,8 +1123,8 @@ struct DetectNonCanonicalOpsPass
 };
 
 void buildCanonicalizeVectorForAIEVec(OpPassManager &pm) {
-  // TODO: Add passes to split vectors that won't fit in registers
   pm.addPass(createCanonicalizeVectorForAIEVecPass());
+  pm.addPass(createAlignTransferReadsPass());
   pm.addPass(mlir::createCanonicalizerPass());
   pm.addPass(std::make_unique<DetectNonCanonicalOpsPass>());
 }
@@ -971,4 +1139,14 @@ void registerCanonicalizeVectorForAIEVecPass() {
   });
 }
 
+std::unique_ptr<::mlir::Pass> createAlignTransferReadsPass() {
+  return std::make_unique<AlignTransferReadsPass>();
+}
+
+void registerAlignTransferReadsPass() {
+  ::mlir::registerPass([]() -> std::unique_ptr<Pass> {
+    return std::make_unique<AlignTransferReadsPass>();
+  });
+}
+
 }  // namespace mlir::iree_compiler::aievec
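Read in isolation, the index computation that `getAlignedTransferRead` materializes as `affine.apply`, `arith.subi`, and `arith.muli` ops is a round-down to the alignment boundary plus a byte remainder for the shift. A minimal sketch, assuming the bf16 configuration exercised by the tests below (256-bit alignment, 16-bit elements); the variable names mirror the pass, and the concrete index 40 is arbitrary:

```cpp
#include <cassert>
#include <cstdint>

int main() {
  const uint32_t vectorLoadStoreAlignmentBits = 256;
  const uint32_t elementBits = 16;  // bf16
  const uint32_t alignElements =
      vectorLoadStoreAlignmentBits / elementBits;  // 16 elements

  const int64_t oldIndex = 40;                      // unaligned element index
  const int64_t offset = oldIndex % alignElements;  // affine.apply (d0 mod 16) -> 8
  const int64_t newIndex = oldIndex - offset;       // arith.subi -> 32, aligned
  const int64_t offsetBytes =
      offset * elementBits / 8;  // arith.muli -> 16, the aievec.shift amount

  assert(newIndex % alignElements == 0);  // wide read starts on a boundary
  assert(newIndex + offset == oldIndex);  // shifting recovers the request
  assert(offsetBytes == 16);
  return 0;
}
```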
diff --git a/compiler/plugins/target/AMD-AIE/aievec/test/CMakeLists.txt b/compiler/plugins/target/AMD-AIE/aievec/test/CMakeLists.txt
index 9560a848f..d9e7c36cb 100644
--- a/compiler/plugins/target/AMD-AIE/aievec/test/CMakeLists.txt
+++ b/compiler/plugins/target/AMD-AIE/aievec/test/CMakeLists.txt
@@ -1,4 +1,13 @@
-file(GLOB _mlir_files *.mlir)
+set(_mlir_files
+  align-transfer-reads.mlir
+  fold-ops.mlir
+  matmul.mlir
+  precanonicalization-aieml-llvmir.mlir
+  test-mac_elem.mlir
+  test-shuffle.mlir
+  test-srs.mlir
+  test-ups.mlir
+)
 
 iree_lit_test_suite(
   NAME
diff --git a/compiler/plugins/target/AMD-AIE/aievec/test/align-transfer-reads.mlir b/compiler/plugins/target/AMD-AIE/aievec/test/align-transfer-reads.mlir
new file mode 100644
index 000000000..b312b0022
--- /dev/null
+++ b/compiler/plugins/target/AMD-AIE/aievec/test/align-transfer-reads.mlir
@@ -0,0 +1,163 @@
+// RUN: iree-opt %s --align-transfer-reads --verify-diagnostics -split-input-file | FileCheck %s
+
+// check for the affine_map that is used to determine the slice index of the
+// original transfer_read operation:
+// CHECK-DAG: #[[MAP0:.*]] = affine_map<()[s0, s1, s2] -> (s0 * 192 + s1 * 48 + s2 * 8)>
+
+// check for the affine_map that is used to round the original slice index down
+// to the nearest multiple of the acceptable alignment (256 bits = 32 bytes =
+// 16 bf16 elements). This map computes the remainder after rounding down.
+// CHECK-DAG: #[[MAP1:.*]] = affine_map<(d0) -> (d0 mod 16)>
+
+// CHECK: func.func @test_bf16_0
+// CHECK-DAG: %[[CST:.*]] = arith.constant 0.000000e+00 : bf16
+// CHECK-DAG: %[[ALLOC:.*]] = memref.alloc() : memref<576xbf16>
+
+// check for the application of MAP0:
+// CHECK: %[[APPLY0:.*]] = affine.apply #[[MAP0]]
+
+// check for the application of MAP1 on the result of MAP0. This is the
+// remainder (number of bf16 elements).
+// CHECK: %[[REM_BF16:.*]] = affine.apply #[[MAP1]](%[[APPLY0]])
+
+// get the index of the new transfer_read. This is the original index, minus the remainder:
+// CHECK: %[[NEW_INDEX:.*]] = arith.subi %[[APPLY0]], %[[REM_BF16]]
+
+// the new transfer_read (to a vector of 64 bf16s).
+// CHECK: %[[NEW_READ:.*]] = vector.transfer_read
+// CHECK-SAME: %[[ALLOC]][%[[NEW_INDEX]]], %[[CST]]
+// CHECK-SAME: {in_bounds = [true]}
+// CHECK-SAME: memref<576xbf16>, vector<64xbf16>
+
+// compute the remainder in bytes. This is needed in aievec.shift.
+// CHECK: %[[BYTES_PER_BF16:.*]] = arith.constant 2 : index
+// CHECK: %[[REM_BYTES_INDEX:.*]] = arith.muli %[[REM_BF16]], %[[BYTES_PER_BF16]] : index
+// CHECK: %[[REM_BYTES_I32:.*]] = arith.index_cast %[[REM_BYTES_INDEX]] : index to i32
+
+// the extraction ops that copy the lower and upper halves of the 1024-bit
+// vector to two 512-bit vectors.
+// CHECK-DAG: %[[EXT0:.*]] = aievec.ext %[[NEW_READ]] {index = 0 : i8} : vector<64xbf16>, vector<32xbf16>
+// CHECK-DAG: %[[EXT1:.*]] = aievec.ext %[[NEW_READ]] {index = 1 : i8} : vector<64xbf16>, vector<32xbf16>
+
+// the shift op, which concatenates the upper bytes of EXT0 with the lower
+// bytes of EXT1.
+// CHECK: %[[SHIFT:.*]] = aievec.shift %[[EXT0]], %[[EXT1]], %[[REM_BYTES_I32]]
+// CHECK-SAME: {isAcc = false} : vector<32xbf16>, vector<32xbf16>, i32, vector<32xbf16>
+
+#map = affine_map<()[s0, s1, s2] -> (s0 * 192 + s1 * 48 + s2 * 8)>
+#executable_target_amdaie_xclbin_fb = #hal.executable.target<"amd-aie", "amdaie-xclbin-fb", {target_device = "npu1_4col", ukernels = "none"}>
+module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} {
+func.func @test_bf16_0(%arg0: index, %arg1: index, %arg2: index) -> vector<32xbf16> {
+  %cst = arith.constant 0.000000e+00 : bf16
+  %alloc = memref.alloc() : memref<576xbf16>
+  %0 = affine.apply #map()[%arg0, %arg2, %arg1]
+  %1 = vector.transfer_read %alloc[%0], %cst {in_bounds = [true]} : memref<576xbf16>, vector<32xbf16>
+  return %1 : vector<32xbf16>
+}
+}
+
+
+// -----
+
+// An equivalent test to the above, but this time with i8 elements.
+
+// CHECK-DAG: #[[MAP0:.*]] = affine_map<()[s0] -> (s0 * 8)>
+// CHECK-DAG: #[[MAP1:.*]] = affine_map<(d0) -> (d0 mod 32)>
+// CHECK: func.func @test_i8_64bytes
+// CHECK-DAG: %[[C0_I8:.*]] = arith.constant 0 : i8
+// CHECK-DAG: %[[ALLOC:.*]] = memref.alloc() : memref<576xi8>
+// CHECK: %[[APPLY0:.*]] = affine.apply #[[MAP0]]
+// CHECK: %[[REM:.*]] = affine.apply #[[MAP1]](%[[APPLY0]])
+// CHECK: %[[NEW_INDEX:.*]] = arith.subi %[[APPLY0]], %[[REM]]
+// CHECK: %[[NEW_READ:.*]] = vector.transfer_read
+// CHECK-SAME: %[[ALLOC]][%[[NEW_INDEX]]], %[[C0_I8]]
+// CHECK-SAME: {in_bounds = [true]}
+// CHECK-SAME: memref<576xi8>, vector<128xi8>
+// CHECK-DAG: %[[EXT0:.*]] = aievec.ext %[[NEW_READ]] {index = 0 : i8} : vector<128xi8>, vector<64xi8>
+// CHECK-DAG: %[[EXT1:.*]] = aievec.ext %[[NEW_READ]] {index = 1 : i8} : vector<128xi8>, vector<64xi8>
+// CHECK-DAG: %[[REM_I32:.*]] = arith.index_cast %[[REM]] : index to i32
+// CHECK: %[[SHIFT:.*]] = aievec.shift %[[EXT0]], %[[EXT1]], %[[REM_I32]]
+// CHECK-SAME: {isAcc = false} : vector<64xi8>, vector<64xi8>, i32, vector<64xi8>
+// CHECK: return %[[SHIFT]] : vector<64xi8>
+
+#map = affine_map<()[s0] -> (s0 * 8)>
+#executable_target_amdaie_xclbin_fb = #hal.executable.target<"amd-aie", "amdaie-xclbin-fb", {target_device = "npu1_4col", ukernels = "none"}>
+module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} {
+func.func @test_i8_64bytes(%arg0: index) -> vector<64xi8> {
+  %cst = arith.constant 0 : i8
+  %alloc = memref.alloc() : memref<576xi8>
+  %0 = affine.apply #map()[%arg0]
+  %1 = vector.transfer_read %alloc[%0], %cst {in_bounds = [true]} : memref<576xi8>, vector<64xi8>
+  return %1 : vector<64xi8>
+}
+}
+
+// -----
+
+// An equivalent test to the above, but this time the vector only has 32 i8s
+// (32 bytes). In this case, the order of the extraction op and shift op is
+// reversed, because shift expects 64-byte input vectors.
+
+// CHECK-LABEL: func.func @test_i8_32bytes
+// CHECK: %[[READ:.*]] = vector.transfer_read
+// CHECK: %[[SHIFT:.*]] = aievec.shift %[[READ]], %[[READ]]
+// CHECK-SAME: vector<64xi8>, vector<64xi8>, i32, vector<64xi8>
+// CHECK: %[[EXT:.*]] = aievec.ext %[[SHIFT]] {index = 0 : i8}
+// CHECK-SAME: vector<64xi8>, vector<32xi8>
+// CHECK: return %[[EXT]] : vector<32xi8>
+
+#map = affine_map<()[s0] -> (s0 * 8)>
+#executable_target_amdaie_xclbin_fb = #hal.executable.target<"amd-aie", "amdaie-xclbin-fb", {target_device = "npu1_4col", ukernels = "none"}>
+module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} {
+func.func @test_i8_32bytes(%arg0: index) -> vector<32xi8> {
+  %cst = arith.constant 0 : i8
+  %alloc = memref.alloc() : memref<576xi8>
+  %0 = affine.apply #map()[%arg0]
+  %1 = vector.transfer_read %alloc[%0], %cst {in_bounds = [true]} : memref<576xi8>, vector<32xi8>
+  return %1 : vector<32xi8>
+}
+}
+
+// -----
+
+#map = affine_map<()[s0] -> (s0 * 8)>
+// expected-error @+1 {{'builtin.module' op has no AMDAIEDevice in the target attribute configuration.}}
+module {
+func.func @test_i8_32bytes(%arg0: index) -> vector<32xi8> {
+  %cst = arith.constant 0 : i8
+  %alloc = memref.alloc() : memref<576xi8>
+  %0 = affine.apply #map()[%arg0]
+  %1 = vector.transfer_read %alloc[%0], %cst {in_bounds = [true]} : memref<576xi8>, vector<32xi8>
+  return %1 : vector<32xi8>
+}
+}
+
+// -----
+
+// An equivalent test to the above, but this time the vector only has 16 i8s
+// (16 bytes). We currently don't support this.
+#executable_target_amdaie_xclbin_fb = #hal.executable.target<"amd-aie", "amdaie-xclbin-fb", {target_device = "npu1_4col", ukernels = "none"}>
+module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} {
+func.func @test_i8_16bytes(%arg0: index) -> vector<16xi8> {
+  %cst = arith.constant 0 : i8
+  %alloc = memref.alloc() : memref<576xi8>
+  // expected-warning @+1 {{`transfer_read` doesn't have a vector with 256 or 512 bits. This case is not currently handled}}
+  %1 = vector.transfer_read %alloc[%arg0], %cst {in_bounds = [true]} : memref<576xi8>, vector<16xi8>
+  return %1 : vector<16xi8>
+}
+}
+
+// -----
+
+// An equivalent test to the above, but this time the vector has 128 bytes.
+
+#executable_target_amdaie_xclbin_fb = #hal.executable.target<"amd-aie", "amdaie-xclbin-fb", {target_device = "npu1_4col", ukernels = "none"}>
+module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} {
+func.func @test_i8_128bytes(%arg0: index) -> vector<128xi8> {
+  %cst = arith.constant 0 : i8
+  %alloc = memref.alloc() : memref<576xi8>
+  // expected-warning @+1 {{`transfer_read` can't be aligned with a read twice as large because 2048 bits is greater than the maximum vector size of 1024 bits.}}
+  %1 = vector.transfer_read %alloc[%arg0], %cst {in_bounds = [true]} : memref<576xi8>, vector<128xi8>
+  return %1 : vector<128xi8>
+}
+}
+
+
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/PluginRegistration.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/PluginRegistration.cpp
index 9fe88cd64..1e6a56605 100644
--- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/PluginRegistration.cpp
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/PluginRegistration.cpp
@@ -35,6 +35,7 @@ struct AMDAIESession
     AMDAIE::registerAIRConversionPasses();
     AMDAIE::registerAIRTransformPasses();
     aievec::registerConvertAIEVecToLLVMPass();
+    aievec::registerAlignTransferReadsPass();
     aievec::registerCanonicalizeVectorForAIEVecPass();
     aievec::registerLowerVectorToAIEVecPass();
   }
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.cpp
index 481591212..d30d93e3d 100644
--- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.cpp
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.cpp
@@ -479,8 +479,6 @@ void addConvDecomposePassPipeline(OpPassManager &funcPassManager,
   funcPassManager.addPass(mlir::createLinalgFoldUnitExtentDimsPass(opts));
 
   // Vectorization passes
-  // FIXME(newling) https://github.com/nod-ai/iree-amd-aie/issues/820
-  enableVectorizationPasses = false;
   appendVectorizationToPipeline(funcPassManager, enableVectorizationPasses);
   funcPassManager.addPass(createCanonicalizerPass());
 
diff --git a/runtime/src/iree-amd-aie/aie_runtime/iree_aie_runtime.h b/runtime/src/iree-amd-aie/aie_runtime/iree_aie_runtime.h
index e9a227d25..b00c184e7 100644
--- a/runtime/src/iree-amd-aie/aie_runtime/iree_aie_runtime.h
+++ b/runtime/src/iree-amd-aie/aie_runtime/iree_aie_runtime.h
@@ -17,11 +17,13 @@
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/FormattedStream.h"
 #include "macros.h"
+
 // clang-format off
 #include "iree-amd-aie/aie_runtime/AMDAIEEnums.h"
 // clang-format on
 
 extern "C" {
+
 #include "xaie_hwcfg.h"
 #include "xaiengine.h"
 #include "xaiengine/xaie_device_aieml.h"
@@ -204,7 +206,9 @@ enum class AIEArch : uint8_t { AIE1 = 1, AIE2 = 2 };
  * This struct is meant to be a thin wrapper around aie-rt, which provides
  * the canonical representation/metadata for AIE devices; attributes like number
 * of locks, bds per tile, whether certain switch connections are legal or not,
- * etc.
+ * etc. In addition, this struct is meant to contain generation-specific AIE
+ * VLIW processor constants, such as the sizes of vectors supported for
+ * load/store/matmul, etc.
  *
  * This representation is parameterized by platform specific constants
  * (BASE_ADDR, COL/ROW shift, NUM_MEM_TILE_ROWS, etc.) which are available in
@@ -219,8 +223,9 @@ enum class AIEArch : uint8_t { AIE1 = 1, AIE2 = 2 };
  */
 struct AMDAIEDeviceModel {
   /// Contains additional device config parameters that can't be retrieved from
-  /// aie-rt for whatever reason. Make sure the parameters can't be retrieved in
-  /// another way before adding new fields to this struct.
+  /// aie-rt or elsewhere for whatever reason. Make sure the parameters can't be
+  /// retrieved in another way before adding new fields to this struct.
+
   struct AMDAIEDeviceConfig {
     /// Set default minimum stride bitwidth/addressing granularity to 32 bits as
     /// this is the value for all current architecture versions.
@@ -234,8 +239,26 @@ struct AMDAIEDeviceModel {
     uint8_t streamSwitchMemTileMSelMax{0};
     uint8_t streamSwitchShimArbiterMax{0};
     uint8_t streamSwitchShimMSelMax{0};
+
+    //////////////////////////////
+    // VLIW processor constants //
+    //////////////////////////////
+    /// The number of bits that L1 memory must be aligned by in order to be
+    /// loaded/stored into a register with a vector instruction. See for
+    /// example:
+    /// https://www.xilinx.com/htmldocs/xilinx2024_1/aiengine_ml_intrinsics/intrinsics/group__intr__loadstore.html
+    uint32_t vectorLoadStoreAlignmentBits{256};
+    /// The largest vector size supported. See for example:
+    /// https://www.xilinx.com/htmldocs/xilinx2024_1/aiengine_ml_intrinsics/intrinsics/group__group__datatype__vector.html
+    uint32_t maxVectorSizeBits{1024};
+    /// The number of bits that each of the two vector operands of the shift
+    /// intrinsic must have. See for example:
+    /// https://www.xilinx.com/htmldocs/xilinx2024_1/aiengine_ml_intrinsics/intrinsics/group__intr__gpvectorop__shift.html
+    uint32_t shiftOperandBits{512};
+
     AMDAIEDeviceConfig() = default;
   };
+
   XAie_Config configPtr;
   XAie_DevInst devInst;
   AMDAIEDeviceConfig deviceConfig;
@@ -328,6 +351,14 @@ struct AMDAIEDeviceModel {
   uint8_t getStreamSwitchArbiterMax(uint8_t col, uint8_t row) const;
   uint8_t getStreamSwitchMSelMax(uint8_t col, uint8_t row) const;
 
+  uint32_t getVectorLoadStoreAlignmentBits() const {
+    return deviceConfig.vectorLoadStoreAlignmentBits;
+  }
+
+  uint32_t getMaxVectorSizeBits() const {
+    return deviceConfig.maxVectorSizeBits;
+  }
+
+  uint32_t getShiftOperandBits() const { return deviceConfig.shiftOperandBits; }
+
   /// Return a map from channels to valid BD ids for the requested tile type.
   /// TODO(jornt): find these ranges in the device model.
   DenseMap> getChannelToValidBdIds(
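For context, a sketch of how client code might consume the new device-model getters; it mirrors the checks in `getAlignedTransferRead` above. `canRealign` is a hypothetical helper written for illustration (it is not part of the patch), and the namespace alias and include path follow the conventions used elsewhere in this diff.

```cpp
#include <cstdint>

#include "iree-amd-aie/aie_runtime/iree_aie_runtime.h"

namespace AMDAIE = mlir::iree_compiler::AMDAIE;

/// Returns true if a rank-1 vector of `numElements` elements, each of
/// `elementBits` bits, can be realigned with the wide-read + extract + shift
/// strategy implemented by the align-transfer-reads pass.
bool canRealign(const AMDAIE::AMDAIEDeviceModel &deviceModel,
                uint32_t numElements, uint32_t elementBits) {
  uint32_t shortBits = numElements * elementBits;
  uint32_t shiftBits = deviceModel.getShiftOperandBits();
  // The doubled (2x) read must fit in the largest vector register, and the
  // requested vector must be exactly one shift operand, or half of one.
  return 2 * shortBits <= deviceModel.getMaxVectorSizeBits() &&
         (shortBits == shiftBits || shortBits == shiftBits / 2);
}
```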