From eb24363d34eccbc4ea53ab81f3167d8e041a5f56 Mon Sep 17 00:00:00 2001
From: Takeshi Yoneda
Date: Thu, 23 May 2024 16:12:40 +0900
Subject: [PATCH] ssa: empirically faster passRedundantPhiEliminationOpt (#2214)

Signed-off-by: Takeshi Yoneda
---
 internal/engine/wazevo/ssa/pass.go      | 11 ++++++++---
 internal/engine/wazevo/ssa/pass_test.go |  3 +++
 2 files changed, 11 insertions(+), 3 deletions(-)

diff --git a/internal/engine/wazevo/ssa/pass.go b/internal/engine/wazevo/ssa/pass.go
index 0dfd132cf0..c7ebb15218 100644
--- a/internal/engine/wazevo/ssa/pass.go
+++ b/internal/engine/wazevo/ssa/pass.go
@@ -22,9 +22,9 @@ func (b *builder) RunPasses() {
 func (b *builder) runPreBlockLayoutPasses() {
 	passSortSuccessors(b)
 	passDeadBlockEliminationOpt(b)
-	passRedundantPhiEliminationOpt(b)
 	// The result of passCalculateImmediateDominators will be used by various passes below.
 	passCalculateImmediateDominators(b)
+	passRedundantPhiEliminationOpt(b)
 	passNopInstElimination(b)
 
 	// TODO: implement either conversion of irreducible CFG into reducible one, or irreducible CFG detection where we panic.
@@ -109,6 +109,8 @@ func passDeadBlockEliminationOpt(b *builder) {
 }
 
 // passRedundantPhiEliminationOpt eliminates the redundant PHIs (in our terminology, parameters of a block).
+// This requires the reverse post-order traversal to be calculated before calling this function,
+// hence passCalculateImmediateDominators must be called before this.
 func passRedundantPhiEliminationOpt(b *builder) {
 	redundantParameterIndexes := b.ints[:0] // reuse the slice from previous iterations.
 
@@ -118,11 +120,14 @@ func passRedundantPhiEliminationOpt(b *builder) {
 	// relatively small. For example, sqlite speedtest binary results in the large number of redundant PHIs,
 	// the maximum number of iteration was 22, which seems to be acceptable but not that small either since the
 	// complexity here is O(BlockNum * Iterations) at the worst case where BlockNum might be the order of thousands.
+	// -- Note --
+	// Currently, each iteration can run in any order of blocks, but it empirically converges quickly in practice when
+	// running on the reverse post-order. It might be possible to optimize this further by using the dominator tree.
 	for {
 		changed := false
-		_ = b.blockIteratorBegin() // skip entry block!
+		_ = b.blockIteratorReversePostOrderBegin() // skip entry block!
 		// Below, we intentionally use the named iteration variable name, as this comes with inevitable nested for loops!
-		for blk := b.blockIteratorNext(); blk != nil; blk = b.blockIteratorNext() {
+		for blk := b.blockIteratorReversePostOrderNext(); blk != nil; blk = b.blockIteratorReversePostOrderNext() {
 			paramNum := len(blk.params)
 
 			for paramIndex := 0; paramIndex < paramNum; paramIndex++ {
diff --git a/internal/engine/wazevo/ssa/pass_test.go b/internal/engine/wazevo/ssa/pass_test.go
index 83d33fbee4..015a10d088 100644
--- a/internal/engine/wazevo/ssa/pass_test.go
+++ b/internal/engine/wazevo/ssa/pass_test.go
@@ -167,6 +167,9 @@ blk3: () <-- (blk1,blk2)
 				ret.AsReturn(ValuesNil)
 				b.InsertInstruction(ret)
 			}
+
+			// passRedundantPhiEliminationOpt requires the reverse post-order traversal to be calculated.
+			passCalculateImmediateDominators(b)
 			return nil
 		},
 		before: `
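
For context on why reverse post-order (RPO) helps this pass: RPO visits a block only after all of its forward-edge predecessors, so a parameter proven redundant in a predecessor is already simplified by the time its successors are inspected, and the outer fixpoint loop needs fewer sweeps. The sketch below shows one way such a traversal can be computed. It is not wazero's implementation: the Block type, the reversePostOrder helper, and the toy CFG are hypothetical stand-ins invented for illustration (wazero's real iterator lives on builder and operates on its basicBlock type).

package main

import "fmt"

// Block is a hypothetical CFG node used only for this sketch; wazero's real
// basicBlock additionally carries parameters, instructions, and so on.
type Block struct {
	id    int
	succs []*Block
}

// reversePostOrder returns the blocks reachable from entry in reverse
// post-order: a DFS post-order, reversed. On a reducible CFG this visits a
// block only after all of its forward-edge predecessors.
func reversePostOrder(entry *Block) []*Block {
	var post []*Block
	seen := map[*Block]bool{}
	var dfs func(*Block)
	dfs = func(b *Block) {
		if seen[b] {
			return
		}
		seen[b] = true
		for _, s := range b.succs {
			dfs(s)
		}
		post = append(post, b) // post-order: emitted after all successors.
	}
	dfs(entry)
	for i, j := 0, len(post)-1; i < j; i, j = i+1, j-1 {
		post[i], post[j] = post[j], post[i] // reverse in place.
	}
	return post
}

func main() {
	// Toy diamond CFG: blk0 -> {blk1, blk2} -> blk3.
	b3 := &Block{id: 3}
	b1 := &Block{id: 1, succs: []*Block{b3}}
	b2 := &Block{id: 2, succs: []*Block{b3}}
	b0 := &Block{id: 0, succs: []*Block{b1, b2}}

	for _, b := range reversePostOrder(b0) {
		fmt.Printf("blk%d ", b.id)
	}
	fmt.Println() // Output: blk0 blk2 blk1 blk3
}

On the diamond CFG this prints a valid RPO such as blk0 blk2 blk1 blk3: blk3 is visited only after both blk1 and blk2, which is exactly the property the fixpoint loop in passRedundantPhiEliminationOpt benefits from, and why the patch computes the traversal (via passCalculateImmediateDominators) before running the pass.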