From eb24363d34eccbc4ea53ab81f3167d8e041a5f56 Mon Sep 17 00:00:00 2001
From: Takeshi Yoneda
Date: Thu, 23 May 2024 16:12:40 +0900
Subject: [PATCH] ssa: empirically faster passRedundantPhiEliminationOpt (#2214)

Signed-off-by: Takeshi Yoneda
---
 internal/engine/wazevo/ssa/pass.go      | 11 ++++++++---
 internal/engine/wazevo/ssa/pass_test.go |  3 +++
 2 files changed, 11 insertions(+), 3 deletions(-)

diff --git a/internal/engine/wazevo/ssa/pass.go b/internal/engine/wazevo/ssa/pass.go
index 0dfd132cf0..c7ebb15218 100644
--- a/internal/engine/wazevo/ssa/pass.go
+++ b/internal/engine/wazevo/ssa/pass.go
@@ -22,9 +22,9 @@ func (b *builder) RunPasses() {
 func (b *builder) runPreBlockLayoutPasses() {
 	passSortSuccessors(b)
 	passDeadBlockEliminationOpt(b)
-	passRedundantPhiEliminationOpt(b)
 	// The result of passCalculateImmediateDominators will be used by various passes below.
 	passCalculateImmediateDominators(b)
+	passRedundantPhiEliminationOpt(b)
 	passNopInstElimination(b)
 
 	// TODO: implement either conversion of irreducible CFG into reducible one, or irreducible CFG detection where we panic.
@@ -109,6 +109,8 @@ func passDeadBlockEliminationOpt(b *builder) {
 }
 
 // passRedundantPhiEliminationOpt eliminates the redundant PHIs (in our terminology, parameters of a block).
+// This requires the reverse post-order traversal to be calculated before calling this function,
+// hence passCalculateImmediateDominators must be called before this.
 func passRedundantPhiEliminationOpt(b *builder) {
 	redundantParameterIndexes := b.ints[:0] // reuse the slice from previous iterations.
 
@@ -118,11 +120,14 @@ func passRedundantPhiEliminationOpt(b *builder) {
 	// relatively small. For example, sqlite speedtest binary results in the large number of redundant PHIs,
 	// the maximum number of iteration was 22, which seems to be acceptable but not that small either since the
 	// complexity here is O(BlockNum * Iterations) at the worst case where BlockNum might be the order of thousands.
+	// -- Note --
+	// Currently, each iteration can run in any order of blocks, but it empirically converges quickly in practice when
+	// running on the reverse post-order. It might be possible to optimize this further by using the dominator tree.
 	for {
 		changed := false
-		_ = b.blockIteratorBegin() // skip entry block!
+		_ = b.blockIteratorReversePostOrderBegin() // skip entry block!
 		// Below, we intentionally use the named iteration variable name, as this comes with inevitable nested for loops!
-		for blk := b.blockIteratorNext(); blk != nil; blk = b.blockIteratorNext() {
+		for blk := b.blockIteratorReversePostOrderNext(); blk != nil; blk = b.blockIteratorReversePostOrderNext() {
 			paramNum := len(blk.params)
 
 			for paramIndex := 0; paramIndex < paramNum; paramIndex++ {
diff --git a/internal/engine/wazevo/ssa/pass_test.go b/internal/engine/wazevo/ssa/pass_test.go
index 83d33fbee4..015a10d088 100644
--- a/internal/engine/wazevo/ssa/pass_test.go
+++ b/internal/engine/wazevo/ssa/pass_test.go
@@ -167,6 +167,9 @@ blk3: () <-- (blk1,blk2)
 				ret.AsReturn(ValuesNil)
 				b.InsertInstruction(ret)
 			}
+
+			// passRedundantPhiEliminationOpt requires the reverse post-order traversal to be calculated.
+			passCalculateImmediateDominators(b)
 			return nil
 		},
 		before: `
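
For context on why reverse post-order (RPO) helps this pass: RPO visits a block only after all of its forward-edge predecessors, so a parameter proven redundant in a predecessor is already simplified by the time its successors are inspected, and the outer fixpoint loop needs fewer sweeps. The sketch below shows one way such a traversal can be computed. It is not wazero's implementation: the Block type, the reversePostOrder helper, and the toy CFG are hypothetical stand-ins invented for illustration (wazero's real iterator lives on builder and operates on its basicBlock type).

package main

import "fmt"

// Block is a hypothetical CFG node used only for this sketch; wazero's real
// basicBlock additionally carries parameters, instructions, and so on.
type Block struct {
	id    int
	succs []*Block
}

// reversePostOrder returns the blocks reachable from entry in reverse
// post-order: a DFS post-order, reversed. On a reducible CFG this visits a
// block only after all of its forward-edge predecessors.
func reversePostOrder(entry *Block) []*Block {
	var post []*Block
	seen := map[*Block]bool{}
	var dfs func(*Block)
	dfs = func(b *Block) {
		if seen[b] {
			return
		}
		seen[b] = true
		for _, s := range b.succs {
			dfs(s)
		}
		post = append(post, b) // post-order: emitted after all successors.
	}
	dfs(entry)
	for i, j := 0, len(post)-1; i < j; i, j = i+1, j-1 {
		post[i], post[j] = post[j], post[i] // reverse in place.
	}
	return post
}

func main() {
	// Toy diamond CFG: blk0 -> {blk1, blk2} -> blk3.
	b3 := &Block{id: 3}
	b1 := &Block{id: 1, succs: []*Block{b3}}
	b2 := &Block{id: 2, succs: []*Block{b3}}
	b0 := &Block{id: 0, succs: []*Block{b1, b2}}

	for _, b := range reversePostOrder(b0) {
		fmt.Printf("blk%d ", b.id)
	}
	fmt.Println() // Output: blk0 blk2 blk1 blk3
}

On the diamond CFG this prints a valid RPO such as blk0 blk2 blk1 blk3: blk3 is visited only after both blk1 and blk2, which is exactly the property the fixpoint loop in passRedundantPhiEliminationOpt benefits from, and why the patch computes the traversal (via passCalculateImmediateDominators) before running the pass.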