From 045be18f9c37c6236b9414b21c1fa6164b2af6f4 Mon Sep 17 00:00:00 2001 From: Vlad Brezae Date: Mon, 6 Nov 2023 21:12:14 +0200 Subject: [PATCH 01/45] [mono][interp] Preliminary phases for SSA computation DFS traversal of CFG to obtain list of bblocks in reverse postorder. Compute immediate dominators for each bblock in the `td->idoms` table. Compute dominance frontiers for each bblock as a bit set. These algorithms are based on `A Simple, Fast Dominance Algorithm by Keith D. Cooper, Timothy J. Harvey, and Ken Kennedy` Obtain list of global vars of interest for SSA transformation. These are vars that are used in a different bblock than the one they are declared in and for which we might end up creating phi nodes. We also compute the list of bblocks where they are declared, needed for phi node insertion. --- src/mono/mono/mini/interp/transform-opt.c | 315 +++++++++++++++++++++- src/mono/mono/mini/interp/transform.c | 15 +- src/mono/mono/mini/interp/transform.h | 14 + 3 files changed, 322 insertions(+), 22 deletions(-) diff --git a/src/mono/mono/mini/interp/transform-opt.c b/src/mono/mono/mini/interp/transform-opt.c index c6fc3a069c045d..aacacbbbe86432 100644 --- a/src/mono/mono/mini/interp/transform-opt.c +++ b/src/mono/mono/mini/interp/transform-opt.c @@ -5,6 +5,10 @@ #include "mintops.h" #include "transform.h" +/* + * VAR OFFSET ALLOCATOR + */ + // Allocates var at the offset that tos points to, also updating it. 
static int alloc_var_offset (TransformData *td, int local, gint32 *ptos) @@ -215,7 +219,6 @@ end_active_call (TransformData *td, ActiveCalls *ac, InterpInst *call) } // Data structure used for offset allocation of local vars - typedef struct { int var; gboolean is_alive; @@ -494,6 +497,10 @@ interp_alloc_offsets (TransformData *td) td->total_locals_size = ALIGN_TO (final_total_locals_size, MINT_STACK_ALIGNMENT); } +/* + * DOMINANCE COMPUTATION + */ + static GString* interp_get_bb_links (InterpBasicBlock *bb) { @@ -520,6 +527,294 @@ interp_get_bb_links (InterpBasicBlock *bb) return str; } +static void +dfs_visit (InterpBasicBlock *bb, int *pos, InterpBasicBlock **bb_array) +{ + int dfs_index = *pos; + + bb_array [dfs_index] = bb; + bb->dfs_index = dfs_index; + if (dfs_index != 0) + g_assert (bb->in_count); + *pos = dfs_index + 1; + for (int i = 0; i < bb->out_count; i++) { + InterpBasicBlock *out_bb = bb->out_bb [i]; + if (out_bb->dfs_index == -1) + dfs_visit (out_bb, pos, bb_array); + } +} + +static void +interp_compute_dfs_indexes (TransformData *td) +{ + int dfs_index = 0; + // Sort bblocks in reverse postorder + td->bblocks = (InterpBasicBlock**)mono_mempool_alloc0 (td->mempool, sizeof (InterpBasicBlock*) * td->bb_count); + g_assert (!td->entry_bb->in_count); + dfs_visit (td->entry_bb, &dfs_index, td->bblocks); + td->bblocks_count = dfs_index; + + if (td->verbose_level) { + InterpBasicBlock *bb; + g_print ("\nBASIC BLOCK GRAPH:\n"); + for (bb = td->entry_bb; bb != NULL; bb = bb->next_bb) { + GString* bb_info = interp_get_bb_links (bb); + g_print ("BB%d: DFS(%d), %s\n", bb->index, bb->dfs_index, bb_info->str); + g_string_free (bb_info, TRUE); + } + } +} + +static InterpBasicBlock* +dom_intersect (InterpBasicBlock **idoms, InterpBasicBlock *bb1, InterpBasicBlock *bb2) +{ + while (bb1 != bb2) { + while (bb1->dfs_index < bb2->dfs_index) + bb2 = idoms [bb2->dfs_index]; + while (bb2->dfs_index < bb1->dfs_index) + bb1 = idoms [bb1->dfs_index]; + } + return bb1; +} + 
+static void +interp_compute_dominators (TransformData *td) +{ + InterpBasicBlock **idoms = (InterpBasicBlock**)mono_mempool_alloc0 (td->mempool, sizeof (InterpBasicBlock*) * td->bb_count); + + idoms [0] = td->entry_bb; + gboolean changed = TRUE; + while (changed) { + changed = FALSE; + // all bblocks in reverse post order except entry + for (int i = 1; i < td->bblocks_count; i++) { + InterpBasicBlock *bb = td->bblocks [i]; + InterpBasicBlock *new_idom = NULL; + // pick candidate idom from first processed predecessor of it + int j; + for (j = 0; j < bb->in_count; j++) { + InterpBasicBlock *in_bb = bb->in_bb [j]; + if (idoms [in_bb->dfs_index]) { + new_idom = in_bb; + break; + } + } + + // intersect new_idom with dominators from the other predecessors + for (; j < bb->in_count; j++) { + InterpBasicBlock *in_bb = bb->in_bb [j]; + if (idoms [in_bb->dfs_index]) + new_idom = dom_intersect (idoms, in_bb, new_idom); + } + + // check if we obtained new idom + if (idoms [i] != new_idom) { + idoms [i] = new_idom; + changed = TRUE; + } + } + } + + td->idoms = idoms; + + // Build `dominated` bblock list for each bblock + for (int i = 1; i < td->bblocks_count; i++) { + InterpBasicBlock *bb = td->bblocks [i]; + InterpBasicBlock *idom = td->idoms [i]; + if (idom) + idom->dominated = g_slist_prepend (idom->dominated, bb); + } + + if (td->verbose_level) { + InterpBasicBlock *bb; + g_print ("\nBASIC BLOCK IDOMS:\n"); + for (bb = td->entry_bb; bb != NULL; bb = bb->next_bb) { + if (bb->dfs_index == -1) + continue; + g_print ("IDOM (BB%d) = BB%d\n", bb->index, td->idoms [bb->dfs_index]->index); + } + + g_print ("\nBASIC BLOCK DOMINATED:\n"); + for (bb = td->entry_bb; bb != NULL; bb = bb->next_bb) { + if (bb->dfs_index == -1) + continue; + if (bb->dominated) { + g_print ("DOMINATED (BB%d) = {", bb->index); + GSList *dominated = bb->dominated; + while (dominated) { + InterpBasicBlock *dominated_bb = (InterpBasicBlock*)dominated->data; + g_print (" BB%d", dominated_bb->index); + dominated 
= dominated->next; + } + g_print (" }\n"); + } + } + } +} + +static void +interp_compute_dominance_frontier (TransformData *td) +{ + int bitsize = mono_bitset_alloc_size (td->bb_count, 0); + char *mem = (char *)mono_mempool_alloc0 (td->mempool, bitsize * td->bb_count); + + for (int i = 0; i < td->bblocks_count; i++) { + td->bblocks [i]->dfrontier = mono_bitset_mem_new (mem, td->bb_count, 0); + mem += bitsize; + } + + for (int i = 0; i < td->bblocks_count; i++) { + InterpBasicBlock *bb = td->bblocks [i]; + + if (bb->in_count > 1) { + for (int j = 0; j < bb->in_count; ++j) { + InterpBasicBlock *p = bb->in_bb [j]; + + g_assert (p->dfs_index || p == td->entry_bb); + + while (p != td->idoms [bb->dfs_index]) { + mono_bitset_set_fast (p->dfrontier, bb->dfs_index); + p = td->idoms [p->dfs_index]; + } + } + } + } + + if (td->verbose_level) { + InterpBasicBlock *bb; + g_print ("\nBASIC BLOCK DFRONTIERS:\n"); + for (bb = td->entry_bb; bb != NULL; bb = bb->next_bb) { + if (bb->dfs_index == -1) + continue; + g_print ("DFRONTIER (BB%d) = {", bb->index); + int i; + mono_bitset_foreach_bit (bb->dfrontier, i, td->bb_count) { + g_print (" BB%d", td->bblocks [i]->index); + } + g_print (" }\n"); + } + } +} + +static void +interp_compute_dominance (TransformData *td) +{ + /* + * A dominator for a bblock n, is a bblock that is reached on every path to n. Dominance is transitive. + * An immediate dominator for a bblock n, is the bblock that dominates n, but doesn't dominate any other + * dominators of n, meaning it is the closest dominator to n. The dominance frontier of a node V is the set + * of nodes where the dominance stops. This means that it is the set of nodes where node V doesn't dominate + * it, but it does dominate a predecessor of it (including if the predecessor is V itself). 
+ * + * The dominance frontier is relevant for SSA computation since, for a var defined in a bblock, the DF of bblock + * represents the set of bblocks where we need to add a PHI opcode for that variable. + */ + interp_compute_dfs_indexes (td); + + interp_compute_dominators (td); + + interp_compute_dominance_frontier (td); +} + +/* + * SSA TRANSFORMATION + */ + +static void +compute_global_var_cb (TransformData *td, int *pvar, gpointer data) +{ + int var = *pvar; + InterpBasicBlock *bb = (InterpBasicBlock*)data; + InterpVar *var_data = &td->vars [var]; + // If var is used in another block than the one that it is declared then mark it as global + if (var_data->declare_bbs) { + if (var_data->declare_bbs->data != bb || var_data->declare_bbs->next) + var_data->ssa_global = TRUE; + } +} + +// We obtain the list of global vars, as well as the list of bblocks where each one of the global vars is declared. +static void +interp_compute_global_vars (TransformData *td) +{ + InterpBasicBlock *bb; + for (bb = td->entry_bb; bb != NULL; bb = bb->next_bb) { + InterpInst *ins; + for (ins = bb->first_ins; ins != NULL; ins = ins->next) { + interp_foreach_ins_svar (td, ins, bb, compute_global_var_cb); + if (mono_interp_op_dregs [ins->opcode]) { + // Save the list of bblocks where a global var is defined in + InterpVar *var_data = &td->vars [ins->dreg]; + if (!var_data->declare_bbs) + var_data->declare_bbs = g_slist_prepend (NULL, bb); + else if (!g_slist_find (var_data->declare_bbs, bb)) + var_data->declare_bbs = g_slist_prepend (var_data->declare_bbs, bb); + } + } + } + + if (td->verbose_level) { + g_print ("\nSSA GLOBALS:\n"); + for (unsigned int i = 0; i < td->vars_size; i++) { + if (td->vars [i].ssa_global) { + g_print ("DECLARE_BB (%d) = {", i); + GSList *l = td->vars [i].declare_bbs; + while (l) { + g_print (" BB%d", ((InterpBasicBlock*)l->data)->index); + l = l->next; + } + g_print (" }\n"); + } + } + } +} + +static void +insert_phi_nodes () +{ + // TODO +} + +static void 
+rename_vars () +{ + // TODO +} + +static void +interp_compute_ssa (TransformData *td) +{ + interp_compute_dominance (td); + + interp_compute_global_vars (td); + + insert_phi_nodes (); + + rename_vars (); +} + +static void +interp_exit_ssa (TransformData *td) +{ + for (unsigned int i = 0; i < td->vars_size; i++) { + if (td->vars [i].declare_bbs) { + g_slist_free (td->vars [i].declare_bbs); + td->vars [i].declare_bbs = NULL; + } + } + + for (InterpBasicBlock *bb = td->entry_bb; bb != NULL; bb = bb->next_bb) { + if (bb->dominated) { + g_slist_free (bb->dominated); + bb->dominated = NULL; + } + } +} + +/* + * BASIC BLOCK OPTIMIZATION + */ + static void mark_bb_as_dead (TransformData *td, InterpBasicBlock *bb, InterpBasicBlock *replace_bb) { @@ -544,6 +839,8 @@ mark_bb_as_dead (TransformData *td, InterpBasicBlock *bb, InterpBasicBlock *repl break; } + if (bb->dominated) + g_slist_free (bb->dominated); bb->dead = TRUE; // bb should never be used/referenced after this } @@ -2562,19 +2859,11 @@ interp_super_instructions (TransformData *td) void interp_optimize_code (TransformData *td) { - if (mono_interp_opt & INTERP_OPT_BBLOCKS) - interp_optimize_bblocks (td); - - if (mono_interp_opt & INTERP_OPT_CPROP) - MONO_TIME_TRACK (mono_interp_stats.cprop_time, interp_cprop (td)); + if (td->header->num_clauses) + return; - // After this point control optimizations on control flow can no longer happen, so we can determine - // which vars are global. This helps speed up the super instructions pass, which only operates on - // single def, single use local vars. 
- initialize_global_vars (td); + interp_compute_ssa (td); - if ((mono_interp_opt & INTERP_OPT_SUPER_INSTRUCTIONS) && - (mono_interp_opt & INTERP_OPT_CPROP)) - MONO_TIME_TRACK (mono_interp_stats.super_instructions_time, interp_super_instructions (td)); + interp_exit_ssa (td); } diff --git a/src/mono/mono/mini/interp/transform.c b/src/mono/mono/mini/interp/transform.c index 6f4a46e0b653bf..3aa894b4775000 100644 --- a/src/mono/mono/mini/interp/transform.c +++ b/src/mono/mono/mini/interp/transform.c @@ -3800,6 +3800,7 @@ interp_alloc_bb (TransformData *td) bb->native_offset = -1; bb->stack_height = -1; bb->index = td->bb_count++; + bb->dfs_index = -1; return bb; } @@ -4705,20 +4706,16 @@ generate_code (TransformData *td, MonoMethod *method, MonoMethodHeader *header, td->in_start = td->ip = header->code; end = td->ip + header->code_size; - td->cbb = td->entry_bb = (InterpBasicBlock*)mono_mempool_alloc0 (td->mempool, sizeof (InterpBasicBlock)); + td->cbb = td->entry_bb = interp_alloc_bb (td); if (td->gen_sdb_seq_points) td->basic_blocks = g_list_prepend_mempool (td->mempool, td->basic_blocks, td->cbb); - td->cbb->index = td->bb_count++; - td->cbb->native_offset = -1; td->cbb->stack_height = GPTRDIFF_TO_INT (td->sp - td->stack); - if (inlining) { - exit_bb = (InterpBasicBlock*)mono_mempool_alloc0 (td->mempool, sizeof (InterpBasicBlock)); - exit_bb->index = td->bb_count++; - exit_bb->native_offset = -1; - exit_bb->stack_height = -1; - } + if (inlining) + exit_bb = interp_alloc_bb (td); + else + td->entry_bb->il_offset = 0; il_targets = mono_bitset_mem_new ( mono_mempool_alloc0 (td->mempool, mono_bitset_alloc_size (header->code_size, 0)), diff --git a/src/mono/mono/mini/interp/transform.h b/src/mono/mono/mini/interp/transform.h index 1949698ca4a7a8..219f7defe3f67e 100644 --- a/src/mono/mono/mini/interp/transform.h +++ b/src/mono/mono/mini/interp/transform.h @@ -98,6 +98,15 @@ struct _InterpBasicBlock { gint16 out_count; InterpBasicBlock **out_bb; + /* Index into td->bblocks 
*/ + int dfs_index; + + /* Dominance frontier for this bblock */ + MonoBitSet *dfrontier; + + /* List of bblocks that are immediately dominated by this bblock */ + GSList *dominated; + /* The real native offset of this bblock, computed when emitting the instructions in the code stream */ int native_offset; /* @@ -174,6 +183,7 @@ typedef struct { int indirects; int offset; int size; + GSList *declare_bbs; union { // live_start and live_end are used by the offset allocator for optimized code int live_start; @@ -200,6 +210,7 @@ typedef struct { guint unknown_use : 1; guint local_only : 1; guint simd : 1; // We use this flag to avoid addition of align field in InterpVar, for now + guint ssa_global: 1; } InterpVar; typedef struct @@ -250,6 +261,9 @@ typedef struct GPtrArray *seq_points; InterpBasicBlock **offset_to_bb; InterpBasicBlock *entry_bb, *cbb; + InterpBasicBlock **bblocks; // ordering of bblocks in reverse postorder dfs + int bblocks_count; + InterpBasicBlock **idoms; // immediate dominator for each bblock, index from reverse postorder dfs int bb_count; MonoMemPool *mempool; MonoMemoryManager *mem_manager; From 37d98ea9e409e3cc7ff57a7459f20f6c45e83e70 Mon Sep 17 00:00:00 2001 From: Vlad Brezae Date: Mon, 6 Nov 2023 22:34:22 +0200 Subject: [PATCH 02/45] [mono][interp] Insert phi nodes For a var, we insert a phi node at each bblock in the dominance frontier of each of the bblocks where the var is defined. The MINT_PHI opcode has a variable number of sregs, one for each incoming bblock edge. Once the optimizations are finished (currently none), we just replace the phi opcodes with nop. 
--- src/mono/mono/mini/interp/mintops.def | 1 + src/mono/mono/mini/interp/transform-opt.c | 74 ++++++++++++++++++++++- src/mono/mono/mini/interp/transform.c | 9 ++- src/mono/mono/mini/interp/transform.h | 1 + 4 files changed, 81 insertions(+), 4 deletions(-) diff --git a/src/mono/mono/mini/interp/mintops.def b/src/mono/mono/mini/interp/mintops.def index 7dd9d914fa328e..da619fb4ecadd8 100644 --- a/src/mono/mono/mini/interp/mintops.def +++ b/src/mono/mono/mini/interp/mintops.def @@ -840,6 +840,7 @@ IROPDEF(MINT_TIER_PATCHPOINT_DATA, "tier_patchpoint_data", 2, 0, 0, MintOpShortI // These two opcodes are resolved to a normal MINT_MOV when emitting compacted instructions IROPDEF(MINT_MOV_SRC_OFF, "mov.src.off", 6, 1, 1, MintOpTwoShorts) IROPDEF(MINT_MOV_DST_OFF, "mov.dst.off", 6, 1, 1, MintOpTwoShorts) +IROPDEF(MINT_PHI, "phi", 2, 1, 0, MintOpNoArgs) #ifdef __DEFINED_IROPDEF__ #undef IROPDEF diff --git a/src/mono/mono/mini/interp/transform-opt.c b/src/mono/mono/mini/interp/transform-opt.c index aacacbbbe86432..ce16148dc5e00f 100644 --- a/src/mono/mono/mini/interp/transform-opt.c +++ b/src/mono/mono/mini/interp/transform-opt.c @@ -769,10 +769,66 @@ interp_compute_global_vars (TransformData *td) } } +static gboolean +bb_has_phi (InterpBasicBlock *bb, int var) +{ + InterpInst *ins = bb->first_ins; + while (ins) { + if (ins->opcode == MINT_PHI) { + if (ins->dreg == var) + return TRUE; + } else { + // if we have a phi it is at the start of the bb + return FALSE; + } + ins = ins->next; + } + return FALSE; +} + static void -insert_phi_nodes () +bb_insert_phi (TransformData *td, InterpBasicBlock *bb, int var) { - // TODO + InterpInst *phi = interp_insert_ins_bb (td, bb, NULL, MINT_PHI); + if (td->verbose_level) + g_print ("BB%d NEW_PHI %d\n", bb->index, var); + + phi->dreg = var; + phi->info.args = (int*)mono_mempool_alloc (td->mempool, (bb->in_count + 1) * sizeof (int)); + int i; + for (i = 0; i < bb->in_count; i++) + phi->info.args [i] = var; + phi->info.args [i] = -1; +} + 
+static void +insert_phi_nodes (TransformData *td) +{ + if (td->verbose_level) + g_print ("\nINSERT PHI NODES:\n"); + for (int i = 0; i < td->vars_size; i++) { + if (!td->vars [i].ssa_global) + continue; + + // For every definition of this var, we add a phi node at the start of + // all bblocks in the dominance frontier of the defining bblock. + GSList *workset = g_slist_copy (td->vars [i].declare_bbs); + while (workset) { + GSList *old_head = workset; + InterpBasicBlock *bb = (InterpBasicBlock*)workset->data; + workset = workset->next; + g_free (old_head); + int j; + mono_bitset_foreach_bit (bb->dfrontier, j, td->bb_count) { + InterpBasicBlock *bd = td->bblocks [j]; + if (!bb_has_phi (bd, i)) { + bb_insert_phi (td, bd, i); + if (!g_slist_find (workset, bd)) + workset = g_slist_prepend (workset, bd); + } + } + } + } } static void @@ -788,7 +844,7 @@ interp_compute_ssa (TransformData *td) interp_compute_global_vars (td); - insert_phi_nodes (); + insert_phi_nodes (td); rename_vars (); } @@ -796,6 +852,18 @@ interp_compute_ssa (TransformData *td) static void interp_exit_ssa (TransformData *td) { + // Remove all MINT_PHI opcodes + for (InterpBasicBlock *bb = td->entry_bb; bb != NULL; bb = bb->next_bb) { + InterpInst *ins; + for (ins = bb->first_ins; ins != NULL; ins = ins->next) { + if (ins->opcode == MINT_PHI) + ins->opcode = MINT_NOP; + else + break; + } + } + + // Free memory and restore state for (unsigned int i = 0; i < td->vars_size; i++) { if (td->vars [i].declare_bbs) { g_slist_free (td->vars [i].declare_bbs); diff --git a/src/mono/mono/mini/interp/transform.c b/src/mono/mono/mini/interp/transform.c index 3aa894b4775000..dc6b80a188f1cf 100644 --- a/src/mono/mono/mini/interp/transform.c +++ b/src/mono/mono/mini/interp/transform.c @@ -1441,7 +1441,14 @@ interp_dump_ins (InterpInst *ins, gpointer *data_items) else g_string_append_printf (str, " [nil <-"); - if (mono_interp_op_sregs [opcode] > 0) { + if (opcode == MINT_PHI) { + int *args = ins->info.args; + while 
(*args != -1) { + g_string_append_printf (str, " %d", *args); + args++; + } + g_string_append_printf (str, "],"); + } else if (mono_interp_op_sregs [opcode] > 0) { for (int i = 0; i < mono_interp_op_sregs [opcode]; i++) { if (ins->sregs [i] == MINT_CALL_ARGS_SREG) { g_string_append_printf (str, " c:"); diff --git a/src/mono/mono/mini/interp/transform.h b/src/mono/mono/mini/interp/transform.h index 219f7defe3f67e..6b2e733f799348 100644 --- a/src/mono/mono/mini/interp/transform.h +++ b/src/mono/mono/mini/interp/transform.h @@ -78,6 +78,7 @@ struct _InterpInst { InterpBasicBlock *target_bb; InterpBasicBlock **target_bb_table; InterpCallInfo *call_info; + int *args; // for variable number of args, used only for phi } info; // Variable data immediately following the dreg/sreg information. This is represented exactly // in the final code stream as in this array. From 909d2d942c2fcb6e103e46a36afb23d4fbf4b3bb Mon Sep 17 00:00:00 2001 From: Vlad Brezae Date: Tue, 7 Nov 2023 13:00:21 +0200 Subject: [PATCH 03/45] [mono][interp] Implement renaming of vars We add a new table of `InterpRenamableVar` to `TransformData`. Each `InterpVar` now has an index. If the var is renamable, this index points into the renamable vars table, otherwise it is -1. A renamable var points back to the original var. This table allows for fast traversal of the subset of vars that might be renamed, avoids storing unnecessary information in the normal var table and allows for renamed vars to point back to the original var. All renamed vars of an original var point to the same renamable var index. The renaming algorithm uses a renamed var stack, for each renamable var. This stack is initialized with the original var (this is necessary for args and il locals which start with an implicit value). 
We then recursively traverse the dominator tree in DFS fashion, creating a new var for every redefinition of a renamable var and updating all uses of a renamable var with the current definition, while keeping the renamed var stack updated with the current visible definition. A renamable var can be ssa fixed. SSA fixed vars will be renamed back to the original var when we exit SSA. All vars that are sregs for phi opcodes need to be fixed, because we simply remove phi nodes without generating any intermediary moves (which would be costly on the interpreter). All IL locals and args are currently fixed, since we expect identical structure between tiered and untiered methods at a patchpoint. When we do the SSA transformation, some vars might be left out of SSA form (currently these are vars that have indirects). These vars will not have a unique definition and optimizations will not be able to operate on them. --- src/mono/mono/mini/interp/transform-opt.c | 188 +++++++++++++++++++--- src/mono/mono/mini/interp/transform.c | 47 +++++- src/mono/mono/mini/interp/transform.h | 22 ++- 3 files changed, 234 insertions(+), 23 deletions(-) diff --git a/src/mono/mono/mini/interp/transform-opt.c b/src/mono/mono/mini/interp/transform-opt.c index ce16148dc5e00f..1930467b5a30b5 100644 --- a/src/mono/mono/mini/interp/transform-opt.c +++ b/src/mono/mono/mini/interp/transform-opt.c @@ -720,16 +720,29 @@ interp_compute_dominance (TransformData *td) * SSA TRANSFORMATION */ +static gboolean +var_is_ssa_form (TransformData *td, int var) +{ + if (td->vars [var].no_ssa) + return FALSE; + + return TRUE; +} + static void compute_global_var_cb (TransformData *td, int *pvar, gpointer data) { int var = *pvar; InterpBasicBlock *bb = (InterpBasicBlock*)data; InterpVar *var_data = &td->vars [var]; + if (!var_is_ssa_form (td, var)) + return; // If var is used in another block than the one that it is declared then mark it as global if (var_data->declare_bbs) { - if (var_data->declare_bbs->data != bb 
|| var_data->declare_bbs->next) - var_data->ssa_global = TRUE; + if (var_data->declare_bbs->data != bb || var_data->declare_bbs->next) { + int ext_index = interp_create_renamable_var (td, var); + td->renamable_vars [ext_index].ssa_global = TRUE; + } } } @@ -737,28 +750,39 @@ compute_global_var_cb (TransformData *td, int *pvar, gpointer data) static void interp_compute_global_vars (TransformData *td) { + for (int i = 0; i < td->vars_size; i++) { + if (td->vars [i].indirects > 0) + td->vars [i].no_ssa = TRUE; + else + td->vars [i].no_ssa = FALSE; + } + InterpBasicBlock *bb; for (bb = td->entry_bb; bb != NULL; bb = bb->next_bb) { InterpInst *ins; for (ins = bb->first_ins; ins != NULL; ins = ins->next) { interp_foreach_ins_svar (td, ins, bb, compute_global_var_cb); - if (mono_interp_op_dregs [ins->opcode]) { + if (mono_interp_op_dregs [ins->opcode] && var_is_ssa_form (td, ins->dreg)) { // Save the list of bblocks where a global var is defined in InterpVar *var_data = &td->vars [ins->dreg]; - if (!var_data->declare_bbs) + if (!var_data->declare_bbs) { var_data->declare_bbs = g_slist_prepend (NULL, bb); - else if (!g_slist_find (var_data->declare_bbs, bb)) - var_data->declare_bbs = g_slist_prepend (var_data->declare_bbs, bb); + } else { + interp_create_renamable_var (td, ins->dreg); + if (!g_slist_find (var_data->declare_bbs, bb)) + var_data->declare_bbs = g_slist_prepend (var_data->declare_bbs, bb); + } } } } if (td->verbose_level) { g_print ("\nSSA GLOBALS:\n"); - for (unsigned int i = 0; i < td->vars_size; i++) { - if (td->vars [i].ssa_global) { - g_print ("DECLARE_BB (%d) = {", i); - GSList *l = td->vars [i].declare_bbs; + for (unsigned int i = 0; i < td->renamable_vars_size; i++) { + if (td->renamable_vars [i].ssa_global) { + int var = td->renamable_vars [i].var_index; + g_print ("DECLARE_BB (%d) = {", var); + GSList *l = td->vars [var].declare_bbs; while (l) { g_print (" BB%d", ((InterpBasicBlock*)l->data)->index); l = l->next; @@ -806,13 +830,14 @@ 
insert_phi_nodes (TransformData *td) { if (td->verbose_level) g_print ("\nINSERT PHI NODES:\n"); - for (int i = 0; i < td->vars_size; i++) { - if (!td->vars [i].ssa_global) + for (unsigned int i = 0; i < td->renamable_vars_size; i++) { + if (!td->renamable_vars [i].ssa_global) continue; // For every definition of this var, we add a phi node at the start of // all bblocks in the dominance frontier of the defining bblock. - GSList *workset = g_slist_copy (td->vars [i].declare_bbs); + int var = td->renamable_vars [i].var_index; + GSList *workset = g_slist_copy (td->vars [var].declare_bbs); while (workset) { GSList *old_head = workset; InterpBasicBlock *bb = (InterpBasicBlock*)workset->data; @@ -821,8 +846,9 @@ insert_phi_nodes (TransformData *td) int j; mono_bitset_foreach_bit (bb->dfrontier, j, td->bb_count) { InterpBasicBlock *bd = td->bblocks [j]; - if (!bb_has_phi (bd, i)) { - bb_insert_phi (td, bd, i); + if (!bb_has_phi (bd, var)) { + td->renamable_vars [i].ssa_fixed = TRUE; + bb_insert_phi (td, bd, var); if (!g_slist_find (workset, bd)) workset = g_slist_prepend (workset, bd); } @@ -831,10 +857,104 @@ insert_phi_nodes (TransformData *td) } } +static int +get_renamed_var (TransformData *td, int var) +{ + g_assert (td->vars [var].ext_index != -1); + int renamed_var = interp_create_var (td, td->vars [var].type); + // Renamed var reference the orignal var through the ext_index + int ext_index = td->vars [var].ext_index; + td->vars [renamed_var].ext_index = ext_index; + td->renamable_vars [ext_index].ssa_stack = g_slist_prepend (td->renamable_vars [ext_index].ssa_stack, (gpointer)(gsize)renamed_var); + return renamed_var; +} + +static void +rename_ins_var_cb (TransformData *td, int *pvar, gpointer data) +{ + int var = *pvar; + int ext_index = td->vars [var].ext_index; + if (ext_index != -1) + *pvar = (int)(gsize)td->renamable_vars [ext_index].ssa_stack->data; +} + +static void +rename_phi_args_in_out_bbs (TransformData *td, InterpBasicBlock *bb) +{ + for (int i = 0; 
i < bb->out_count; i++) { + InterpBasicBlock *bb_out = bb->out_bb [i]; + + int aindex; + for (aindex = 0; aindex < bb_out->in_count; aindex++) + if (bb_out->in_bb [aindex] == bb) + break; + + for (InterpInst *ins = bb_out->first_ins; ins != NULL; ins = ins->next) { + if (ins->opcode == MINT_PHI) { + int var = ins->info.args [aindex]; + int ext_index = td->vars [var].ext_index; + GSList *stack = td->renamable_vars [ext_index].ssa_stack; + ins->info.args [aindex] = (int)(gsize)stack->data; + } else { + break; + } + } + } +} + +static void +rename_vars_in_bb (TransformData *td, InterpBasicBlock *bb) +{ + InterpInst *ins; + + // Rename vars defined with MINT_PHI + for (ins = bb->first_ins; ins != NULL; ins = ins->next) { + if (ins->opcode == MINT_PHI) + ins->dreg = get_renamed_var (td, ins->dreg); + else + break; + } + + // Use renamed definition for sources + for (; ins != NULL; ins = ins->next) { + interp_foreach_ins_svar (td, ins, NULL, rename_ins_var_cb); + if (mono_interp_op_dregs [ins->opcode] && td->vars [ins->dreg].ext_index != -1) + ins->dreg = get_renamed_var (td, ins->dreg); + } + + rename_phi_args_in_out_bbs (td, bb); + + // Rename recursively every successor of bb in the dominator tree + GSList *dominated = bb->dominated; + while (dominated) { + InterpBasicBlock *dominated_bb = (InterpBasicBlock*)dominated->data; + rename_vars_in_bb (td, dominated_bb); + dominated = dominated->next; + } + + // Pop from the stack any new vars defined in this bblock + for (ins = bb->first_ins; ins != NULL; ins = ins->next) { + if (mono_interp_op_dregs [ins->opcode]) { + int ext_index = td->vars [ins->dreg].ext_index; + if (ext_index != -1) { + GSList *prev_head = td->renamable_vars [ext_index].ssa_stack; + td->renamable_vars [ext_index].ssa_stack = prev_head->next; + g_free (prev_head); + } + } + } +} + static void -rename_vars () +rename_vars (TransformData *td) { - // TODO + for (unsigned int i = 0; i < td->renamable_vars_size; i++) { + // Initialize the ssa_stack for 
entry_bb + int var_index = td->renamable_vars [i].var_index; + td->renamable_vars [i].ssa_stack = g_slist_prepend (td->renamable_vars [i].ssa_stack, (gpointer)(gsize)var_index); + } + + rename_vars_in_bb (td, td->entry_bb); } static void @@ -846,20 +966,36 @@ interp_compute_ssa (TransformData *td) insert_phi_nodes (td); - rename_vars (); + rename_vars (td); + + if (td->verbose_level) { + g_print ("\nIR after SSA compute:\n"); + mono_interp_print_td_code (td); + } +} + +static void +revert_ssa_rename_cb (TransformData *td, int *pvar, gpointer data) +{ + int var = *pvar; + int ext_index = td->vars [var].ext_index; + if (ext_index == -1) + return; + if (td->renamable_vars [ext_index].ssa_fixed) + *pvar = td->renamable_vars [ext_index].var_index; } static void interp_exit_ssa (TransformData *td) { - // Remove all MINT_PHI opcodes + // Remove all MINT_PHI opcodes and revert ssa renaming for (InterpBasicBlock *bb = td->entry_bb; bb != NULL; bb = bb->next_bb) { InterpInst *ins; for (ins = bb->first_ins; ins != NULL; ins = ins->next) { if (ins->opcode == MINT_PHI) ins->opcode = MINT_NOP; else - break; + interp_foreach_ins_var (td, ins, NULL, revert_ssa_rename_cb); } } @@ -877,6 +1013,13 @@ interp_exit_ssa (TransformData *td) bb->dominated = NULL; } } + + for (unsigned int i = 0; i < td->renamable_vars_size; i++) { + if (td->renamable_vars [i].ssa_stack) { + g_slist_free (td->renamable_vars [i].ssa_stack); + td->renamable_vars [i].ssa_stack = NULL; + } + } } /* @@ -2933,5 +3076,10 @@ interp_optimize_code (TransformData *td) interp_compute_ssa (td); interp_exit_ssa (td); + + if (td->verbose_level) { + g_print ("\nOptimized IR:\n"); + mono_interp_print_td_code (td); + } } diff --git a/src/mono/mono/mini/interp/transform.c b/src/mono/mono/mini/interp/transform.c index dc6b80a188f1cf..a2121f9e12894a 100644 --- a/src/mono/mono/mini/interp/transform.c +++ b/src/mono/mono/mini/interp/transform.c @@ -349,6 +349,33 @@ mono_mint_type (MonoType *type) return -1; } +// This doesn't 
allocate a new var, but marks the existing var as renamable, +// allocating space for additional var data. +int +interp_create_renamable_var (TransformData *td, int var) +{ + // Check if already allocated + if (td->vars [var].ext_index != -1) + return td->vars [var].ext_index; + + if (td->renamable_vars_size == td->renamable_vars_capacity) { + td->renamable_vars_capacity *= 2; + if (td->renamable_vars_capacity == 0) + td->renamable_vars_capacity = 2; + td->renamable_vars = (InterpRenamableVar*) g_realloc (td->renamable_vars, td->renamable_vars_capacity * sizeof (InterpRenamableVar)); + } + + int ext_index = td->renamable_vars_size; + InterpRenamableVar *ext = &td->renamable_vars [ext_index]; + memset (ext, 0, sizeof (InterpRenamableVar)); + ext->var_index = var; + + td->vars [var].ext_index = ext_index; + + td->renamable_vars_size++; + + return ext_index; +} /* * These are additional locals that can be allocated as we transform the code. @@ -377,6 +404,7 @@ interp_create_var_explicit (TransformData *td, MonoType *type, int size) local->size = size; local->live_start = -1; local->bb_index = -1; + local->ext_index = -1; local->def = NULL; td->vars_size++; @@ -1514,7 +1542,6 @@ mono_interp_print_code (InterpMethod *imethod) void mono_interp_print_td_code (TransformData *td) { - g_print ("Unoptimized IR:\n"); for (InterpBasicBlock *bb = td->entry_bb; bb != NULL; bb = bb->next_bb) interp_dump_bb (bb, td->data_items); } @@ -4228,6 +4255,10 @@ interp_method_compute_offsets (TransformData *td, InterpMethod *imethod, MonoMet td->vars = (InterpVar*)g_malloc0 (num_locals * sizeof (InterpVar)); td->vars_size = num_locals; td->vars_capacity = td->vars_size; + + td->renamable_vars = (InterpRenamableVar*)g_malloc (num_locals * sizeof (InterpRenamableVar)); + td->renamable_vars_size = 0; + td->renamable_vars_capacity = num_locals; offset = 0; /* @@ -4247,10 +4278,15 @@ interp_method_compute_offsets (TransformData *td, InterpMethod *imethod, MonoMet td->vars [i].indirects = 0; 
td->vars [i].mt = mt; td->vars [i].def = NULL; + td->vars [i].ext_index = -1; size = mono_interp_type_size (type, mt, &align); td->vars [i].size = size; offset = ALIGN_TO (offset, align); td->vars [i].offset = offset; + if (td->optimized) { + int ext_index = interp_create_renamable_var (td, i); + td->renamable_vars [ext_index].ssa_fixed = TRUE; + } offset += size; } offset = ALIGN_TO (offset, MINT_STACK_ALIGNMENT); @@ -4274,7 +4310,12 @@ interp_method_compute_offsets (TransformData *td, InterpMethod *imethod, MonoMet td->vars [index].indirects = 0; td->vars [index].mt = mono_mint_type (header->locals [i]); td->vars [index].def = NULL; + td->vars [index].ext_index = -1; td->vars [index].size = size; + if (td->optimized) { + int ext_index = interp_create_renamable_var (td, index); + td->renamable_vars [ext_index].ssa_fixed = TRUE; + } // Every local takes a MINT_STACK_SLOT_SIZE so IL locals have same behavior as execution locals offset += size; } @@ -8807,8 +8848,10 @@ generate (MonoMethod *method, MonoMethodHeader *header, InterpMethod *rtm, MonoG if (td->has_localloc) interp_fix_localloc_ret (td); - if (td->verbose_level) + if (td->verbose_level) { + g_print ("\nUnoptimized IR:\n"); mono_interp_print_td_code (td); + } if (td->optimized) { interp_optimize_code (td); diff --git a/src/mono/mono/mini/interp/transform.h b/src/mono/mono/mini/interp/transform.h index 6b2e733f799348..2c5ea91aae8978 100644 --- a/src/mono/mono/mini/interp/transform.h +++ b/src/mono/mono/mini/interp/transform.h @@ -184,6 +184,7 @@ typedef struct { int indirects; int offset; int size; + int ext_index; GSList *declare_bbs; union { // live_start and live_end are used by the offset allocator for optimized code @@ -211,9 +212,20 @@ typedef struct { guint unknown_use : 1; guint local_only : 1; guint simd : 1; // We use this flag to avoid addition of align field in InterpVar, for now - guint ssa_global: 1; + guint no_ssa : 1; // Var is not in ssa form, not subject to optimizations } InterpVar; 
+typedef struct { + int var_index; + GSList *ssa_stack; + + // Var that is global and might take part in phi opcodes + guint ssa_global : 1; + // IL locals/args. Vars included in phi opcodes. All renamed vars are allocated + // to the same offset. Optimizations need to ensure there is no overlapping liveness + guint ssa_fixed : 1; +} InterpRenamableVar; + typedef struct { MonoMethod *method; @@ -247,6 +259,11 @@ typedef struct unsigned int vars_size; unsigned int vars_capacity; + // Additional information for vars that are renamable + InterpRenamableVar *renamable_vars; + unsigned int renamable_vars_size; + unsigned int renamable_vars_capacity; + int n_data_items; int max_data_items; void **data_items; @@ -464,6 +481,9 @@ interp_alloc_global_var_offset (TransformData *td, int var); int interp_create_var (TransformData *td, MonoType *type); +int +interp_create_renamable_var (TransformData *td, int var); + void interp_foreach_ins_var (TransformData *td, InterpInst *ins, gpointer data, void (*callback)(TransformData*, int*, gpointer)); From 96e92f9be8a3dae0165ea5ae1efecd60db8bcc17 Mon Sep 17 00:00:00 2001 From: Vlad Brezae Date: Tue, 7 Nov 2023 14:36:44 +0200 Subject: [PATCH 04/45] [mono][interp] Re-enable bblock optimizations Merging, reordering, removal of dead bblocks. 
--- src/mono/mono/mini/interp/transform-opt.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/mono/mono/mini/interp/transform-opt.c b/src/mono/mono/mini/interp/transform-opt.c index 1930467b5a30b5..408968780346dd 100644 --- a/src/mono/mono/mini/interp/transform-opt.c +++ b/src/mono/mono/mini/interp/transform-opt.c @@ -3073,6 +3073,9 @@ interp_optimize_code (TransformData *td) if (td->header->num_clauses) return; + if (mono_interp_opt & INTERP_OPT_BBLOCKS) + interp_optimize_bblocks (td); + interp_compute_ssa (td); interp_exit_ssa (td); From d7804e4ba21433707d14322df385d4c83f64a434 Mon Sep 17 00:00:00 2001 From: Vlad Brezae Date: Wed, 8 Nov 2023 10:40:30 +0200 Subject: [PATCH 05/45] [mono][interp] Add liveness computation so we can generate pruned SSA We have the subset of renamable vars (which might take part in phi nodes). For every bblock we compute a live_in bitset, indicating whether a renamable var is live at entry to bblock. We generate a phi node for a variable only if that variable is live at entry to the bblock. On System.Runtime.Tests suite this reduces number of phi nodes by 20x, so it seems worthwhile. 
--- src/mono/mono/mini/interp/transform-opt.c | 111 +++++++++++++++++++++- src/mono/mono/mini/interp/transform.h | 6 ++ 2 files changed, 116 insertions(+), 1 deletion(-) diff --git a/src/mono/mono/mini/interp/transform-opt.c b/src/mono/mono/mini/interp/transform-opt.c index 408968780346dd..fcc24ebae812b5 100644 --- a/src/mono/mono/mini/interp/transform-opt.c +++ b/src/mono/mono/mini/interp/transform-opt.c @@ -793,6 +793,113 @@ interp_compute_global_vars (TransformData *td) } } +static void +compute_gen_set_cb (TransformData *td, int *pvar, gpointer data) +{ + int var = *pvar; + InterpBasicBlock *bb = (InterpBasicBlock*)data; + + int ext_index = td->vars [var].ext_index; + if (ext_index == -1) + return; + + if (!td->renamable_vars [ext_index].ssa_global) + return; + + if (!mono_bitset_test_fast (bb->kill_set, ext_index)) + mono_bitset_set_fast (bb->gen_set, ext_index); +} + +// For each bblock, computes the kill set (the set of vars defined by the bblock) +// and gen set (the set of vars used by the bblock, with the definition not being +// in the bblock). 
+static void +compute_gen_kill_sets (TransformData *td) +{ + int bitsize = mono_bitset_alloc_size (td->renamable_vars_size, 0); + char *mem = (char *)mono_mempool_alloc0 (td->mempool, bitsize * td->bb_count * 4); + + for (int i = 0; i < td->bblocks_count; i++) { + InterpBasicBlock *bb = td->bblocks [i]; + + bb->gen_set = mono_bitset_mem_new (mem, td->renamable_vars_size, 0); + mem += bitsize; + bb->kill_set = mono_bitset_mem_new (mem, td->renamable_vars_size, 0); + mem += bitsize; + bb->live_in_set = mono_bitset_mem_new (mem, td->renamable_vars_size, 0); + mem += bitsize; + bb->live_out_set = mono_bitset_mem_new (mem, td->renamable_vars_size, 0); + mem += bitsize; + + for (InterpInst *ins = bb->first_ins; ins != NULL; ins = ins->next) { + interp_foreach_ins_svar (td, ins, bb, compute_gen_set_cb); + if (mono_interp_op_dregs [ins->opcode]) { + int ext_index = td->vars [ins->dreg].ext_index; + if (ext_index != -1 && td->renamable_vars [ext_index].ssa_global) + mono_bitset_set_fast (bb->kill_set, ext_index); + } + } + } +} + +// Compute live_in and live_out sets +// For a bblock, live_in contains all vars that are live at exit of bblock and not redefined, +// together with all vars used in the bblock without being defined. For a bblock, live_out set +// contains all vars that are live_in any successor. This computation starts with empty sets +// (starting to generate live vars from the gen sets) and it is run iteratively until the +// computation converges. 
+static void +recompute_live_out (TransformData *td, InterpBasicBlock *bb) +{ + for (int i = 0; i < bb->out_count; i++) { + InterpBasicBlock *sbb = bb->out_bb [i]; + + // Recompute live_in_set for each successor of bb + mono_bitset_copyto_fast (sbb->live_out_set, sbb->live_in_set); + mono_bitset_sub_fast (sbb->live_in_set, sbb->kill_set); + mono_bitset_union_fast (sbb->live_in_set, sbb->gen_set); + + // Recompute live_out_set of bb, by adding the live_in_set of each successor + mono_bitset_union_fast (bb->live_out_set, sbb->live_in_set); + } +} + +// For each bblock, compute LiveIn, LiveOut sets tracking liveness for the previously computed global vars +static void +interp_compute_pruned_ssa_liveness (TransformData *td) +{ + compute_gen_kill_sets (td); + + gboolean changed = TRUE; + while (changed) { + changed = FALSE; + for (int i = 0; i < td->bblocks_count; i++) { + InterpBasicBlock *bb = td->bblocks [i]; + guint32 prev_count = mono_bitset_count (bb->live_out_set); + recompute_live_out (td, bb); + if (prev_count != mono_bitset_count (bb->live_out_set)) + changed = TRUE; + } + } + + if (td->verbose_level) { + InterpBasicBlock *bb; + g_print ("\nBASIC BLOCK LIVENESS:\n"); + for (bb = td->entry_bb; bb != NULL; bb = bb->next_bb) { + unsigned int i; + g_print ("BB%d\n\tLIVE_IN = {", bb->index); + mono_bitset_foreach_bit (bb->live_in_set, i, td->renamable_vars_size) { + g_print (" %d", td->renamable_vars [i].var_index); + } + g_print (" }\n\tLIVE_OUT = {", bb->index); + mono_bitset_foreach_bit (bb->live_out_set, i, td->renamable_vars_size) { + g_print (" %d", td->renamable_vars [i].var_index); + } + g_print (" }\n"); + } + } +} + static gboolean bb_has_phi (InterpBasicBlock *bb, int var) { @@ -846,7 +953,7 @@ insert_phi_nodes (TransformData *td) int j; mono_bitset_foreach_bit (bb->dfrontier, j, td->bb_count) { InterpBasicBlock *bd = td->bblocks [j]; - if (!bb_has_phi (bd, var)) { + if (!bb_has_phi (bd, var) && mono_bitset_test_fast (bd->live_in_set, i)) { 
td->renamable_vars [i].ssa_fixed = TRUE; bb_insert_phi (td, bd, var); if (!g_slist_find (workset, bd)) @@ -964,6 +1071,8 @@ interp_compute_ssa (TransformData *td) interp_compute_global_vars (td); + interp_compute_pruned_ssa_liveness (td); + insert_phi_nodes (td); rename_vars (td); diff --git a/src/mono/mono/mini/interp/transform.h b/src/mono/mono/mini/interp/transform.h index 2c5ea91aae8978..1c187778e0e046 100644 --- a/src/mono/mono/mini/interp/transform.h +++ b/src/mono/mono/mini/interp/transform.h @@ -108,6 +108,12 @@ struct _InterpBasicBlock { /* List of bblocks that are immediately dominated by this bblock */ GSList *dominated; + /* Live variable analysis, for vars in locals_ext */ + MonoBitSet *gen_set; + MonoBitSet *kill_set; + MonoBitSet *live_in_set; + MonoBitSet *live_out_set; + /* The real native offset of this bblock, computed when emitting the instructions in the code stream */ int native_offset; /* From 45dd0098dad9946a5715c140d9fc0924e466149f Mon Sep 17 00:00:00 2001 From: Vlad Brezae Date: Wed, 8 Nov 2023 11:13:53 +0200 Subject: [PATCH 06/45] [mono][interp] Reduce number of fixed ssa vars Previous to this commit we were marking each IL local as fixed, since it might be part of the tiering patchpoint state. Since we now have live_in information already computed, we can mark only IL locals that are live at the start of bblock that inserts a tiering patchpoint. For methods without patchpoints we also no longer mark any fixed vars from IL locals. 
--- src/mono/mono/mini/interp/transform-opt.c | 30 +++++++++++++++++++++++ src/mono/mono/mini/interp/transform.c | 17 ++++--------- src/mono/mono/mini/interp/transform.h | 3 ++- 3 files changed, 37 insertions(+), 13 deletions(-) diff --git a/src/mono/mono/mini/interp/transform-opt.c b/src/mono/mono/mini/interp/transform-opt.c index fcc24ebae812b5..944cc92f31b899 100644 --- a/src/mono/mono/mini/interp/transform-opt.c +++ b/src/mono/mono/mini/interp/transform-opt.c @@ -757,6 +757,17 @@ interp_compute_global_vars (TransformData *td) td->vars [i].no_ssa = FALSE; } + // For locals (which are normally initlocal) and arguments, consider them already + // defined in entry_bb + for (unsigned int i = 0; i < td->vars_size; i++) { + if (td->vars [i].il_global) { + td->vars [i].declare_bbs = g_slist_prepend (NULL, td->entry_bb); + } else { + // IL globals are the first vars + break; + } + } + InterpBasicBlock *bb; for (bb = td->entry_bb; bb != NULL; bb = bb->next_bb) { InterpInst *ins; @@ -964,6 +975,23 @@ insert_phi_nodes (TransformData *td) } } +// Additional fixed vars, in addition to vars that are args to phi nodes +static void +compute_fixed_vars (TransformData *td) +{ + for (int i = 0; i < td->bblocks_count; i++) { + InterpBasicBlock *bb = td->bblocks [i]; + if (!bb->patchpoint_bb) + continue; + // All IL locals live at entry to this bb have to be fixed + for (unsigned int k = 0; k < td->renamable_vars_size; k++) { + int var_index = td->renamable_vars [k].var_index; + if (td->vars [var_index].il_global && mono_bitset_test_fast (bb->live_in_set, k)) + td->renamable_vars [k].ssa_fixed = TRUE; + } + } +} + static int get_renamed_var (TransformData *td, int var) { @@ -1075,6 +1103,8 @@ interp_compute_ssa (TransformData *td) insert_phi_nodes (td); + compute_fixed_vars (td); + rename_vars (td); if (td->verbose_level) { diff --git a/src/mono/mono/mini/interp/transform.c b/src/mono/mono/mini/interp/transform.c index a2121f9e12894a..0fcd76c2f94250 100644 --- 
a/src/mono/mono/mini/interp/transform.c +++ b/src/mono/mono/mini/interp/transform.c @@ -687,8 +687,7 @@ handle_branch (TransformData *td, int long_op, int offset) if (offset < 0 && td->sp == td->stack && !td->inlined_method) { // Backwards branch inside unoptimized method where the IL stack is empty // This is candidate for a patchpoint - if (!td->optimized) - target_bb->emit_patchpoint = TRUE; + target_bb->patchpoint_bb = TRUE; if (mono_interp_tiering_enabled () && !target_bb->patchpoint_data && td->optimized) { // The optimized imethod will store mapping from bb index to native offset so it // can resume execution in the optimized method, once we tier up in patchpoint @@ -4275,6 +4274,7 @@ interp_method_compute_offsets (TransformData *td, InterpMethod *imethod, MonoMet int mt = mono_mint_type (type); td->vars [i].type = type; td->vars [i].global = TRUE; + td->vars [i].il_global = TRUE; td->vars [i].indirects = 0; td->vars [i].mt = mt; td->vars [i].def = NULL; @@ -4283,10 +4283,6 @@ interp_method_compute_offsets (TransformData *td, InterpMethod *imethod, MonoMet td->vars [i].size = size; offset = ALIGN_TO (offset, align); td->vars [i].offset = offset; - if (td->optimized) { - int ext_index = interp_create_renamable_var (td, i); - td->renamable_vars [ext_index].ssa_fixed = TRUE; - } offset += size; } offset = ALIGN_TO (offset, MINT_STACK_ALIGNMENT); @@ -4307,15 +4303,12 @@ interp_method_compute_offsets (TransformData *td, InterpMethod *imethod, MonoMet td->vars [index].type = header->locals [i]; td->vars [index].offset = offset; td->vars [index].global = TRUE; + td->vars [index].il_global = TRUE; td->vars [index].indirects = 0; td->vars [index].mt = mono_mint_type (header->locals [i]); td->vars [index].def = NULL; td->vars [index].ext_index = -1; td->vars [index].size = size; - if (td->optimized) { - int ext_index = interp_create_renamable_var (td, index); - td->renamable_vars [ext_index].ssa_fixed = TRUE; - } // Every local takes a MINT_STACK_SLOT_SIZE so IL 
locals have same behavior as execution locals offset += size; } @@ -8302,7 +8295,7 @@ interp_compute_native_offset_estimates (TransformData *td) for (bb = td->entry_bb; bb != NULL; bb = bb->next_bb) { InterpInst *ins; bb->native_offset_estimate = noe; - if (bb->emit_patchpoint) + if (!td->optimized && bb->patchpoint_bb) noe += 2; for (ins = bb->first_ins; ins != NULL; ins = ins->next) { @@ -8648,7 +8641,7 @@ generate_compacted_code (InterpMethod *rtm, TransformData *td) if (bb->patchpoint_data) patchpoint_data_index = add_patchpoint_data (td, patchpoint_data_index, bb->native_offset, bb->index); - if (bb->emit_patchpoint) { + if (!td->optimized && bb->patchpoint_bb) { // Add patchpoint in unoptimized method *ip++ = MINT_TIER_PATCHPOINT; *ip++ = (guint16)bb->index; diff --git a/src/mono/mono/mini/interp/transform.h b/src/mono/mono/mini/interp/transform.h index 1c187778e0e046..8046e35fcf7d8d 100644 --- a/src/mono/mono/mini/interp/transform.h +++ b/src/mono/mono/mini/interp/transform.h @@ -149,7 +149,7 @@ struct _InterpBasicBlock { // InterpMethod. In the unoptimized method we will map from native offset to the bb_index while in the // optimized method we will map the bb_index to the corresponding native offset. guint patchpoint_data: 1; - guint emit_patchpoint: 1; + guint patchpoint_bb: 1; // used by jiterpreter guint backwards_branch_target: 1; guint contains_call_instruction: 1; @@ -219,6 +219,7 @@ typedef struct { guint local_only : 1; guint simd : 1; // We use this flag to avoid addition of align field in InterpVar, for now guint no_ssa : 1; // Var is not in ssa form, not subject to optimizations + guint il_global : 1; // Args and IL locals } InterpVar; typedef struct { From 00a772d347449f5c83c364dc54716d4508f5828d Mon Sep 17 00:00:00 2001 From: Vlad Brezae Date: Wed, 8 Nov 2023 12:25:33 +0200 Subject: [PATCH 07/45] [mono][interp] Add more stats tracking compilation time Remove stats registering, it has been a NOP for a while now. 
--- src/mono/mono/mini/interp/interp-internals.h | 7 +++++++ src/mono/mono/mini/interp/interp.c | 15 --------------- src/mono/mono/mini/interp/transform-opt.c | 12 ++++++------ src/mono/mono/mini/interp/transform.c | 2 +- 4 files changed, 14 insertions(+), 22 deletions(-) diff --git a/src/mono/mono/mini/interp/interp-internals.h b/src/mono/mono/mini/interp/interp-internals.h index 72ef097eac22d7..8c4fe67b002abf 100644 --- a/src/mono/mono/mini/interp/interp-internals.h +++ b/src/mono/mono/mini/interp/interp-internals.h @@ -275,6 +275,13 @@ typedef struct { typedef struct { gint64 transform_time; gint64 methods_transformed; + gint64 optimize_time; + gint64 ssa_compute_time; + gint64 ssa_compute_dominance_time; + gint64 ssa_compute_global_vars_time; + gint64 ssa_compute_pruned_liveness_time; + gint64 ssa_rename_vars_time; + gint64 optimize_bblocks_time; gint64 cprop_time; gint64 super_instructions_time; gint32 emitted_instructions; diff --git a/src/mono/mono/mini/interp/interp.c b/src/mono/mono/mini/interp/interp.c index 23bf79620a73e2..6e62cfe8844486 100644 --- a/src/mono/mono/mini/interp/interp.c +++ b/src/mono/mono/mini/interp/interp.c @@ -8682,19 +8682,6 @@ interp_cleanup (void) #endif } -static void -register_interp_stats (void) -{ - mono_counters_init (); - mono_counters_register ("Total transform time", MONO_COUNTER_INTERP | MONO_COUNTER_LONG | MONO_COUNTER_TIME, &mono_interp_stats.transform_time); - mono_counters_register ("Methods transformed", MONO_COUNTER_INTERP | MONO_COUNTER_LONG, &mono_interp_stats.methods_transformed); - mono_counters_register ("Total cprop time", MONO_COUNTER_INTERP | MONO_COUNTER_LONG | MONO_COUNTER_TIME, &mono_interp_stats.cprop_time); - mono_counters_register ("Total super instructions time", MONO_COUNTER_INTERP | MONO_COUNTER_LONG | MONO_COUNTER_TIME, &mono_interp_stats.super_instructions_time); - mono_counters_register ("Emitted instructions", MONO_COUNTER_INTERP | MONO_COUNTER_INT, &mono_interp_stats.emitted_instructions); - 
mono_counters_register ("Methods inlined", MONO_COUNTER_INTERP | MONO_COUNTER_INT, &mono_interp_stats.inlined_methods); - mono_counters_register ("Inline failures", MONO_COUNTER_INTERP | MONO_COUNTER_INT, &mono_interp_stats.inline_failures); -} - #undef MONO_EE_CALLBACK #define MONO_EE_CALLBACK(ret, name, sig) interp_ ## name, @@ -8723,8 +8710,6 @@ mono_ee_interp_init (const char *opts) mini_install_interp_callbacks (&mono_interp_callbacks); - register_interp_stats (); - #ifdef HOST_WASI debugger_enabled = mini_get_debug_options ()->mdb_optimizations; #endif diff --git a/src/mono/mono/mini/interp/transform-opt.c b/src/mono/mono/mini/interp/transform-opt.c index 944cc92f31b899..50051792bae5d7 100644 --- a/src/mono/mono/mini/interp/transform-opt.c +++ b/src/mono/mono/mini/interp/transform-opt.c @@ -1095,17 +1095,17 @@ rename_vars (TransformData *td) static void interp_compute_ssa (TransformData *td) { - interp_compute_dominance (td); + MONO_TIME_TRACK (mono_interp_stats.ssa_compute_dominance_time, interp_compute_dominance (td)); - interp_compute_global_vars (td); + MONO_TIME_TRACK (mono_interp_stats.ssa_compute_global_vars_time, interp_compute_global_vars (td)); - interp_compute_pruned_ssa_liveness (td); + MONO_TIME_TRACK (mono_interp_stats.ssa_compute_pruned_liveness_time, interp_compute_pruned_ssa_liveness (td)); insert_phi_nodes (td); compute_fixed_vars (td); - rename_vars (td); + MONO_TIME_TRACK (mono_interp_stats.ssa_rename_vars_time, rename_vars (td)); if (td->verbose_level) { g_print ("\nIR after SSA compute:\n"); @@ -3213,9 +3213,9 @@ interp_optimize_code (TransformData *td) return; if (mono_interp_opt & INTERP_OPT_BBLOCKS) - interp_optimize_bblocks (td); + MONO_TIME_TRACK (mono_interp_stats.optimize_bblocks_time, interp_optimize_bblocks (td)); - interp_compute_ssa (td); + MONO_TIME_TRACK (mono_interp_stats.ssa_compute_time, interp_compute_ssa (td)); interp_exit_ssa (td); diff --git a/src/mono/mono/mini/interp/transform.c 
b/src/mono/mono/mini/interp/transform.c index 0fcd76c2f94250..29af952c0656b5 100644 --- a/src/mono/mono/mini/interp/transform.c +++ b/src/mono/mono/mini/interp/transform.c @@ -8847,7 +8847,7 @@ generate (MonoMethod *method, MonoMethodHeader *header, InterpMethod *rtm, MonoG } if (td->optimized) { - interp_optimize_code (td); + MONO_TIME_TRACK (mono_interp_stats.optimize_time, interp_optimize_code (td)); interp_alloc_offsets (td); #if HOST_BROWSER if (mono_interp_opt & INTERP_OPT_JITERPRETER) From 7366107d4621e1b6de8c5b894a4013dda774da4e Mon Sep 17 00:00:00 2001 From: Vlad Brezae Date: Wed, 8 Nov 2023 13:33:14 +0200 Subject: [PATCH 08/45] [mono][interp] Compute live end limit for fixed ssa vars Renamed vars of the same fixed original var have the constraint that only one of them can be alive at any point, otherwise, when we revert renaming for these vars the logic is broken. Future optimizations, like copy propagation, can prolong the life of a var and we will need to ensure that this doesn't end up overlapping with another renamed var. In order to support this, for each renamed fixed ssa var, we remember bblocks where the var is live at bblock exit and, in bblocks where it is renamed, we remember the location where this happens (meaning we can extend the liveness of the previous definition up to this point). This is done together with renaming, where the necessary information is readily available. This information needs to be stored separately for each renamed var, so we create a new extended table for this, `renamed_fixed_vars`. Each renamed var of a fixed var, will have the `ext_index` point to this table. Since we still need to be able to obtain the original var, `InterpRenamedFixedVar` also has an index into the renamable var table. We hold liveness information in an int32 with 14 bits for bb_index and 18 bits for instruction index. When we first compute liveness information, each instruction that bumps the liveness index will be flagged. 
All future optimizations, as they reiterate over the code, will then check this flag to correctly recompute the liveness index (newly added instructions won't have this flag set while deleted instructions are only NOP-ed so they preserve the flag) --- src/mono/mono/mini/interp/transform-opt.c | 122 +++++++++++++++++++--- src/mono/mono/mini/interp/transform.c | 30 ++++++ src/mono/mono/mini/interp/transform.h | 32 ++++++ 3 files changed, 170 insertions(+), 14 deletions(-) diff --git a/src/mono/mono/mini/interp/transform-opt.c b/src/mono/mono/mini/interp/transform-opt.c index 50051792bae5d7..3dc8446e2c4011 100644 --- a/src/mono/mono/mini/interp/transform-opt.c +++ b/src/mono/mono/mini/interp/transform-opt.c @@ -995,11 +995,17 @@ compute_fixed_vars (TransformData *td) static int get_renamed_var (TransformData *td, int var) { - g_assert (td->vars [var].ext_index != -1); - int renamed_var = interp_create_var (td, td->vars [var].type); - // Renamed var reference the orignal var through the ext_index int ext_index = td->vars [var].ext_index; - td->vars [renamed_var].ext_index = ext_index; + g_assert (ext_index != -1); + int renamed_var = interp_create_var (td, td->vars [var].type); + + if (td->renamable_vars [ext_index].ssa_fixed) { + td->vars [renamed_var].renamed_ssa_fixed = TRUE; + interp_create_renamed_fixed_var (td, renamed_var, var); + } else { + // Renamed var reference the orignal var through the ext_index + td->vars [renamed_var].ext_index = ext_index; + } td->renamable_vars [ext_index].ssa_stack = g_slist_prepend (td->renamable_vars [ext_index].ssa_stack, (gpointer)(gsize)renamed_var); return renamed_var; } @@ -1050,11 +1056,29 @@ rename_vars_in_bb (TransformData *td, InterpBasicBlock *bb) break; } + guint32 current_liveness = bb->index << INTERP_LIVENESS_INS_INDEX_BITS; + // Use renamed definition for sources for (; ins != NULL; ins = ins->next) { + if (interp_ins_is_nop (ins)) + continue; + ins->flags |= INTERP_INST_FLAG_LIVENESS_MARKER; + current_liveness++; + 
interp_foreach_ins_svar (td, ins, NULL, rename_ins_var_cb); - if (mono_interp_op_dregs [ins->opcode] && td->vars [ins->dreg].ext_index != -1) + if (mono_interp_op_dregs [ins->opcode] && td->vars [ins->dreg].ext_index != -1) { + g_assert (!td->vars [ins->dreg].renamed_ssa_fixed); + int renamable_ext_index = td->vars [ins->dreg].ext_index; + if (td->renamable_vars [renamable_ext_index].ssa_fixed && + td->renamable_vars [renamable_ext_index].ssa_stack) { + // Mark the exact liveness end limit for the ssa fixed var that is overwritten (the old entry on the stack) + int renamed_var = (int)(gsize)td->renamable_vars [renamable_ext_index].ssa_stack->data; + g_assert (td->vars [renamed_var].renamed_ssa_fixed); + int renamed_var_ext = td->vars [renamed_var].ext_index; + td->renamed_fixed_vars [renamed_var_ext].live_limit_bblocks = g_slist_prepend (td->renamed_fixed_vars [renamed_var_ext].live_limit_bblocks, (gpointer)(gsize)current_liveness); + } ins->dreg = get_renamed_var (td, ins->dreg); + } } rename_phi_args_in_out_bbs (td, bb); @@ -1067,15 +1091,33 @@ rename_vars_in_bb (TransformData *td, InterpBasicBlock *bb) dominated = dominated->next; } + // All vars currently on the ssa stack are live until the end of the bblock + for (unsigned int i = 0; i < td->renamable_vars_size; i++) { + if (td->renamable_vars [i].ssa_fixed && td->renamable_vars [i].ssa_stack) { + int renamed_var = (int)(gsize)td->renamable_vars [i].ssa_stack->data; + g_assert (td->vars [renamed_var].renamed_ssa_fixed); + int renamed_var_ext = td->vars [renamed_var].ext_index; + if (!td->renamed_fixed_vars [renamed_var_ext].live_out_bblocks) { + gpointer mem = mono_mempool_alloc0 (td->mempool, mono_bitset_alloc_size (td->bb_count, 0)); + td->renamed_fixed_vars [renamed_var_ext].live_out_bblocks = mono_bitset_mem_new (mem, td->bb_count, 0); + } + + mono_bitset_set_fast (td->renamed_fixed_vars [renamed_var_ext].live_out_bblocks, bb->index); + } + } + // Pop from the stack any new vars defined in this bblock for 
(ins = bb->first_ins; ins != NULL; ins = ins->next) { if (mono_interp_op_dregs [ins->opcode]) { int ext_index = td->vars [ins->dreg].ext_index; - if (ext_index != -1) { - GSList *prev_head = td->renamable_vars [ext_index].ssa_stack; - td->renamable_vars [ext_index].ssa_stack = prev_head->next; - g_free (prev_head); - } + if (ext_index == -1) + continue; + if (td->vars [ins->dreg].renamed_ssa_fixed) + ext_index = td->renamed_fixed_vars [ext_index].renamable_var_ext_index; + + GSList *prev_head = td->renamable_vars [ext_index].ssa_stack; + td->renamable_vars [ext_index].ssa_stack = prev_head->next; + g_free (prev_head); } } } @@ -1083,18 +1125,56 @@ rename_vars_in_bb (TransformData *td, InterpBasicBlock *bb) static void rename_vars (TransformData *td) { + // Initialize the ssa_stack for entry_bb for (unsigned int i = 0; i < td->renamable_vars_size; i++) { - // Initialize the ssa_stack for entry_bb + // Initialize the ssa_stack for entry_bb. If var is fixed, then we need to create a new + // InterpRenamedFixedVar, since we will have to track liveness. Otherwise just push the + // original var directly. 
int var_index = td->renamable_vars [i].var_index; - td->renamable_vars [i].ssa_stack = g_slist_prepend (td->renamable_vars [i].ssa_stack, (gpointer)(gsize)var_index); + if (td->renamable_vars [i].ssa_fixed) + get_renamed_var (td, var_index); + else + td->renamable_vars [i].ssa_stack = g_slist_prepend (td->renamable_vars [i].ssa_stack, (gpointer)(gsize)var_index); } rename_vars_in_bb (td, td->entry_bb); + + if (td->verbose_level) { + g_print ("\nFIXED SSA VARS LIVENESS LIMIT:\n"); + for (unsigned int i = 0; i < td->renamed_fixed_vars_size; i++) { + g_print ("FIXED VAR %d\n\tNO LIVE LIMIT BBLOCKS: {", td->renamed_fixed_vars [i].var_index); + MonoBitSet *live_out_bblocks = td->renamed_fixed_vars [i].live_out_bblocks; + if (live_out_bblocks) { + int j; + mono_bitset_foreach_bit (live_out_bblocks, j, td->bb_count) { + g_print (" BB%d", j); + } + } + g_print (" }\n"); + g_print ("\tLIVE LIMIT BBLOCKS: {"); + GSList *live_limit_bblocks = td->renamed_fixed_vars [i].live_limit_bblocks; + while (live_limit_bblocks) { + guint32 live_limit = (guint32)(gsize)live_limit_bblocks->data; + int bb_index = live_limit >> INTERP_LIVENESS_INS_INDEX_BITS; + int inst_index = live_limit & INTERP_LIVENESS_INS_INDEX_MASK; + + g_print (" (BB%d, %d)", bb_index, inst_index); + + live_limit_bblocks = live_limit_bblocks->next; + } + g_print (" }\n"); + } + } } static void interp_compute_ssa (TransformData *td) { + if (td->verbose_level) { + g_print ("\nIR before SSA compute:\n"); + mono_interp_print_td_code (td); + } + MONO_TIME_TRACK (mono_interp_stats.ssa_compute_dominance_time, interp_compute_dominance (td)); MONO_TIME_TRACK (mono_interp_stats.ssa_compute_global_vars_time, interp_compute_global_vars (td)); @@ -1120,8 +1200,11 @@ revert_ssa_rename_cb (TransformData *td, int *pvar, gpointer data) int ext_index = td->vars [var].ext_index; if (ext_index == -1) return; - if (td->renamable_vars [ext_index].ssa_fixed) - *pvar = td->renamable_vars [ext_index].var_index; + + if (td->vars 
[var].renamed_ssa_fixed) { + int renamable_var_ext_index = td->renamed_fixed_vars [ext_index].renamable_var_ext_index; + *pvar = td->renamable_vars [renamable_var_ext_index].var_index; + } } static void @@ -1159,6 +1242,13 @@ interp_exit_ssa (TransformData *td) td->renamable_vars [i].ssa_stack = NULL; } } + + for (unsigned int i = 0; i < td->renamed_fixed_vars_size; i++) { + if (td->renamed_fixed_vars [i].live_limit_bblocks) { + g_slist_free (td->renamed_fixed_vars [i].live_limit_bblocks); + td->renamed_fixed_vars [i].live_limit_bblocks = NULL; + } + } } /* @@ -3212,6 +3302,10 @@ interp_optimize_code (TransformData *td) if (td->header->num_clauses) return; + // Give up on huge methods. We can easily work around this if decide to care. + if (td->bb_count > ((1 << INTERP_LIVENESS_BB_INDEX_BITS) - 1)) + return; + if (mono_interp_opt & INTERP_OPT_BBLOCKS) MONO_TIME_TRACK (mono_interp_stats.optimize_bblocks_time, interp_optimize_bblocks (td)); diff --git a/src/mono/mono/mini/interp/transform.c b/src/mono/mono/mini/interp/transform.c index 29af952c0656b5..40a122ba5d1a58 100644 --- a/src/mono/mono/mini/interp/transform.c +++ b/src/mono/mono/mini/interp/transform.c @@ -377,6 +377,36 @@ interp_create_renamable_var (TransformData *td, int var) return ext_index; } +// This doesn't allocate a new var, rather additional information for fixed renamed vars +int +interp_create_renamed_fixed_var (TransformData *td, int var_index, int renamable_var_index) +{ + g_assert (td->vars [renamable_var_index].ext_index != -1); + g_assert (td->vars [var_index].ext_index == -1); + g_assert (td->vars [var_index].renamed_ssa_fixed); + + if (td->renamed_fixed_vars_size == td->renamed_fixed_vars_capacity) { + td->renamed_fixed_vars_capacity *= 2; + if (td->renamed_fixed_vars_capacity == 0) + td->renamed_fixed_vars_capacity = 2; + td->renamed_fixed_vars = (InterpRenamedFixedVar*) g_realloc (td->renamed_fixed_vars, td->renamed_fixed_vars_capacity * sizeof (InterpRenamedFixedVar)); + } + + int 
ext_index = td->renamed_fixed_vars_size; + InterpRenamedFixedVar *ext = &td->renamed_fixed_vars [ext_index]; + + ext->var_index = var_index; + ext->renamable_var_ext_index = td->vars [renamable_var_index].ext_index; + ext->live_out_bblocks = NULL; + ext->live_limit_bblocks = NULL; + + td->vars [var_index].ext_index = ext_index; + + td->renamed_fixed_vars_size++; + + return ext_index; +} + /* * These are additional locals that can be allocated as we transform the code. * They are allocated past the method locals so they are accessed in the same diff --git a/src/mono/mono/mini/interp/transform.h b/src/mono/mono/mini/interp/transform.h index 8046e35fcf7d8d..55820e04b10321 100644 --- a/src/mono/mono/mini/interp/transform.h +++ b/src/mono/mono/mini/interp/transform.h @@ -14,6 +14,14 @@ #define INTERP_INST_FLAG_ACTIVE_CALL 64 // This instruction is protected by a clause #define INTERP_INST_FLAG_PROTECTED_NEWOBJ 128 +// This instruction bumps the liveness index. Enables liveness checks as new instructions +// are added in the code, since new instructions won't have this flag set. 
+#define INTERP_INST_FLAG_LIVENESS_MARKER 256 + +#define INTERP_LIVENESS_INS_INDEX_BITS 18 +#define INTERP_LIVENESS_BB_INDEX_BITS (8 * sizeof (gint32) - INTERP_LIVENESS_INS_INDEX_BITS) +#define INTERP_LIVENESS_INS_INDEX_MASK ((1 << INTERP_LIVENESS_INS_INDEX_BITS) - 1) +#define INTERP_LIVENESS_BB_INDEX_MASK (((1 << INTERP_LIVENESS_BB_INDEX_BITS) - 1) << INTERP_LIVENESS_INS_INDEX_BITS) typedef struct _InterpInst InterpInst; typedef struct _InterpBasicBlock InterpBasicBlock; @@ -220,6 +228,7 @@ typedef struct { guint simd : 1; // We use this flag to avoid addition of align field in InterpVar, for now guint no_ssa : 1; // Var is not in ssa form, not subject to optimizations guint il_global : 1; // Args and IL locals + guint renamed_ssa_fixed : 1; // If true, ext_index points to InterpRenamedVar, otherwise to InterpRenamableVar } InterpVar; typedef struct { @@ -233,6 +242,20 @@ typedef struct { guint ssa_fixed : 1; } InterpRenamableVar; +// In addition to InterpRenamableVar information, this stores liveness information that enables us +// to ensure that the liveness of the corresponding var is not overlapping with the other renamed vars, +// after optimization. +typedef struct { + int var_index; + int renamable_var_ext_index; + // Bit set of bblocks where the renamed var is live at the bb end + // This means that within these bblocks we can freely increase the var liveness + MonoBitSet *live_out_bblocks; + // This is a list of (bb_index, inst_index), that indicates that in bblock with + // index bb_index, the var can have its liveness extended to at most inst_index + GSList *live_limit_bblocks; +} InterpRenamedFixedVar; + typedef struct { MonoMethod *method; @@ -271,6 +294,12 @@ typedef struct unsigned int renamable_vars_size; unsigned int renamable_vars_capacity; + // Newly created, renamed vars of fixed vars. We compute liveness on this subset + // of vars so we ensure we don't have conflicting liveness. 
+ unsigned int renamed_fixed_vars_size; + unsigned int renamed_fixed_vars_capacity; + InterpRenamedFixedVar *renamed_fixed_vars; + int n_data_items; int max_data_items; void **data_items; @@ -491,6 +520,9 @@ interp_create_var (TransformData *td, MonoType *type); int interp_create_renamable_var (TransformData *td, int var); +int +interp_create_renamed_fixed_var (TransformData *td, int var_index, int renamable_var_index); + void interp_foreach_ins_var (TransformData *td, InterpInst *ins, gpointer data, void (*callback)(TransformData*, int*, gpointer)); From 4104c95475102ffc58d6ce135061e6cfc4abc898 Mon Sep 17 00:00:00 2001 From: Vlad Brezae Date: Wed, 8 Nov 2023 19:05:15 +0200 Subject: [PATCH 09/45] [mono][interp] Ensure all vars have a definition In SSA transformation and optimizations we would have to special case args and locals since they can have a value without being defined. To avoid these hacks, make sure every arg is defined via a MINT_DEF_ARG instruction (which will be removed when code is actually generated) and every local is initialized via MINT_INITLOCAL (if var is initialized this instruction will end up optimized away). Because MINT_DEF_ARG is optimized away, vars that are renamed from it will have to be reverted to the actual argument. 
--- src/mono/mono/mini/interp/mintops.def | 1 + src/mono/mono/mini/interp/transform-opt.c | 39 ++++++------------ src/mono/mono/mini/interp/transform.c | 50 ++++++++++++++--------- src/mono/mono/mini/interp/transform.h | 1 + 4 files changed, 45 insertions(+), 46 deletions(-) diff --git a/src/mono/mono/mini/interp/mintops.def b/src/mono/mono/mini/interp/mintops.def index da619fb4ecadd8..f26a9ee732d1b9 100644 --- a/src/mono/mono/mini/interp/mintops.def +++ b/src/mono/mono/mini/interp/mintops.def @@ -834,6 +834,7 @@ OPDEF(MINT_TIER_MONITOR_JITERPRETER, "tier_monitor_jiterpreter", 4, 0, 0, MintOp IROPDEF(MINT_NOP, "nop", 1, 0, 0, MintOpNoArgs) IROPDEF(MINT_DEF, "def", 2, 1, 0, MintOpNoArgs) +IROPDEF(MINT_DEF_ARG, "def_arg", 2, 1, 0, MintOpNoArgs) IROPDEF(MINT_IL_SEQ_POINT, "il_seq_point", 1, 0, 0, MintOpNoArgs) IROPDEF(MINT_DUMMY_USE, "dummy_use", 2, 0, 1, MintOpNoArgs) IROPDEF(MINT_TIER_PATCHPOINT_DATA, "tier_patchpoint_data", 2, 0, 0, MintOpShortInt) diff --git a/src/mono/mono/mini/interp/transform-opt.c b/src/mono/mono/mini/interp/transform-opt.c index 3dc8446e2c4011..ad387058e95d39 100644 --- a/src/mono/mono/mini/interp/transform-opt.c +++ b/src/mono/mono/mini/interp/transform-opt.c @@ -757,17 +757,6 @@ interp_compute_global_vars (TransformData *td) td->vars [i].no_ssa = FALSE; } - // For locals (which are normally initlocal) and arguments, consider them already - // defined in entry_bb - for (unsigned int i = 0; i < td->vars_size; i++) { - if (td->vars [i].il_global) { - td->vars [i].declare_bbs = g_slist_prepend (NULL, td->entry_bb); - } else { - // IL globals are the first vars - break; - } - } - InterpBasicBlock *bb; for (bb = td->entry_bb; bb != NULL; bb = bb->next_bb) { InterpInst *ins; @@ -993,11 +982,12 @@ compute_fixed_vars (TransformData *td) } static int -get_renamed_var (TransformData *td, int var) +get_renamed_var (TransformData *td, int var, gboolean def_arg) { int ext_index = td->vars [var].ext_index; g_assert (ext_index != -1); int renamed_var = 
interp_create_var (td, td->vars [var].type); + td->vars [renamed_var].def_arg = def_arg; if (td->renamable_vars [ext_index].ssa_fixed) { td->vars [renamed_var].renamed_ssa_fixed = TRUE; @@ -1051,7 +1041,7 @@ rename_vars_in_bb (TransformData *td, InterpBasicBlock *bb) // Rename vars defined with MINT_PHI for (ins = bb->first_ins; ins != NULL; ins = ins->next) { if (ins->opcode == MINT_PHI) - ins->dreg = get_renamed_var (td, ins->dreg); + ins->dreg = get_renamed_var (td, ins->dreg, FALSE); else break; } @@ -1066,7 +1056,12 @@ rename_vars_in_bb (TransformData *td, InterpBasicBlock *bb) current_liveness++; interp_foreach_ins_svar (td, ins, NULL, rename_ins_var_cb); - if (mono_interp_op_dregs [ins->opcode] && td->vars [ins->dreg].ext_index != -1) { + if (!mono_interp_op_dregs [ins->opcode] || td->vars [ins->dreg].ext_index == -1) + continue; + + if (ins->opcode == MINT_DEF_ARG) { + ins->dreg = get_renamed_var (td, ins->dreg, TRUE); + } else if (mono_interp_op_dregs [ins->opcode]) { g_assert (!td->vars [ins->dreg].renamed_ssa_fixed); int renamable_ext_index = td->vars [ins->dreg].ext_index; if (td->renamable_vars [renamable_ext_index].ssa_fixed && @@ -1077,7 +1072,7 @@ rename_vars_in_bb (TransformData *td, InterpBasicBlock *bb) int renamed_var_ext = td->vars [renamed_var].ext_index; td->renamed_fixed_vars [renamed_var_ext].live_limit_bblocks = g_slist_prepend (td->renamed_fixed_vars [renamed_var_ext].live_limit_bblocks, (gpointer)(gsize)current_liveness); } - ins->dreg = get_renamed_var (td, ins->dreg); + ins->dreg = get_renamed_var (td, ins->dreg, FALSE); } } @@ -1125,18 +1120,6 @@ rename_vars_in_bb (TransformData *td, InterpBasicBlock *bb) static void rename_vars (TransformData *td) { - // Initialize the ssa_stack for entry_bb - for (unsigned int i = 0; i < td->renamable_vars_size; i++) { - // Initialize the ssa_stack for entry_bb. If var is fixed, then we need to create a new - // InterpRenamedFixedVar, since we will have to track liveness. 
Otherwise just push the - // original var directly. - int var_index = td->renamable_vars [i].var_index; - if (td->renamable_vars [i].ssa_fixed) - get_renamed_var (td, var_index); - else - td->renamable_vars [i].ssa_stack = g_slist_prepend (td->renamable_vars [i].ssa_stack, (gpointer)(gsize)var_index); - } - rename_vars_in_bb (td, td->entry_bb); if (td->verbose_level) { @@ -1204,6 +1187,8 @@ revert_ssa_rename_cb (TransformData *td, int *pvar, gpointer data) if (td->vars [var].renamed_ssa_fixed) { int renamable_var_ext_index = td->renamed_fixed_vars [ext_index].renamable_var_ext_index; *pvar = td->renamable_vars [renamable_var_ext_index].var_index; + } else if (td->vars [var].def_arg) { + *pvar = td->renamable_vars [ext_index].var_index; } } diff --git a/src/mono/mono/mini/interp/transform.c b/src/mono/mono/mini/interp/transform.c index 40a122ba5d1a58..2d56e55d4025bb 100644 --- a/src/mono/mono/mini/interp/transform.c +++ b/src/mono/mono/mini/interp/transform.c @@ -4858,6 +4858,14 @@ generate_code (TransformData *td, MonoMethod *method, MonoMethodHeader *header, g_free (name); } + if (td->optimized) { + // Add arg defining instructions for SSA machinery + for (int i = 0; i < num_args; i++) { + interp_add_ins (td, MINT_DEF_ARG); + interp_ins_set_dreg (td->last_ins, i); + } + } + if (rtm->vararg) { // vararg calls are identical to normal calls on the call site. However, the // first instruction in a vararg method needs to copy the variable arguments @@ -4873,23 +4881,6 @@ generate_code (TransformData *td, MonoMethod *method, MonoMethodHeader *header, td->has_localloc = TRUE; } - /* - * We initialize the locals regardless of the presence of the init_locals - * flag. Locals holding references need to be zeroed so we don't risk - * crashing the GC if they end up being stored in an object. - * - * FIXME - * Track values of locals over multiple basic blocks. This would enable - * us to kill the MINT_INITLOCALS instruction if all locals are initialized - * before use. 
We also don't need this instruction if the init locals flag - * is not set and there are no locals holding references. - */ - if (header->num_locals) { - interp_add_ins (td, MINT_INITLOCALS); - td->last_ins->data [0] = GUINT_TO_UINT16 (td->il_locals_offset); - td->last_ins->data [1] = GUINT_TO_UINT16 (td->il_locals_size); - } - guint16 enter_profiling = 0; if (mono_jit_trace_calls != NULL && mono_trace_eval (method)) enter_profiling |= TRACING_FLAG; @@ -4933,11 +4924,32 @@ generate_code (TransformData *td, MonoMethod *method, MonoMethodHeader *header, } local_locals = (guint32*) g_malloc (header->num_locals * sizeof (guint32)); - /* Allocate locals to store inlined method args from stack */ for (int i = 0; i < header->num_locals; i++) local_locals [i] = interp_create_var (td, header->locals [i]); } + /* + * We initialize the locals regardless of the presence of the init_locals + * flag. Locals holding references need to be zeroed so we don't risk + * crashing the GC if they end up being stored in an object. + */ + if (header->num_locals) { + if (td->optimized) { + // Add individual initlocal for each IL local. These should + // all be optimized out by SSA cprop/deadce optimizations. + for (int i = 0; i < header->num_locals; i++) { + interp_add_ins (td, MINT_INITLOCAL); + int local_var = inlining ? 
local_locals [i] : (num_args + i); + td->last_ins->dreg = local_var; + td->last_ins->data [0] = GINT_TO_UINT16 (td->vars [local_var].size); + } + } else { + interp_add_ins (td, MINT_INITLOCALS); + td->last_ins->data [0] = GUINT_TO_UINT16 (td->il_locals_offset); + td->last_ins->data [1] = GUINT_TO_UINT16 (td->il_locals_size); + } + } + td->dont_inline = g_list_prepend (td->dont_inline, method); while (td->ip < end) { // Check here for every opcode to avoid code bloat @@ -8419,7 +8431,7 @@ emit_compacted_instruction (TransformData *td, guint16* start_ip, InterpInst *in g_array_append_val (td->line_numbers, lne); } - if (opcode == MINT_NOP || opcode == MINT_DEF || opcode == MINT_DUMMY_USE) + if (opcode == MINT_NOP || opcode == MINT_DEF || opcode == MINT_DEF_ARG || opcode == MINT_DUMMY_USE) return ip; *ip++ = opcode; diff --git a/src/mono/mono/mini/interp/transform.h b/src/mono/mono/mini/interp/transform.h index 55820e04b10321..207485b7f03015 100644 --- a/src/mono/mono/mini/interp/transform.h +++ b/src/mono/mono/mini/interp/transform.h @@ -229,6 +229,7 @@ typedef struct { guint no_ssa : 1; // Var is not in ssa form, not subject to optimizations guint il_global : 1; // Args and IL locals guint renamed_ssa_fixed : 1; // If true, ext_index points to InterpRenamedVar, otherwise to InterpRenamableVar + guint def_arg : 1; // Var is a result of MINT_DEF_ARG. 
This var will have to be renamed back to the original arg var } InterpVar; typedef struct { From 09b629fec057a0639d26a84dbacd826d151bab0c Mon Sep 17 00:00:00 2001 From: Vlad Brezae Date: Wed, 8 Nov 2023 19:38:37 +0200 Subject: [PATCH 10/45] [mono][interp] Resurrect cprop, cfold and some other small optimizations These optimizations now operate on global var defs --- src/mono/mono/mini/interp/transform-opt.c | 422 +++++++++------------- src/mono/mono/mini/interp/transform.c | 6 + src/mono/mono/mini/interp/transform.h | 13 +- 3 files changed, 183 insertions(+), 258 deletions(-) diff --git a/src/mono/mono/mini/interp/transform-opt.c b/src/mono/mono/mini/interp/transform-opt.c index ad387058e95d39..dcd86278befe3f 100644 --- a/src/mono/mono/mini/interp/transform-opt.c +++ b/src/mono/mono/mini/interp/transform-opt.c @@ -345,7 +345,7 @@ interp_alloc_offsets (TransformData *td) while (var != -1) { if (td->vars [var].global || - !td->local_ref_count || td->local_ref_count [var] > 1 || + !td->var_values || td->var_values [var].ref_count > 1 || td->vars [var].no_call_args) { // Some vars can't be allocated on the call args stack, since the constraint is that // call args vars die after the call. This isn't necessarily true for global vars or @@ -1184,11 +1184,18 @@ revert_ssa_rename_cb (TransformData *td, int *pvar, gpointer data) if (ext_index == -1) return; + int new_var = -1; if (td->vars [var].renamed_ssa_fixed) { int renamable_var_ext_index = td->renamed_fixed_vars [ext_index].renamable_var_ext_index; - *pvar = td->renamable_vars [renamable_var_ext_index].var_index; + new_var = td->renamable_vars [renamable_var_ext_index].var_index; } else if (td->vars [var].def_arg) { - *pvar = td->renamable_vars [ext_index].var_index; + new_var = td->renamable_vars [ext_index].var_index; + } + + if (new_var != -1) { + *pvar = new_var; + // Offset allocator checks ref_count to detect single use vars. 
Keep it updated + td->var_values [new_var].ref_count += td->var_values [var].ref_count; } } @@ -1793,13 +1800,12 @@ interp_get_mt_for_ldind (int ldind_op) break; static InterpInst* -interp_fold_unop (TransformData *td, InterpVarValue *local_defs, InterpInst *ins) +interp_fold_unop (TransformData *td, InterpInst *ins) { - int *local_ref_count = td->local_ref_count; // ins should be an unop, therefore it should have a single dreg and a single sreg int dreg = ins->dreg; int sreg = ins->sregs [0]; - InterpVarValue *val = &local_defs [sreg]; + InterpVarValue *val = &td->var_values [sreg]; InterpVarValue result; if (val->type != VAR_VALUE_I4 && val->type != VAR_VALUE_I8) @@ -1873,10 +1879,10 @@ interp_fold_unop (TransformData *td, InterpVarValue *local_defs, InterpInst *ins interp_dump_ins (ins, td->data_items); } - local_ref_count [sreg]--; - result.ins = ins; + td->var_values [sreg].ref_count--; + result.def = ins; result.ref_count = 0; - local_defs [dreg] = result; + td->var_values [dreg] = result; return ins; } @@ -1896,12 +1902,11 @@ interp_fold_unop (TransformData *td, InterpVarValue *local_defs, InterpInst *ins break; static InterpInst* -interp_fold_unop_cond_br (TransformData *td, InterpBasicBlock *cbb, InterpVarValue *local_defs, InterpInst *ins) +interp_fold_unop_cond_br (TransformData *td, InterpBasicBlock *cbb, InterpInst *ins) { - int *local_ref_count = td->local_ref_count; // ins should be an unop conditional branch, therefore it should have a single sreg int sreg = ins->sregs [0]; - InterpVarValue *val = &local_defs [sreg]; + InterpVarValue *val = &td->var_values [sreg]; if (val->type != VAR_VALUE_I4 && val->type != VAR_VALUE_I8 && val->type != VAR_VALUE_NON_NULL) return ins; @@ -1934,7 +1939,7 @@ interp_fold_unop_cond_br (TransformData *td, InterpBasicBlock *cbb, InterpVarVal interp_dump_ins (ins, td->data_items); } - local_ref_count [sreg]--; + td->var_values [sreg].ref_count--; return ins; } @@ -1965,15 +1970,14 @@ interp_fold_unop_cond_br 
(TransformData *td, InterpBasicBlock *cbb, InterpVarVal static InterpInst* -interp_fold_binop (TransformData *td, InterpVarValue *local_defs, InterpInst *ins, gboolean *folded) +interp_fold_binop (TransformData *td, InterpInst *ins, gboolean *folded) { - int *local_ref_count = td->local_ref_count; // ins should be a binop, therefore it should have a single dreg and two sregs int dreg = ins->dreg; int sreg1 = ins->sregs [0]; int sreg2 = ins->sregs [1]; - InterpVarValue *val1 = &local_defs [sreg1]; - InterpVarValue *val2 = &local_defs [sreg2]; + InterpVarValue *val1 = &td->var_values [sreg1]; + InterpVarValue *val2 = &td->var_values [sreg2]; InterpVarValue result; *folded = FALSE; @@ -2061,11 +2065,12 @@ interp_fold_binop (TransformData *td, InterpVarValue *local_defs, InterpInst *in interp_dump_ins (ins, td->data_items); } - local_ref_count [sreg1]--; - local_ref_count [sreg2]--; - result.ins = ins; + td->var_values [sreg1].ref_count--; + td->var_values [sreg2].ref_count--; + result.def = ins; result.ref_count = 0; - local_defs [dreg] = result; + td->var_values [dreg] = result; + return ins; } @@ -2088,14 +2093,13 @@ interp_fold_binop (TransformData *td, InterpVarValue *local_defs, InterpInst *in break; static InterpInst* -interp_fold_binop_cond_br (TransformData *td, InterpBasicBlock *cbb, InterpVarValue *local_defs, InterpInst *ins) +interp_fold_binop_cond_br (TransformData *td, InterpBasicBlock *cbb, InterpInst *ins) { - int *local_ref_count = td->local_ref_count; // ins should be a conditional binop, therefore it should have only two sregs int sreg1 = ins->sregs [0]; int sreg2 = ins->sregs [1]; - InterpVarValue *val1 = &local_defs [sreg1]; - InterpVarValue *val2 = &local_defs [sreg2]; + InterpVarValue *val1 = &td->var_values [sreg1]; + InterpVarValue *val2 = &td->var_values [sreg2]; if (val1->type != VAR_VALUE_I4 && val1->type != VAR_VALUE_I8) return ins; @@ -2133,8 +2137,8 @@ interp_fold_binop_cond_br (TransformData *td, InterpBasicBlock *cbb, InterpVarVa 
interp_dump_ins (ins, td->data_items); } - local_ref_count [sreg1]--; - local_ref_count [sreg2]--; + td->var_values [sreg1].ref_count--; + td->var_values [sreg2].ref_count--; return ins; } @@ -2154,15 +2158,13 @@ write_v128_element (gpointer v128_addr, InterpVarValue *val, int index, int el_s } static InterpInst* -interp_fold_simd_create (TransformData *td, InterpBasicBlock *cbb, InterpVarValue *local_defs, InterpInst *ins) +interp_fold_simd_create (TransformData *td, InterpBasicBlock *cbb, InterpInst *ins) { - int *local_ref_count = td->local_ref_count; - int *args = ins->info.call_info->call_args; int index = 0; int var = args [index]; while (var != -1) { - InterpVarValue *val = &local_defs [var]; + InterpVarValue *val = &td->var_values [var]; if (val->type != VAR_VALUE_I4 && val->type != VAR_VALUE_I8 && val->type != VAR_VALUE_R4) return ins; index++; @@ -2183,10 +2185,9 @@ interp_fold_simd_create (TransformData *td, InterpBasicBlock *cbb, InterpVarValu index = 0; var = args [index]; while (var != -1) { - InterpVarValue *val = &local_defs [var]; + InterpVarValue *val = &td->var_values [var]; write_v128_element (v128_addr, val, index, el_size); val->ref_count--; - local_ref_count [var]--; index++; var = args [index]; } @@ -2196,106 +2197,79 @@ interp_fold_simd_create (TransformData *td, InterpBasicBlock *cbb, InterpVarValu interp_dump_ins (ins, td->data_items); } - local_defs [dreg].ins = ins; - local_defs [dreg].type = VAR_VALUE_NONE; + td->var_values [dreg].def = ins; + td->var_values [dreg].type = VAR_VALUE_NONE; return ins; } -static void -cprop_sreg (TransformData *td, InterpInst *ins, int *psreg, InterpVarValue *local_defs) +static gboolean +can_extend_var_liveness (TransformData *td, int var, guint32 liveness) { - int *local_ref_count = td->local_ref_count; - int sreg = *psreg; + if (!td->vars [var].renamed_ssa_fixed) + return TRUE; - local_ref_count [sreg]++; - local_defs [sreg].ref_count++; - if (local_defs [sreg].type == VAR_VALUE_OTHER_VAR) { - int 
cprop_local = local_defs [sreg].var; + InterpRenamedFixedVar *fixed_var_ext = &td->renamed_fixed_vars [td->vars [var].ext_index]; + int cur_bb = liveness >> INTERP_LIVENESS_INS_INDEX_BITS; - // We are trying to replace sregs [i] with its def local (cprop_local), but cprop_local has since been - // modified, so we can't use it. - if (local_defs [cprop_local].ins != NULL && local_defs [cprop_local].def_index > local_defs [sreg].def_index) - return; + // If var was already live at the end of this bblocks, there is no liveness extension happening + if (fixed_var_ext->live_out_bblocks && mono_bitset_test_fast (fixed_var_ext->live_out_bblocks, cur_bb)) + return TRUE; - if (td->verbose_level) - g_print ("cprop %d -> %d:\n\t", sreg, cprop_local); - local_ref_count [sreg]--; - *psreg = cprop_local; - local_ref_count [cprop_local]++; - if (td->verbose_level) - interp_dump_ins (ins, td->data_items); - } else if (!local_defs [sreg].ins) { - td->vars [sreg].unknown_use = TRUE; + GSList *bb_liveness = fixed_var_ext->live_limit_bblocks; + while (bb_liveness) { + guint32 liveness_limit = (guint32)(gsize)bb_liveness->data; + if (cur_bb == (liveness_limit >> INTERP_LIVENESS_INS_INDEX_BITS)) { + if (liveness <= liveness_limit) + return TRUE; + else + return FALSE; + } else { + bb_liveness = bb_liveness->next; + } } -} -static void -clear_local_defs (TransformData *td, int *pvar, void *data) -{ - int var = *pvar; - InterpVarValue *local_defs = (InterpVarValue*) data; - local_defs [var].type = VAR_VALUE_NONE; - local_defs [var].ins = NULL; - local_defs [var].ref_count = 0; + return FALSE; } static void -clear_unused_defs (TransformData *td, int *pvar, void *data) +cprop_svar (TransformData *td, InterpInst *ins, int *pvar, guint32 current_liveness) { int var = *pvar; - if (!td->vars [var].local_only) - return; - if (td->vars [var].indirects) - return; - InterpVarValue *local_def = &((InterpVarValue*) data) [var]; - InterpInst *def_ins = local_def->ins; - if (!def_ins) - return; - if 
(local_def->ref_count) + if (!var_is_ssa_form (td, var)) return; - // This is a local only var that is defined in this bblock and its value is not used - // at all in this bblock. Clear the definition - if (MINT_NO_SIDE_EFFECTS (def_ins->opcode)) { - for (int i = 0; i < mono_interp_op_sregs [def_ins->opcode]; i++) - td->local_ref_count [def_ins->sregs [i]]--; - if (td->verbose_level) { - g_print ("kill unused local def:\n\t"); - interp_dump_ins (def_ins, td->data_items); - } - interp_clear_ins (def_ins); + InterpVarValue *val = &td->var_values [var]; + g_assert (val->type >= 0 && val->type < VAR_VALUE_COUNT); + if (val->type == VAR_VALUE_OTHER_VAR && can_extend_var_liveness (td, val->var, current_liveness)) { + int cprop_var = val->var; + + if (td->verbose_level) + g_print ("cprop %d -> %d:\n\t", var, cprop_var); + InterpVarValue *cprop_val = &td->var_values [cprop_var]; + cprop_val->ref_count++; + *pvar = cprop_var; + if (td->verbose_level) + interp_dump_ins (ins, td->data_items); + } else { + val->ref_count++; } } static void interp_cprop (TransformData *td) { - InterpVarValue *local_defs = (InterpVarValue*) g_malloc (td->vars_size * sizeof (InterpVarValue)); - int *local_ref_count = (int*) g_malloc (td->vars_size * sizeof (int)); - InterpBasicBlock *bb; - gboolean needs_retry; - int ins_index; - int iteration_count = 0; - - td->local_ref_count = local_ref_count; -retry: - needs_retry = FALSE; - memset (local_ref_count, 0, td->vars_size * sizeof (int)); - if (td->verbose_level) - g_print ("\ncprop iteration %d\n", iteration_count++); - - for (bb = td->entry_bb; bb != NULL; bb = bb->next_bb) { - InterpInst *ins; - ins_index = 0; + g_print ("\nCPROP:\n"); - // Set cbb since we do some instruction inserting below - td->cbb = bb; + td->var_values = (InterpVarValue*) mono_mempool_alloc (td->mempool, td->vars_size * sizeof (InterpVarValue)); - for (ins = bb->first_ins; ins != NULL; ins = ins->next) - interp_foreach_ins_var (td, ins, local_defs, clear_local_defs); + // 
Traverse in dfs order. This guarantees that we always reach the definition first before the + // use of the var. Exception is only for phi nodes, where we don't care about the definition + // anyway. + for (int bb_dfs_index = 0; bb_dfs_index < td->bblocks_count; bb_dfs_index++) { + InterpBasicBlock *bb = td->bblocks [bb_dfs_index]; if (td->verbose_level) { GString* bb_info = interp_get_bb_links (bb); @@ -2303,53 +2277,53 @@ interp_cprop (TransformData *td) g_string_free (bb_info, TRUE); } - for (ins = bb->first_ins; ins != NULL; ins = ins->next) { - int opcode = ins->opcode; + guint32 current_liveness = bb->index << INTERP_LIVENESS_INS_INDEX_BITS; + // Set cbb since we do some instruction inserting below + td->cbb = bb; + for (InterpInst *ins = bb->first_ins; ins != NULL; ins = ins->next) { + if (ins->flags & INTERP_INST_FLAG_LIVENESS_MARKER) + current_liveness++; - if (opcode == MINT_NOP) + if (interp_ins_is_nop (ins)) continue; + int opcode = ins->opcode; int num_sregs = mono_interp_op_sregs [opcode]; int num_dregs = mono_interp_op_dregs [opcode]; gint32 *sregs = &ins->sregs [0]; gint32 dreg = ins->dreg; - if (td->verbose_level && ins->opcode != MINT_NOP && ins->opcode != MINT_IL_SEQ_POINT) + if (td->verbose_level) interp_dump_ins (ins, td->data_items); - for (int i = 0; i < num_sregs; i++) { - if (sregs [i] == MINT_CALL_ARGS_SREG) { - if (ins->info.call_info && ins->info.call_info->call_args) { - int *call_args = ins->info.call_info->call_args; - while (*call_args != -1) { - cprop_sreg (td, ins, call_args, local_defs); - call_args++; + if (num_sregs) { + for (int i = 0; i < num_sregs; i++) { + if (sregs [i] == MINT_CALL_ARGS_SREG) { + if (ins->info.call_info && ins->info.call_info->call_args) { + int *call_args = ins->info.call_info->call_args; + while (*call_args != -1) { + cprop_svar (td, ins, call_args, current_liveness); + call_args++; + } } + } else { + cprop_svar (td, ins, &sregs [i], current_liveness); } - } else { - cprop_sreg (td, ins, &sregs [i], 
local_defs); + } + } else if (opcode == MINT_PHI) { + // no cprop but add ref counts + int *args = ins->info.args; + while (*args != -1) { + td->var_values [*args].ref_count++; + args++; } } if (num_dregs) { - // Check if the previous definition of this var was used at all. - // If it wasn't we can just clear the instruction - // - // MINT_MOV_DST_OFF doesn't fully write to the var, so we special case it here - if (local_defs [dreg].ins != NULL && - local_defs [dreg].ref_count == 0 && - !td->vars [dreg].indirects && - opcode != MINT_MOV_DST_OFF) { - InterpInst *prev_def = local_defs [dreg].ins; - if (MINT_NO_SIDE_EFFECTS (prev_def->opcode)) { - for (int i = 0; i < mono_interp_op_sregs [prev_def->opcode]; i++) - local_ref_count [prev_def->sregs [i]]--; - interp_clear_ins (prev_def); - } - } - local_defs [dreg].type = VAR_VALUE_NONE; - local_defs [dreg].ins = ins; - local_defs [dreg].def_index = ins_index; + InterpVarValue *dval = &td->var_values [dreg]; + dval->type = VAR_VALUE_NONE; + dval->def = ins; + dval->ref_count = 0; } // We always store to the full i4, except as part of STIND opcodes. 
These opcodes can be @@ -2365,123 +2339,87 @@ interp_cprop (TransformData *td) if (td->verbose_level) g_print ("clear redundant mov\n"); interp_clear_ins (ins); - local_ref_count [sreg]--; - } else if (td->vars [sreg].indirects || td->vars [dreg].indirects) { + td->var_values [sreg].ref_count--; + } else if (!var_is_ssa_form (td, sreg) || !var_is_ssa_form (td, dreg)) { // Don't bother with indirect locals - } else if (local_defs [sreg].type == VAR_VALUE_I4 || local_defs [sreg].type == VAR_VALUE_I8) { + } else if (td->var_values [sreg].type == VAR_VALUE_I4 || td->var_values [sreg].type == VAR_VALUE_I8) { // Replace mov with ldc - gboolean is_i4 = local_defs [sreg].type == VAR_VALUE_I4; + gboolean is_i4 = td->var_values [sreg].type == VAR_VALUE_I4; g_assert (!td->vars [sreg].indirects); - local_defs [dreg].type = local_defs [sreg].type; + td->var_values [dreg].type = td->var_values [sreg].type; if (is_i4) { - int ct = local_defs [sreg].i; + int ct = td->var_values [sreg].i; ins = interp_get_ldc_i4_from_const (td, ins, ct, dreg); - local_defs [dreg].i = ct; + td->var_values [dreg].i = ct; } else { - gint64 ct = local_defs [sreg].l; + gint64 ct = td->var_values [sreg].l; ins = interp_inst_replace_with_i8_const (td, ins, ct); - local_defs [dreg].l = ct; + td->var_values [dreg].l = ct; } - local_defs [dreg].ins = ins; - local_ref_count [sreg]--; + td->var_values [dreg].def = ins; + td->var_values [sreg].ref_count--; if (td->verbose_level) { g_print ("cprop loc %d -> ct :\n\t", sreg); interp_dump_ins (ins, td->data_items); } - } else if (local_defs [sreg].ins != NULL && - td->vars [sreg].execution_stack && - !td->vars [dreg].execution_stack && - interp_prev_ins (ins) == local_defs [sreg].ins && - !(interp_prev_ins (ins)->flags & INTERP_INST_FLAG_PROTECTED_NEWOBJ)) { - // hackish temporary optimization that won't be necessary in the future - // We replace `local1 <- ?, local2 <- local1` with `local2 <- ?, local1 <- local2` - // if local1 is execution stack local and local2 
is normal global local. This makes - // it more likely for `local1 <- local2` to be killed, while before we always needed - // to store to the global local, which is likely accessed by other instructions. - InterpInst *def = local_defs [sreg].ins; - int original_dreg = def->dreg; - - def->dreg = dreg; - ins->dreg = original_dreg; - sregs [0] = dreg; - - local_defs [dreg].type = VAR_VALUE_NONE; - local_defs [dreg].ins = def; - local_defs [dreg].def_index = local_defs [original_dreg].def_index; - local_defs [dreg].ref_count++; - local_defs [original_dreg].type = VAR_VALUE_OTHER_VAR; - local_defs [original_dreg].ins = ins; - local_defs [original_dreg].var = dreg; - local_defs [original_dreg].def_index = ins_index; - local_defs [original_dreg].ref_count--; - - local_ref_count [original_dreg]--; - local_ref_count [dreg]++; - - if (td->verbose_level) { - g_print ("cprop dreg:\n\t"); - interp_dump_ins (def, td->data_items); - g_print ("\t"); - interp_dump_ins (ins, td->data_items); - } } else { if (td->verbose_level) g_print ("local copy %d <- %d\n", dreg, sreg); - local_defs [dreg].type = VAR_VALUE_OTHER_VAR; - local_defs [dreg].var = sreg; + td->var_values [dreg].type = VAR_VALUE_OTHER_VAR; + td->var_values [dreg].var = sreg; } } else if (opcode == MINT_LDLOCA_S) { // The local that we are taking the address of is not a sreg but still referenced - local_ref_count [ins->sregs [0]]++; + td->var_values [ins->sregs [0]].ref_count++; } else if (MINT_IS_LDC_I4 (opcode)) { - local_defs [dreg].type = VAR_VALUE_I4; - local_defs [dreg].i = interp_get_const_from_ldc_i4 (ins); + td->var_values [dreg].type = VAR_VALUE_I4; + td->var_values [dreg].i = interp_get_const_from_ldc_i4 (ins); } else if (MINT_IS_LDC_I8 (opcode)) { - local_defs [dreg].type = VAR_VALUE_I8; - local_defs [dreg].l = interp_get_const_from_ldc_i8 (ins); + td->var_values [dreg].type = VAR_VALUE_I8; + td->var_values [dreg].l = interp_get_const_from_ldc_i8 (ins); } else if (opcode == MINT_LDC_R4) { guint32 val_u = 
READ32 (&ins->data [0]); float f = *(float*)(&val_u); - local_defs [dreg].type = VAR_VALUE_R4; - local_defs [dreg].f = f; + td->var_values [dreg].type = VAR_VALUE_R4; + td->var_values [dreg].f = f; } else if (ins->opcode == MINT_LDPTR) { #if SIZEOF_VOID_P == 8 - local_defs [dreg].type = VAR_VALUE_I8; - local_defs [dreg].l = (gint64)td->data_items [ins->data [0]]; + td->var_values [dreg].type = VAR_VALUE_I8; + td->var_values [dreg].l = (gint64)td->data_items [ins->data [0]]; #else - local_defs [dreg].type = VAR_VALUE_I4; - local_defs [dreg].i = (gint32)td->data_items [ins->data [0]]; + td->var_values [dreg].type = VAR_VALUE_I4; + td->var_values [dreg].i = (gint32)td->data_items [ins->data [0]]; #endif } else if (MINT_IS_UNOP (opcode)) { - ins = interp_fold_unop (td, local_defs, ins); + ins = interp_fold_unop (td, ins); } else if (MINT_IS_UNOP_CONDITIONAL_BRANCH (opcode)) { - ins = interp_fold_unop_cond_br (td, bb, local_defs, ins); + ins = interp_fold_unop_cond_br (td, bb, ins); } else if (MINT_IS_SIMD_CREATE (opcode)) { - ins = interp_fold_simd_create (td, bb, local_defs, ins); + ins = interp_fold_simd_create (td, bb, ins); } else if (MINT_IS_BINOP (opcode)) { gboolean folded; - ins = interp_fold_binop (td, local_defs, ins, &folded); + ins = interp_fold_binop (td, ins, &folded); if (!folded) { int sreg = -1; guint16 mov_op = 0; if ((opcode == MINT_MUL_I4 || opcode == MINT_DIV_I4) && - local_defs [ins->sregs [1]].type == VAR_VALUE_I4 && - local_defs [ins->sregs [1]].i == 1) { + td->var_values [ins->sregs [1]].type == VAR_VALUE_I4 && + td->var_values [ins->sregs [1]].i == 1) { sreg = ins->sregs [0]; mov_op = MINT_MOV_4; } else if ((opcode == MINT_MUL_I8 || opcode == MINT_DIV_I8) && - local_defs [ins->sregs [1]].type == VAR_VALUE_I8 && - local_defs [ins->sregs [1]].l == 1) { + td->var_values [ins->sregs [1]].type == VAR_VALUE_I8 && + td->var_values [ins->sregs [1]].l == 1) { sreg = ins->sregs [0]; mov_op = MINT_MOV_8; } else if (opcode == MINT_MUL_I4 && - local_defs 
[ins->sregs [0]].type == VAR_VALUE_I4 && - local_defs [ins->sregs [0]].i == 1) { + td->var_values [ins->sregs [0]].type == VAR_VALUE_I4 && + td->var_values [ins->sregs [0]].i == 1) { sreg = ins->sregs [1]; mov_op = MINT_MOV_4; } else if (opcode == MINT_MUL_I8 && - local_defs [ins->sregs [0]].type == VAR_VALUE_I8 && - local_defs [ins->sregs [0]].l == 1) { + td->var_values [ins->sregs [0]].type == VAR_VALUE_I8 && + td->var_values [ins->sregs [0]].l == 1) { sreg = ins->sregs [1]; mov_op = MINT_MOV_8; } @@ -2492,13 +2430,12 @@ interp_cprop (TransformData *td) g_print ("Replace idempotent binop :\n\t"); interp_dump_ins (ins, td->data_items); } - needs_retry = TRUE; } } } else if (MINT_IS_BINOP_CONDITIONAL_BRANCH (opcode)) { - ins = interp_fold_binop_cond_br (td, bb, local_defs, ins); + ins = interp_fold_binop_cond_br (td, bb, ins); } else if (MINT_IS_LDIND (opcode)) { - InterpInst *ldloca = local_defs [sregs [0]].ins; + InterpInst *ldloca = td->var_values [sregs [0]].def; if (ldloca != NULL && ldloca->opcode == MINT_LDLOCA_S) { int local = ldloca->sregs [0]; int mt = td->vars [local].mt; @@ -2516,23 +2453,22 @@ interp_cprop (TransformData *td) ins->opcode = GINT_TO_OPCODE (interp_get_mov_for_type (ldind_mt, FALSE)); break; } - local_ref_count [sregs [0]]--; + td->var_values [sregs [0]].ref_count--; interp_ins_set_sreg (ins, local); if (td->verbose_level) { g_print ("Replace ldloca/ldind pair :\n\t"); interp_dump_ins (ins, td->data_items); } - needs_retry = TRUE; } } } else if (MINT_IS_LDFLD (opcode)) { - InterpInst *ldloca = local_defs [sregs [0]].ins; + InterpInst *ldloca = td->var_values [sregs [0]].def; if (ldloca != NULL && ldloca->opcode == MINT_LDLOCA_S) { int mt = ins->opcode - MINT_LDFLD_I1; int local = ldloca->sregs [0]; // Allow ldloca instruction to be killed - local_ref_count [sregs [0]]--; + td->var_values [sregs [0]].ref_count--; if (td->vars [local].mt == (ins->opcode - MINT_LDFLD_I1) && ins->data [0] == 0) { // Replace LDLOCA + LDFLD with LDLOC, when the 
loading field represents // the entire local. This is the case with loading the only field of an @@ -2557,16 +2493,16 @@ interp_cprop (TransformData *td) ins->data [2] = ldsize; interp_clear_ins (ins->prev); + td->var_values [ins->dreg].def = ins; } if (td->verbose_level) { g_print ("Replace ldloca/ldfld pair :\n\t"); interp_dump_ins (ins, td->data_items); } - needs_retry = TRUE; } } else if (opcode == MINT_INITOBJ) { - InterpInst *ldloca = local_defs [sregs [0]].ins; + InterpInst *ldloca = td->var_values [sregs [0]].def; if (ldloca != NULL && ldloca->opcode == MINT_LDLOCA_S) { int size = ins->data [0]; int local = ldloca->sregs [0]; @@ -2577,27 +2513,25 @@ interp_cprop (TransformData *td) ins->opcode = MINT_LDC_I8_0; else ins->opcode = MINT_INITLOCAL; - local_ref_count [sregs [0]]--; + td->var_values [sregs [0]].ref_count--; ins->dreg = local; if (td->verbose_level) { g_print ("Replace ldloca/initobj pair :\n\t"); interp_dump_ins (ins, td->data_items); } - needs_retry = TRUE; } } else if (opcode == MINT_LDOBJ_VT) { - InterpInst *ldloca = local_defs [sregs [0]].ins; + InterpInst *ldloca = td->var_values [sregs [0]].def; if (ldloca != NULL && ldloca->opcode == MINT_LDLOCA_S) { int ldsize = ins->data [0]; int local = ldloca->sregs [0]; - local_ref_count [sregs [0]]--; + td->var_values [sregs [0]].ref_count--; if (ldsize == td->vars [local].size) { // Replace LDLOCA + LDOBJ_VT with MOV_VT ins->opcode = MINT_MOV_VT; sregs [0] = local; - needs_retry = TRUE; } else { // This loads just a part of the local valuetype ins = interp_insert_ins (td, ins, MINT_MOV_SRC_OFF); @@ -2615,18 +2549,17 @@ interp_cprop (TransformData *td) } } } else if (opcode == MINT_STOBJ_VT || opcode == MINT_STOBJ_VT_NOREF) { - InterpInst *ldloca = local_defs [sregs [0]].ins; + InterpInst *ldloca = td->var_values [sregs [0]].def; if (ldloca != NULL && ldloca->opcode == MINT_LDLOCA_S) { int stsize = ins->data [0]; int local = ldloca->sregs [0]; if (stsize == td->vars [local].size) { // Replace LDLOCA 
+ STOBJ_VT with MOV_VT - local_ref_count [sregs [0]]--; + td->var_values [sregs [0]].ref_count--; ins->opcode = MINT_MOV_VT; sregs [0] = sregs [1]; ins->dreg = local; - needs_retry = TRUE; if (td->verbose_level) { g_print ("Replace ldloca/stobj_vt pair :\n\t"); @@ -2635,13 +2568,13 @@ interp_cprop (TransformData *td) } } } else if (MINT_IS_STIND (opcode)) { - InterpInst *ldloca = local_defs [sregs [0]].ins; + InterpInst *ldloca = td->var_values [sregs [0]].def; if (ldloca != NULL && ldloca->opcode == MINT_LDLOCA_S) { int local = ldloca->sregs [0]; int mt = td->vars [local].mt; if (mt != MINT_TYPE_VT) { // We have an 8 byte local, just replace the stind with a mov - local_ref_count [sregs [0]]--; + td->var_values [sregs [0]].ref_count--; // We make the assumption that the STIND matches the local type ins->opcode = GINT_TO_OPCODE (interp_get_mov_for_type (mt, TRUE)); interp_ins_set_dreg (ins, local); @@ -2651,15 +2584,14 @@ interp_cprop (TransformData *td) g_print ("Replace ldloca/stind pair :\n\t"); interp_dump_ins (ins, td->data_items); } - needs_retry = TRUE; } } } else if (MINT_IS_STFLD (opcode)) { - InterpInst *ldloca = local_defs [sregs [0]].ins; + InterpInst *ldloca = td->var_values [sregs [0]].def; if (ldloca != NULL && ldloca->opcode == MINT_LDLOCA_S) { int mt = ins->opcode - MINT_STFLD_I1; int local = ldloca->sregs [0]; - local_ref_count [sregs [0]]--; + td->var_values [sregs [0]].ref_count--; // Allow ldloca instruction to be killed if (td->vars [local].mt == (ins->opcode - MINT_STFLD_I1) && ins->data [0] == 0) { ins->opcode = GINT_TO_OPCODE (interp_get_mov_for_type (mt, FALSE)); @@ -2695,46 +2627,29 @@ interp_cprop (TransformData *td) g_print ("Replace ldloca/stfld pair (off %p) :\n\t", (void *)(uintptr_t) ldloca->il_offset); interp_dump_ins (ins, td->data_items); } - needs_retry = TRUE; } } else if (opcode == MINT_GETITEM_SPAN) { - InterpInst *ldloca = local_defs [sregs [0]].ins; + InterpInst *ldloca = td->var_values [sregs [0]].def; if (ldloca != NULL 
&& ldloca->opcode == MINT_LDLOCA_S) { int local = ldloca->sregs [0]; // Allow ldloca instruction to be killed - local_ref_count [sregs [0]]--; + td->var_values [sregs [0]].ref_count--; // Instead of loading from the indirect pointer pass directly the vt var ins->opcode = MINT_GETITEM_LOCALSPAN; sregs [0] = local; - needs_retry = TRUE; } } else if (opcode == MINT_CKNULL) { - InterpInst *def = local_defs [sregs [0]].ins; + InterpInst *def = td->var_values [sregs [0]].def; if (def && def->opcode == MINT_LDLOCA_S) { // CKNULL on LDLOCA is a NOP ins->opcode = MINT_MOV_P; - needs_retry = TRUE; } } else if (opcode == MINT_BOX) { // TODO Add more relevant opcodes - local_defs [dreg].type = VAR_VALUE_NON_NULL; + td->var_values [dreg].type = VAR_VALUE_NON_NULL; } - - ins_index++; } - - for (ins = bb->first_ins; ins != NULL; ins = ins->next) - interp_foreach_ins_var (td, ins, local_defs, clear_unused_defs); } - - needs_retry |= interp_local_deadce (td); - if (mono_interp_opt & INTERP_OPT_BBLOCKS) - needs_retry |= interp_optimize_bblocks (td); - - if (needs_retry) - goto retry; - - g_free (local_defs); } void @@ -3296,6 +3211,9 @@ interp_optimize_code (TransformData *td) MONO_TIME_TRACK (mono_interp_stats.ssa_compute_time, interp_compute_ssa (td)); + if (mono_interp_opt & INTERP_OPT_CPROP) + MONO_TIME_TRACK (mono_interp_stats.cprop_time, interp_cprop (td)); + interp_exit_ssa (td); if (td->verbose_level) { diff --git a/src/mono/mono/mini/interp/transform.c b/src/mono/mono/mini/interp/transform.c index 2d56e55d4025bb..45eb8c6a7082d3 100644 --- a/src/mono/mono/mini/interp/transform.c +++ b/src/mono/mono/mini/interp/transform.c @@ -6207,6 +6207,9 @@ generate_code (TransformData *td, MonoMethod *method, MonoMethodHeader *header, // First arg is dummy var, it is null when passed to the ctor call_args [0] = interp_create_var (td, get_type_from_stack (stack_type [ret_mt], NULL)); + // Make sure this arg is defined for SSA optimizations + interp_add_ins (td, MINT_DEF); + 
td->last_ins->dreg = call_args [0]; for (int i = 0; i < csignature->param_count; i++) { call_args [i + 1] = td->sp [i].var; } @@ -6347,6 +6350,9 @@ generate_code (TransformData *td, MonoMethod *method, MonoMethodHeader *header, push_type (td, stack_type [ret_mt], klass); push_type (td, stack_type [ret_mt], klass); } + // Make sure this arg is defined for SSA optimizations + interp_add_ins (td, MINT_DEF); + td->last_ins->dreg = td->sp [-1].var; int dreg = td->sp [-2].var; // Push back the params to top of stack. The original vars are maintained. diff --git a/src/mono/mono/mini/interp/transform.h b/src/mono/mono/mini/interp/transform.h index 207485b7f03015..fbb1b5c6994b2a 100644 --- a/src/mono/mono/mini/interp/transform.h +++ b/src/mono/mono/mini/interp/transform.h @@ -49,11 +49,10 @@ typedef struct #define VAR_VALUE_I8 3 #define VAR_VALUE_R4 4 #define VAR_VALUE_NON_NULL 5 +#define VAR_VALUE_COUNT 6 -// LocalValue contains data to construct an InterpInst that is equivalent with the contents -// of the stack slot / local / argument. typedef struct { - // Indicates the type of the stored information. It can be another local or a constant + // Indicates the type of the stored information. It can be another var or a constant int type; // Holds the local index or the actual constant value union { @@ -63,9 +62,9 @@ typedef struct { float f; }; // The instruction that writes this local. - InterpInst *ins; - int def_index; - // ref count for ins->dreg + InterpInst *def; + // The number of times this var is referenced. After optimizations + // this can become 0, in which case we can clear the def instruction. 
int ref_count; } InterpVarValue; @@ -301,6 +300,8 @@ typedef struct unsigned int renamed_fixed_vars_capacity; InterpRenamedFixedVar *renamed_fixed_vars; + InterpVarValue *var_values; + int n_data_items; int max_data_items; void **data_items; From d053d1c7dec16ed0d047e6f84420e9a5255179f9 Mon Sep 17 00:00:00 2001 From: Vlad Brezae Date: Thu, 9 Nov 2023 12:54:47 +0200 Subject: [PATCH 11/45] [mono][interp] Bring back deadce SSA vars that are not used can be removed, if their definition has no side effects. This becomes simplified from the old version since we only need to check if the ref_count is 0. When we kill an instruction we immediately decrease ref count of sregs and we redo deadce. We could recursively attempt to kill the defs of sregs but for now just redo deadce for simplicity. --- src/mono/mono/mini/interp/mintops.h | 2 +- src/mono/mono/mini/interp/transform-opt.c | 79 +++++++++++------------ src/mono/mono/mini/interp/transform.c | 3 +- src/mono/mono/mini/interp/transform.h | 3 - 4 files changed, 39 insertions(+), 48 deletions(-) diff --git a/src/mono/mono/mini/interp/mintops.h b/src/mono/mono/mini/interp/mintops.h index ec8e5f050c678d..928d073eb98a7c 100644 --- a/src/mono/mono/mini/interp/mintops.h +++ b/src/mono/mono/mini/interp/mintops.h @@ -234,7 +234,7 @@ typedef enum { #define MINT_IS_RETURN(op) (((op) >= MINT_RET && (op) <= MINT_RET_U2) || (op) == MINT_RET_I4_IMM || (op) == MINT_RET_I8_IMM) // TODO Add more -#define MINT_NO_SIDE_EFFECTS(op) (MINT_IS_MOV (op) || MINT_IS_LDC_I4 (op) || MINT_IS_LDC_I8 (op) || op == MINT_LDC_R4 || op == MINT_LDC_R8 || op == MINT_LDPTR || op == MINT_BOX) +#define MINT_NO_SIDE_EFFECTS(op) (MINT_IS_MOV (op) || MINT_IS_LDC_I4 (op) || MINT_IS_LDC_I8 (op) || op == MINT_LDC_R4 || op == MINT_LDC_R8 || op == MINT_LDPTR || op == MINT_BOX || op == MINT_INITLOCAL) #define MINT_CALL_ARGS 2 #define MINT_CALL_ARGS_SREG -2 diff --git a/src/mono/mono/mini/interp/transform-opt.c b/src/mono/mono/mini/interp/transform-opt.c index
dcd86278befe3f..6b3f14cb5e0dab 100644 --- a/src/mono/mono/mini/interp/transform-opt.c +++ b/src/mono/mono/mini/interp/transform-opt.c @@ -1672,64 +1672,53 @@ interp_optimize_bblocks (TransformData *td) return needs_cprop; } -static gboolean -interp_local_deadce (TransformData *td) +static void +decrement_ref_count (TransformData *td, int *varp, gpointer data) { - int *local_ref_count = td->local_ref_count; - gboolean needs_dce = FALSE; - gboolean needs_cprop = FALSE; + int var = *varp; + if (!var_is_ssa_form (td, var)) + return; + td->var_values [var].ref_count--; + // FIXME we could clear recursively + if (!td->var_values [var].ref_count) + *(gboolean*)data = TRUE; +} - for (unsigned int i = 0; i < td->vars_size; i++) { - g_assert (local_ref_count [i] >= 0); - g_assert (td->vars [i].indirects >= 0); - if (td->vars [i].indirects || td->vars [i].dead) - continue; - if (!local_ref_count [i]) { - needs_dce = TRUE; - td->vars [i].dead = TRUE; - } else if (!td->vars [i].unknown_use) { - if (!td->vars [i].local_only) { - // The value of this var is not passed between multiple basic blocks - td->vars [i].local_only = TRUE; - if (td->verbose_level) - g_print ("Var %d is local only\n", i); - needs_cprop = TRUE; - } - } - td->vars [i].unknown_use = FALSE; - } +static void +interp_var_deadce (TransformData *td) +{ + gboolean need_retry; - // Return early if all locals are alive - if (!needs_dce) - return needs_cprop; +retry: + need_retry = FALSE; - // Kill instructions that don't use stack and are storing into dead locals + // Kill instructions that are storing into unreferenced vars for (InterpBasicBlock *bb = td->entry_bb; bb != NULL; bb = bb->next_bb) { for (InterpInst *ins = bb->first_ins; ins != NULL; ins = ins->next) { if (MINT_NO_SIDE_EFFECTS (ins->opcode) || ins->opcode == MINT_LDLOCA_S) { int dreg = ins->dreg; - if (td->vars [dreg].dead) { + if (!var_is_ssa_form (td, dreg)) + continue; + + if (!td->var_values [dreg].ref_count) { if (td->verbose_level) { g_print 
("kill dead ins:\n\t"); interp_dump_ins (ins, td->data_items); } - - if (ins->opcode == MINT_LDLOCA_S) { + if (ins->opcode == MINT_LDLOCA_S) td->vars [ins->sregs [0]].indirects--; - if (!td->vars [ins->sregs [0]].indirects) { - // We can do cprop now through this local. Run cprop again. - needs_cprop = TRUE; - } - } + + interp_foreach_ins_svar (td, ins, &need_retry, decrement_ref_count); + interp_clear_ins (ins); - // FIXME This is lazy. We should update the ref count for the sregs and redo deadce. - needs_cprop = TRUE; } } } } - return needs_cprop; + + if (need_retry) + goto retry; } static InterpInst* @@ -1881,7 +1870,7 @@ interp_fold_unop (TransformData *td, InterpInst *ins) td->var_values [sreg].ref_count--; result.def = ins; - result.ref_count = 0; + result.ref_count = td->var_values [dreg].ref_count; // preserve ref count td->var_values [dreg] = result; return ins; @@ -2068,7 +2057,7 @@ interp_fold_binop (TransformData *td, InterpInst *ins, gboolean *folded) td->var_values [sreg1].ref_count--; td->var_values [sreg2].ref_count--; result.def = ins; - result.ref_count = 0; + result.ref_count = td->var_values [dreg].ref_count; // preserve ref count td->var_values [dreg] = result; return ins; @@ -2263,7 +2252,10 @@ interp_cprop (TransformData *td) if (td->verbose_level) g_print ("\nCPROP:\n"); - td->var_values = (InterpVarValue*) mono_mempool_alloc (td->mempool, td->vars_size * sizeof (InterpVarValue)); + // FIXME + // There is no need to zero, if we pay attention to phi args vars. They + // can be used before the definition. + td->var_values = (InterpVarValue*) mono_mempool_alloc0 (td->mempool, td->vars_size * sizeof (InterpVarValue)); // Traverse in dfs order. This guarantees that we always reach the definition first before the // use of the var. 
Exception is only for phi nodes, where we don't care about the definition @@ -2323,7 +2315,6 @@ interp_cprop (TransformData *td) InterpVarValue *dval = &td->var_values [dreg]; dval->type = VAR_VALUE_NONE; dval->def = ins; - dval->ref_count = 0; } // We always store to the full i4, except as part of STIND opcodes. These opcodes can be @@ -3214,6 +3205,8 @@ interp_optimize_code (TransformData *td) if (mono_interp_opt & INTERP_OPT_CPROP) MONO_TIME_TRACK (mono_interp_stats.cprop_time, interp_cprop (td)); + interp_var_deadce (td); + interp_exit_ssa (td); if (td->verbose_level) { diff --git a/src/mono/mono/mini/interp/transform.c b/src/mono/mono/mini/interp/transform.c index 45eb8c6a7082d3..0d5be47fb6da6f 100644 --- a/src/mono/mono/mini/interp/transform.c +++ b/src/mono/mono/mini/interp/transform.c @@ -1938,7 +1938,8 @@ interp_handle_intrinsics (TransformData *td, MonoMethod *target_method, MonoClas !strcmp (tm, "ClearWithReferences")) { *op = MINT_INTRINS_CLEAR_WITH_REFERENCES; } else if (in_corlib && !strcmp (klass_name_space, "System") && !strcmp (klass_name, "Marvin")) { - if (!strcmp (tm, "Block")) { + // FIXME + if (!strcmp (tm, "Block") && 0) { InterpInst *ldloca2 = td->last_ins; if (ldloca2 != NULL && ldloca2->opcode == MINT_LDLOCA_S) { InterpInst *ldloca1 = interp_prev_ins (ldloca2); diff --git a/src/mono/mono/mini/interp/transform.h b/src/mono/mono/mini/interp/transform.h index fbb1b5c6994b2a..3747f5a78fcd82 100644 --- a/src/mono/mono/mini/interp/transform.h +++ b/src/mono/mono/mini/interp/transform.h @@ -217,13 +217,10 @@ typedef struct { InterpInst *def; }; - guint dead : 1; guint execution_stack : 1; guint call_args : 1; guint global : 1; guint no_call_args : 1; - guint unknown_use : 1; - guint local_only : 1; guint simd : 1; // We use this flag to avoid addition of align field in InterpVar, for now guint no_ssa : 1; // Var is not in ssa form, not subject to optimizations guint il_global : 1; // Args and IL locals From 
c210c2fa31df0f51737df6b9383a273d499a35f4 Mon Sep 17 00:00:00 2001 From: Vlad Brezae Date: Thu, 9 Nov 2023 13:50:20 +0200 Subject: [PATCH 12/45] [mono][interp] Resurrect super instruction pass This now operates on the global definitions and ref counts that were previously set during cprop and stored in `InterpVarValue`. --- src/mono/mono/mini/interp/mintops.h | 2 +- src/mono/mono/mini/interp/transform-opt.c | 119 ++++++++++++---------- src/mono/mono/mini/interp/transform.c | 3 - src/mono/mono/mini/interp/transform.h | 3 - 4 files changed, 66 insertions(+), 61 deletions(-) diff --git a/src/mono/mono/mini/interp/mintops.h b/src/mono/mono/mini/interp/mintops.h index 928d073eb98a7c..b75d321479111d 100644 --- a/src/mono/mono/mini/interp/mintops.h +++ b/src/mono/mono/mini/interp/mintops.h @@ -208,7 +208,7 @@ typedef enum { #define MINT_SWITCH_LEN(n) (4 + (n) * 2) -#define MINT_IS_NOP(op) ((op) == MINT_NOP || (op) == MINT_DEF || (op) == MINT_DUMMY_USE || (op) == MINT_IL_SEQ_POINT) +#define MINT_IS_NOP(op) ((op) == MINT_NOP || (op) == MINT_DEF || (op) == MINT_DEF_ARG || (op) == MINT_DUMMY_USE || (op) == MINT_IL_SEQ_POINT) #define MINT_IS_MOV(op) ((op) >= MINT_MOV_I4_I1 && (op) <= MINT_MOV_VT) #define MINT_IS_UNCONDITIONAL_BRANCH(op) ((op) >= MINT_BR && (op) <= MINT_CALL_HANDLER_S) #define MINT_IS_CONDITIONAL_BRANCH(op) ((op) >= MINT_BRFALSE_I4 && (op) <= MINT_BLT_UN_R8_S) diff --git a/src/mono/mono/mini/interp/transform-opt.c b/src/mono/mono/mini/interp/transform-opt.c index 6b3f14cb5e0dab..788b32a82b8d7b 100644 --- a/src/mono/mono/mini/interp/transform-opt.c +++ b/src/mono/mono/mini/interp/transform-opt.c @@ -2652,8 +2652,12 @@ mono_test_interp_cprop (TransformData *td) static gboolean get_sreg_imm (TransformData *td, int sreg, gint16 *imm, int result_mt) { - InterpInst *def = td->vars [sreg].def; - if (def != NULL && td->local_ref_count [sreg] == 1) { + if (!var_is_ssa_form (td, sreg)) + return FALSE; + InterpVarValue *sreg_val = &td->var_values [sreg]; + InterpInst *def = sreg_val->def; +
g_assert (def); + if (sreg_val->ref_count == 1) { gint64 ct; if (MINT_IS_LDC_I4 (def->opcode)) ct = interp_get_const_from_ldc_i4 (def); @@ -2764,39 +2768,32 @@ get_unop_condbr_sp (int opcode) static void interp_super_instructions (TransformData *td) { - InterpBasicBlock *bb; - int *local_ref_count = td->local_ref_count; - interp_compute_native_offset_estimates (td); // Add some actual super instructions - for (bb = td->entry_bb; bb != NULL; bb = bb->next_bb) { - InterpInst *ins; - int noe; + for (int bb_dfs_index = 0; bb_dfs_index < td->bblocks_count; bb_dfs_index++) { + InterpBasicBlock *bb = td->bblocks [bb_dfs_index]; // Set cbb since we do some instruction inserting below td->cbb = bb; - noe = bb->native_offset_estimate; - for (ins = bb->first_ins; ins != NULL; ins = ins->next) { + int noe = bb->native_offset_estimate; + for (InterpInst *ins = bb->first_ins; ins != NULL; ins = ins->next) { int opcode = ins->opcode; if (MINT_IS_NOP (opcode)) continue; - if (mono_interp_op_dregs [opcode] && !td->vars [ins->dreg].global) - td->vars [ins->dreg].def = ins; if (opcode == MINT_RET || (opcode >= MINT_RET_I1 && opcode <= MINT_RET_U2)) { // ldc + ret -> ret.imm int sreg = ins->sregs [0]; gint16 imm; if (get_sreg_imm (td, sreg, &imm, (opcode == MINT_RET) ? MINT_TYPE_I2 : opcode - MINT_RET_I1)) { - InterpInst *def = td->vars [sreg].def; + InterpInst *def = td->var_values [sreg].def; int ret_op = MINT_IS_LDC_I4 (def->opcode) ? 
MINT_RET_I4_IMM : MINT_RET_I8_IMM; InterpInst *new_inst = interp_insert_ins (td, ins, ret_op); new_inst->data [0] = imm; interp_clear_ins (def); interp_clear_ins (ins); - local_ref_count [sreg]--; - + td->var_values [sreg].ref_count--; // 0 if (td->verbose_level) { g_print ("superins: "); interp_dump_ins (new_inst, td->data_items); @@ -2827,9 +2824,10 @@ interp_super_instructions (TransformData *td) new_inst->dreg = ins->dreg; new_inst->sregs [0] = sreg; new_inst->data [0] = imm; - interp_clear_ins (td->vars [sreg_imm].def); + interp_clear_ins (td->var_values [sreg_imm].def); interp_clear_ins (ins); - local_ref_count [sreg_imm]--; + td->var_values [sreg_imm].ref_count--; // 0 + td->var_values [new_inst->dreg].def = new_inst; if (td->verbose_level) { g_print ("superins: "); interp_dump_ins (new_inst, td->data_items); @@ -2845,9 +2843,10 @@ interp_super_instructions (TransformData *td) new_inst->dreg = ins->dreg; new_inst->sregs [0] = ins->sregs [0]; new_inst->data [0] = -imm; - interp_clear_ins (td->vars [sreg_imm].def); + interp_clear_ins (td->var_values [sreg_imm].def); interp_clear_ins (ins); - local_ref_count [sreg_imm]--; + td->var_values [sreg_imm].ref_count--; // 0 + td->var_values [new_inst->dreg].def = new_inst; if (td->verbose_level) { g_print ("superins: "); interp_dump_ins (new_inst, td->data_items); @@ -2855,8 +2854,8 @@ interp_super_instructions (TransformData *td) } } else if (opcode == MINT_MUL_I4_IMM || opcode == MINT_MUL_I8_IMM) { int sreg = ins->sregs [0]; - InterpInst *def = td->vars [sreg].def; - if (def != NULL && td->local_ref_count [sreg] == 1) { + InterpInst *def = td->var_values [sreg].def; + if (def != NULL && td->var_values [sreg].ref_count == 1) { gboolean is_i4 = opcode == MINT_MUL_I4_IMM; if ((is_i4 && def->opcode == MINT_ADD_I4_IMM) || (!is_i4 && def->opcode == MINT_ADD_I8_IMM)) { @@ -2867,7 +2866,8 @@ interp_super_instructions (TransformData *td) new_inst->data [1] = ins->data [0]; interp_clear_ins (def); interp_clear_ins (ins); - 
local_ref_count [sreg]--; + td->var_values [sreg].ref_count--; // 0 + td->var_values [new_inst->dreg].def = new_inst; if (td->verbose_level) { g_print ("superins: "); interp_dump_ins (new_inst, td->data_items); @@ -2884,17 +2884,18 @@ interp_super_instructions (TransformData *td) new_inst->dreg = ins->dreg; new_inst->sregs [0] = ins->sregs [0]; new_inst->data [0] = imm; - interp_clear_ins (td->vars [sreg_imm].def); + interp_clear_ins (td->var_values [sreg_imm].def); interp_clear_ins (ins); - local_ref_count [sreg_imm]--; + td->var_values [sreg_imm].ref_count--; // 0 + td->var_values [new_inst->dreg].def = new_inst; if (td->verbose_level) { g_print ("superins: "); interp_dump_ins (new_inst, td->data_items); } } else if (opcode == MINT_SHL_I4 || opcode == MINT_SHL_I8) { int amount_var = ins->sregs [1]; - InterpInst *amount_def = td->vars [amount_var].def; - if (amount_def != NULL && td->local_ref_count [amount_var] == 1 && amount_def->opcode == MINT_AND_I4) { + InterpInst *amount_def = td->var_values [amount_var].def; + if (amount_def != NULL && td->var_values [amount_var].ref_count == 1 && amount_def->opcode == MINT_AND_I4) { int mask_var = amount_def->sregs [1]; if (get_sreg_imm (td, mask_var, &imm, MINT_TYPE_I2)) { // ldc + and + shl -> shl_and_imm @@ -2910,10 +2911,11 @@ interp_super_instructions (TransformData *td) new_inst->sregs [0] = ins->sregs [0]; new_inst->sregs [1] = amount_def->sregs [0]; - local_ref_count [amount_var]--; - local_ref_count [mask_var]--; + td->var_values [amount_var].ref_count--; // 0 + td->var_values [mask_var].ref_count--; // 0 + td->var_values [new_inst->dreg].def = new_inst; - interp_clear_ins (td->vars [mask_var].def); + interp_clear_ins (td->var_values [mask_var].def); interp_clear_ins (amount_def); interp_clear_ins (ins); if (td->verbose_level) { @@ -2927,8 +2929,8 @@ interp_super_instructions (TransformData *td) } else if (opcode == MINT_DIV_UN_I4 || opcode == MINT_DIV_UN_I8) { // ldc + div.un -> shr.imm int sreg_imm = ins->sregs 
[1]; - InterpInst *def = td->vars [sreg_imm].def; - if (def != NULL && td->local_ref_count [sreg_imm] == 1) { + InterpInst *def = td->var_values [sreg_imm].def; + if (def != NULL && td->var_values [sreg_imm].ref_count == 1) { int power2 = -1; if (MINT_IS_LDC_I4 (def->opcode)) { guint32 ct = interp_get_const_from_ldc_i4 (def); @@ -2950,7 +2952,8 @@ interp_super_instructions (TransformData *td) interp_clear_ins (def); interp_clear_ins (ins); - local_ref_count [sreg_imm]--; + td->var_values [sreg_imm].ref_count--; + td->var_values [new_inst->dreg].def = new_inst; if (td->verbose_level) { g_print ("lower div.un: "); interp_dump_ins (new_inst, td->data_items); @@ -2959,8 +2962,8 @@ interp_super_instructions (TransformData *td) } } else if (MINT_IS_LDIND_INT (opcode)) { int sreg_base = ins->sregs [0]; - InterpInst *def = td->vars [sreg_base].def; - if (def != NULL && td->local_ref_count [sreg_base] == 1) { + InterpInst *def = td->var_values [sreg_base].def; + if (def != NULL && td->var_values [sreg_base].ref_count == 1) { InterpInst *new_inst = NULL; if (def->opcode == MINT_ADD_P) { int ldind_offset_op = MINT_LDIND_OFFSET_I1 + (opcode - MINT_LDIND_I1); @@ -2978,7 +2981,8 @@ interp_super_instructions (TransformData *td) if (new_inst) { interp_clear_ins (def); interp_clear_ins (ins); - local_ref_count [sreg_base]--; + td->var_values [sreg_base].ref_count--; + td->var_values [new_inst->dreg].def = new_inst; if (td->verbose_level) { g_print ("superins: "); interp_dump_ins (new_inst, td->data_items); @@ -2987,8 +2991,8 @@ interp_super_instructions (TransformData *td) } } else if (MINT_IS_LDIND_OFFSET (opcode)) { int sreg_off = ins->sregs [1]; - InterpInst *def = td->vars [sreg_off].def; - if (def != NULL && td->local_ref_count [sreg_off] == 1) { + InterpInst *def = td->var_values [sreg_off].def; + if (def != NULL && td->var_values [sreg_off].ref_count == 1) { if (def->opcode == MINT_MUL_P_IMM || def->opcode == MINT_ADD_P_IMM || def->opcode == MINT_ADD_MUL_P_IMM) { int 
ldind_offset_op = MINT_LDIND_OFFSET_ADD_MUL_IMM_I1 + (opcode - MINT_LDIND_OFFSET_I1); InterpInst *new_inst = interp_insert_ins (td, ins, ldind_offset_op); @@ -3014,17 +3018,18 @@ interp_super_instructions (TransformData *td) interp_clear_ins (def); interp_clear_ins (ins); - local_ref_count [sreg_off]--; + td->var_values [sreg_off].ref_count--; // 0 + td->var_values [new_inst->dreg].def = new_inst; if (td->verbose_level) { - g_print ("method %s:%s, superins: ", m_class_get_name (td->method->klass), td->method->name); + g_print ("superins: "); interp_dump_ins (new_inst, td->data_items); } } } } else if (MINT_IS_STIND_INT (opcode)) { int sreg_base = ins->sregs [0]; - InterpInst *def = td->vars [sreg_base].def; - if (def != NULL && td->local_ref_count [sreg_base] == 1) { + InterpInst *def = td->var_values [sreg_base].def; + if (def != NULL && td->var_values [sreg_base].ref_count == 1) { InterpInst *new_inst = NULL; if (def->opcode == MINT_ADD_P) { int stind_offset_op = MINT_STIND_OFFSET_I1 + (opcode - MINT_STIND_I1); @@ -3042,7 +3047,7 @@ interp_super_instructions (TransformData *td) if (new_inst) { interp_clear_ins (def); interp_clear_ins (ins); - local_ref_count [sreg_base]--; + td->var_values [sreg_base].ref_count--; if (td->verbose_level) { g_print ("superins: "); interp_dump_ins (new_inst, td->data_items); @@ -3055,16 +3060,16 @@ interp_super_instructions (TransformData *td) // when inlining property accessors. We should have more advanced cknull removal // optimzations, so we can catch cases where instructions are not next to each other. 
int obj_sreg = ins->sregs [0]; - InterpInst *def = td->vars [obj_sreg].def; + InterpInst *def = td->var_values [obj_sreg].def; if (def != NULL && def->opcode == MINT_CKNULL && interp_prev_ins (ins) == def && - def->dreg == obj_sreg && local_ref_count [obj_sreg] == 1) { + def->dreg == obj_sreg && td->var_values [obj_sreg].ref_count == 1) { if (td->verbose_level) { - g_print ("remove redundant cknull (%s): ", td->method->name); + g_print ("remove redundant cknull: "); interp_dump_ins (def, td->data_items); } ins->sregs [0] = def->sregs [0]; interp_clear_ins (def); - local_ref_count [obj_sreg]--; + td->var_values [obj_sreg].ref_count--; } } else if (MINT_IS_BINOP_CONDITIONAL_BRANCH (opcode) && interp_is_short_offset (noe, ins->info.target_bb->native_offset_estimate)) { gint16 imm; @@ -3080,9 +3085,9 @@ interp_super_instructions (TransformData *td) new_ins->sregs [0] = ins->sregs [0]; new_ins->data [0] = imm; new_ins->info.target_bb = ins->info.target_bb; - interp_clear_ins (td->vars [sreg_imm].def); + interp_clear_ins (td->var_values [sreg_imm].def); interp_clear_ins (ins); - local_ref_count [sreg_imm]--; + td->var_values [sreg_imm].ref_count--; // 0 if (td->verbose_level) { g_print ("superins: "); interp_dump_ins (new_ins, td->data_items); @@ -3106,8 +3111,8 @@ interp_super_instructions (TransformData *td) if (opcode == MINT_BRFALSE_I4 || opcode == MINT_BRTRUE_I4) { gboolean negate = opcode == MINT_BRFALSE_I4; int cond_sreg = ins->sregs [0]; - InterpInst *def = td->vars [cond_sreg].def; - if (def != NULL && local_ref_count [cond_sreg] == 1) { + InterpInst *def = td->var_values [cond_sreg].def; + if (def != NULL && td->var_values [cond_sreg].ref_count == 1) { int replace_opcode = -1; switch (def->opcode) { case MINT_CEQ_I4: replace_opcode = negate ? 
MINT_BNE_UN_I4 : MINT_BEQ_I4; break; @@ -3141,7 +3146,7 @@ interp_super_instructions (TransformData *td) if (def->opcode != MINT_CEQ0_I4) ins->sregs [1] = def->sregs [1]; interp_clear_ins (def); - local_ref_count [cond_sreg]--; + td->var_values [cond_sreg].ref_count--; if (td->verbose_level) { g_print ("superins: "); interp_dump_ins (ins, td->data_items); @@ -3166,8 +3171,8 @@ interp_super_instructions (TransformData *td) } } else if (opcode == MINT_STOBJ_VT_NOREF) { int sreg_src = ins->sregs [1]; - InterpInst *def = td->vars [sreg_src].def; - if (def != NULL && interp_prev_ins (ins) == def && def->opcode == MINT_LDOBJ_VT && ins->data [0] == def->data [0] && td->local_ref_count [sreg_src] == 1) { + InterpInst *def = td->var_values [sreg_src].def; + if (def != NULL && interp_prev_ins (ins) == def && def->opcode == MINT_LDOBJ_VT && ins->data [0] == def->data [0] && td->var_values [sreg_src].ref_count == 1) { InterpInst *new_inst = interp_insert_ins (td, ins, MINT_CPOBJ_VT_NOREF); new_inst->sregs [0] = ins->sregs [0]; // dst new_inst->sregs [1] = def->sregs [0]; // src @@ -3175,7 +3180,7 @@ interp_super_instructions (TransformData *td) interp_clear_ins (def); interp_clear_ins (ins); - local_ref_count [sreg_src]--; + td->var_values [sreg_src].ref_count--; if (td->verbose_level) { g_print ("superins: "); interp_dump_ins (new_inst, td->data_items); @@ -3207,6 +3212,12 @@ interp_optimize_code (TransformData *td) interp_var_deadce (td); + // We run this after var deadce to detect more single use vars. This pass will clear + // unnecessary instruction on the fly so deadce is no longer needed to run. 
+ if ((mono_interp_opt & INTERP_OPT_SUPER_INSTRUCTIONS) && + (mono_interp_opt & INTERP_OPT_CPROP)) + MONO_TIME_TRACK (mono_interp_stats.super_instructions_time, interp_super_instructions (td)); + interp_exit_ssa (td); if (td->verbose_level) { diff --git a/src/mono/mono/mini/interp/transform.c b/src/mono/mono/mini/interp/transform.c index 0d5be47fb6da6f..0cffc32d89f3ba 100644 --- a/src/mono/mono/mini/interp/transform.c +++ b/src/mono/mono/mini/interp/transform.c @@ -435,7 +435,6 @@ interp_create_var_explicit (TransformData *td, MonoType *type, int size) local->live_start = -1; local->bb_index = -1; local->ext_index = -1; - local->def = NULL; td->vars_size++; return td->vars_size - 1; @@ -4308,7 +4307,6 @@ interp_method_compute_offsets (TransformData *td, InterpMethod *imethod, MonoMet td->vars [i].il_global = TRUE; td->vars [i].indirects = 0; td->vars [i].mt = mt; - td->vars [i].def = NULL; td->vars [i].ext_index = -1; size = mono_interp_type_size (type, mt, &align); td->vars [i].size = size; @@ -4337,7 +4335,6 @@ interp_method_compute_offsets (TransformData *td, InterpMethod *imethod, MonoMet td->vars [index].il_global = TRUE; td->vars [index].indirects = 0; td->vars [index].mt = mono_mint_type (header->locals [i]); - td->vars [index].def = NULL; td->vars [index].ext_index = -1; td->vars [index].size = size; // Every local takes a MINT_STACK_SLOT_SIZE so IL locals have same behavior as execution locals diff --git a/src/mono/mono/mini/interp/transform.h b/src/mono/mono/mini/interp/transform.h index 3747f5a78fcd82..f4d2e9b5ecc86c 100644 --- a/src/mono/mono/mini/interp/transform.h +++ b/src/mono/mono/mini/interp/transform.h @@ -212,9 +212,6 @@ typedef struct { // If var is INTERP_LOCAL_FLAG_CALL_ARGS, this is the call instruction using it. // Only used during var offset allocator InterpInst *call; - // For local vars, this represents the instruction declaring it. - // Only used during super instruction pass. 
- InterpInst *def; }; guint execution_stack : 1; From ca9094bcb4da07f5337ef7c21070b27c7010adad Mon Sep 17 00:00:00 2001 From: Vlad Brezae Date: Thu, 9 Nov 2023 14:02:11 +0200 Subject: [PATCH 13/45] [mono][interp] Redo bblock opt pass after other optimizations This will remove some now dead bblocks and merge others. If this pass leads to removal of phi nodes, then it might be useful to redo a full ssa iteration. --- src/mono/mono/mini/interp/transform-opt.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/mono/mono/mini/interp/transform-opt.c b/src/mono/mono/mini/interp/transform-opt.c index 788b32a82b8d7b..8c5ab61cbbd043 100644 --- a/src/mono/mono/mini/interp/transform-opt.c +++ b/src/mono/mono/mini/interp/transform-opt.c @@ -3220,6 +3220,9 @@ interp_optimize_code (TransformData *td) interp_exit_ssa (td); + if (mono_interp_opt & INTERP_OPT_BBLOCKS) + MONO_TIME_TRACK (mono_interp_stats.optimize_bblocks_time, interp_optimize_bblocks (td)); + if (td->verbose_level) { g_print ("\nOptimized IR:\n"); mono_interp_print_td_code (td); From 5ab17553f80f1e05955f223540c74f88b93f7292 Mon Sep 17 00:00:00 2001 From: Vlad Brezae Date: Thu, 9 Nov 2023 15:02:17 +0200 Subject: [PATCH 14/45] [mono][interp] Re-enable MINT_INTRINS_MARVIN_BLOCK intrinsic This instruction behaves as having 2 dregs and 2 sregs. This is not supported by the compilation backend. To work around this, while preserving the full set of optimizations applicable to the args, we split the instruction into 2 different instructions with separate dregs. Then we copy back each of the dregs into the original args. We then let SSA transformation and optimizations work normally. When generating final code, we expect these two instructions to still be adjacent and then we generate a single instruction, which now has to have 2 dregs and 2 sregs. 
The optimization passes can't optimize away both generated moves, because they don't understand that they will be merged into a single instruction with 'simultaneous' write to both dregs. We handle this case when emitting the intrinsic, looking for a move following the intrinsic so we can patch the destination. --- src/mono/mono/mini/interp/interp-intrins.c | 6 +- src/mono/mono/mini/interp/interp-intrins.h | 2 +- src/mono/mono/mini/interp/interp.c | 9 ++- src/mono/mono/mini/interp/mintops.def | 5 +- src/mono/mono/mini/interp/transform.c | 88 +++++++++++++++++----- 5 files changed, 86 insertions(+), 24 deletions(-) diff --git a/src/mono/mono/mini/interp/interp-intrins.c b/src/mono/mono/mini/interp/interp-intrins.c index c9e7b5ba2a7e27..1489ddfa09c4a2 100644 --- a/src/mono/mono/mini/interp/interp-intrins.c +++ b/src/mono/mono/mini/interp/interp-intrins.c @@ -16,7 +16,7 @@ rotate_left (guint32 value, int offset) } void -interp_intrins_marvin_block (guint32 *pp0, guint32 *pp1) +interp_intrins_marvin_block (guint32 *pp0, guint32 *pp1, guint32 *dest0, guint32 *dest1) { // Marvin.Block guint32 p0 = *pp0; @@ -34,8 +34,8 @@ interp_intrins_marvin_block (guint32 *pp0, guint32 *pp1) p0 += p1; p1 = rotate_left (p1, 19); - *pp0 = p0; - *pp1 = p1; + *dest0 = p0; + *dest1 = p1; } guint32 diff --git a/src/mono/mono/mini/interp/interp-intrins.h b/src/mono/mono/mini/interp/interp-intrins.h index 1e3b218af927a5..98229f26d41361 100644 --- a/src/mono/mono/mini/interp/interp-intrins.h +++ b/src/mono/mono/mini/interp/interp-intrins.h @@ -124,7 +124,7 @@ interp_intrins_popcount_i8 (guint64 val) #endif void -interp_intrins_marvin_block (guint32 *pp0, guint32 *pp1); +interp_intrins_marvin_block (guint32 *pp0, guint32 *pp1, guint32 *dest0, guint32 *dest1); guint32 interp_intrins_ascii_chars_to_uppercase (guint32 val); diff --git a/src/mono/mono/mini/interp/interp.c b/src/mono/mono/mini/interp/interp.c index 6e62cfe8844486..161af907dcd19a 100644 --- a/src/mono/mono/mini/interp/interp.c +++ 
b/src/mono/mono/mini/interp/interp.c @@ -5937,8 +5937,13 @@ MINT_IN_CASE(MINT_BRTRUE_I8_SP) ZEROP_SP(gint64, !=); MINT_IN_BREAK; MINT_IN_BREAK; } MINT_IN_CASE(MINT_INTRINS_MARVIN_BLOCK) { - interp_intrins_marvin_block ((guint32*)(locals + ip [1]), (guint32*)(locals + ip [2])); - ip += 3; + guint32 *pp0 = (guint32*)(locals + ip [1]); + guint32 *pp1 = (guint32*)(locals + ip [2]); + guint32 *dest0 = (guint32*)(locals + ip [3]); + guint32 *dest1 = (guint32*)(locals + ip [4]); + + interp_intrins_marvin_block (pp0, pp1, dest0, dest1); + ip += 5; MINT_IN_BREAK; } MINT_IN_CASE(MINT_INTRINS_ASCII_CHARS_TO_UPPERCASE) { diff --git a/src/mono/mono/mini/interp/mintops.def b/src/mono/mono/mini/interp/mintops.def index f26a9ee732d1b9..f5b85001172145 100644 --- a/src/mono/mono/mini/interp/mintops.def +++ b/src/mono/mono/mini/interp/mintops.def @@ -815,7 +815,8 @@ OPDEF(MINT_INTRINS_GET_TYPE, "intrins_get_type", 3, 1, 1, MintOpNoArgs) OPDEF(MINT_INTRINS_SPAN_CTOR, "intrins_span_ctor", 4, 1, 2, MintOpNoArgs) OPDEF(MINT_INTRINS_RUNTIMEHELPERS_OBJECT_HAS_COMPONENT_SIZE, "intrins_runtimehelpers_object_has_component_size", 3, 1, 1, MintOpNoArgs) OPDEF(MINT_INTRINS_CLEAR_WITH_REFERENCES, "intrin_clear_with_references", 3, 0, 2, MintOpNoArgs) -OPDEF(MINT_INTRINS_MARVIN_BLOCK, "intrins_marvin_block", 3, 0, 2, MintOpNoArgs) +// This actually has 2 dregs and 2 sregs. 
Dregs are displayed as the metadata +OPDEF(MINT_INTRINS_MARVIN_BLOCK, "intrins_marvin_block", 5, 0, 2, MintOpTwoShorts) OPDEF(MINT_INTRINS_ASCII_CHARS_TO_UPPERCASE, "intrins_ascii_chars_to_uppercase", 3, 1, 1, MintOpNoArgs) OPDEF(MINT_INTRINS_MEMORYMARSHAL_GETARRAYDATAREF, "intrins_memorymarshal_getarraydataref", 3, 1, 1, MintOpNoArgs) OPDEF(MINT_INTRINS_ORDINAL_IGNORE_CASE_ASCII, "intrins_ordinal_ignore_case_ascii", 4, 1, 2, MintOpNoArgs) @@ -842,6 +843,8 @@ IROPDEF(MINT_TIER_PATCHPOINT_DATA, "tier_patchpoint_data", 2, 0, 0, MintOpShortI IROPDEF(MINT_MOV_SRC_OFF, "mov.src.off", 6, 1, 1, MintOpTwoShorts) IROPDEF(MINT_MOV_DST_OFF, "mov.dst.off", 6, 1, 1, MintOpTwoShorts) IROPDEF(MINT_PHI, "phi", 2, 1, 0, MintOpNoArgs) +IROPDEF(MINT_INTRINS_MARVIN_BLOCK_SSA1, "intrins_marvin_block_ssa1", 4, 1, 2, MintOpNoArgs) +IROPDEF(MINT_INTRINS_MARVIN_BLOCK_SSA2, "intrins_marvin_block_ssa2", 4, 1, 2, MintOpNoArgs) #ifdef __DEFINED_IROPDEF__ #undef IROPDEF diff --git a/src/mono/mono/mini/interp/transform.c b/src/mono/mono/mini/interp/transform.c index 0cffc32d89f3ba..d6954ef97382c7 100644 --- a/src/mono/mono/mini/interp/transform.c +++ b/src/mono/mono/mini/interp/transform.c @@ -1937,29 +1937,46 @@ interp_handle_intrinsics (TransformData *td, MonoMethod *target_method, MonoClas !strcmp (tm, "ClearWithReferences")) { *op = MINT_INTRINS_CLEAR_WITH_REFERENCES; } else if (in_corlib && !strcmp (klass_name_space, "System") && !strcmp (klass_name, "Marvin")) { - // FIXME - if (!strcmp (tm, "Block") && 0) { + if (!strcmp (tm, "Block")) { InterpInst *ldloca2 = td->last_ins; if (ldloca2 != NULL && ldloca2->opcode == MINT_LDLOCA_S) { InterpInst *ldloca1 = interp_prev_ins (ldloca2); if (ldloca1 != NULL && ldloca1->opcode == MINT_LDLOCA_S) { - interp_add_ins (td, MINT_INTRINS_MARVIN_BLOCK); - td->last_ins->sregs [0] = ldloca1->sregs [0]; - td->last_ins->sregs [1] = ldloca2->sregs [0]; - - // This intrinsic would normally receive two local refs, however, we try optimizing - // away both ldlocas 
for better codegen. This means that this intrinsic will instead - // modify the values of both sregs. In order to not overcomplicate the optimization - // passes and offset allocator with support for modifiable sregs or multi dregs, we - // just redefine both sregs after the intrinsic. - interp_add_ins (td, MINT_DEF); - td->last_ins->dreg = ldloca1->sregs [0]; - interp_add_ins (td, MINT_DEF); - td->last_ins->dreg = ldloca2->sregs [0]; + int var1 = ldloca1->sregs [0]; + int var2 = ldloca2->sregs [0]; + if (!td->optimized) { + interp_add_ins (td, MINT_INTRINS_MARVIN_BLOCK); + td->last_ins->sregs [0] = var1; + td->last_ins->sregs [1] = var2; + td->last_ins->data [0] = GINT_TO_UINT16 (var1); + td->last_ins->data [1] = GINT_TO_UINT16 (var2); + } else { + // Convert this instruction to SSA form by splitting it into 2 different + // single dreg instructions. When we generate final code, we will couple them + // together. + int result1 = interp_create_var (td, m_class_get_byval_arg (mono_defaults.uint32_class)); + int result2 = interp_create_var (td, m_class_get_byval_arg (mono_defaults.uint32_class)); + interp_add_ins (td, MINT_INTRINS_MARVIN_BLOCK_SSA1); + td->last_ins->sregs [0] = var1; + td->last_ins->sregs [1] = var2; + td->last_ins->dreg = result1; + + interp_add_ins (td, MINT_INTRINS_MARVIN_BLOCK_SSA2); + td->last_ins->sregs [0] = var1; + td->last_ins->sregs [1] = var2; + td->last_ins->dreg = result2; + + interp_add_ins (td, MINT_MOV_4); + td->last_ins->sregs [0] = result1; + td->last_ins->dreg = var1; + interp_add_ins (td, MINT_MOV_4); + td->last_ins->sregs [0] = result2; + td->last_ins->dreg = var2; + } // Remove the ldlocas - td->vars [ldloca1->sregs [0]].indirects--; - td->vars [ldloca2->sregs [0]].indirects--; + td->vars [var1].indirects--; + td->vars [var2].indirects--; interp_clear_ins (ldloca1); interp_clear_ins (ldloca2); td->sp -= 2; @@ -8604,6 +8621,43 @@ emit_compacted_instruction (TransformData *td, guint16* start_ip, InterpInst *in *ip++ = 
GINT_TO_UINT16 (td->param_area_offset + ins->data [0]); *ip++ = GINT_TO_UINT16 (ins->data [1]); *ip++ = GINT_TO_UINT16 (ins->data [2]); + } else if (opcode == MINT_INTRINS_MARVIN_BLOCK) { + // Generated only in unoptimized code + int var0 = ins->sregs [0]; + int var1 = ins->sregs [1]; + g_assert (var0 == ins->data [0]); + g_assert (var1 == ins->data [1]); + + *ip++ = GINT_TO_UINT16 (get_local_offset (td, var0)); + *ip++ = GINT_TO_UINT16 (get_local_offset (td, var1)); + *ip++ = GINT_TO_UINT16 (get_local_offset (td, var0)); + *ip++ = GINT_TO_UINT16 (get_local_offset (td, var1)); + } else if (opcode == MINT_INTRINS_MARVIN_BLOCK_SSA1) { + int var0 = ins->sregs [0]; + int var1 = ins->sregs [1]; + g_assert (ins->next->opcode == MINT_INTRINS_MARVIN_BLOCK_SSA2); + g_assert (var0 == ins->next->sregs [0]); + g_assert (var1 == ins->next->sregs [1]); + int dvar0 = ins->dreg; + int dvar1 = ins->next->dreg; + ip [-1] = MINT_INTRINS_MARVIN_BLOCK; + *ip++ = GINT_TO_UINT16 (get_local_offset (td, var0)); + *ip++ = GINT_TO_UINT16 (get_local_offset (td, var1)); + *ip++ = GINT_TO_UINT16 (get_local_offset (td, dvar0)); + *ip++ = GINT_TO_UINT16 (get_local_offset (td, dvar1)); + + ins->next->opcode = MINT_NOP; + InterpInst *next = interp_next_ins (ins); + // We ensure that next->sregs [0] is not used again, it will no longer be set by intrinsic + if (next->opcode == MINT_MOV_4 && td->var_values && td->var_values [next->sregs [0]].ref_count == 1) { + if (next->sregs [0] == dvar0) { + ip [-2] = GINT_TO_UINT16 (get_local_offset (td, next->dreg)); + next->opcode = MINT_NOP; + } else if (next->sregs [0] == dvar1) { + ip [-1] = GINT_TO_UINT16 (get_local_offset (td, next->dreg)); + next->opcode = MINT_NOP; + } + } } else { opcode_emit: if (mono_interp_op_dregs [opcode]) From b5d3bdd5c5e553ec44be48c456fc9919cea6b164 Mon Sep 17 00:00:00 2001 From: Vlad Brezae Date: Thu, 9 Nov 2023 15:16:16 +0200 Subject: [PATCH 15/45] [mono][interp] Bring back reverse propagation of dreg If we have code pattern of 
`var1 <- def`, `var2 <- var1` we can opt to replace it with `var2 <- ?`, `var1 <- var2`. We do this in two stages, during cprop and during super instruction pass. During cprop we don't know exactly if a var will be alive or not, but we can assume that ssa fixed vars will likely end up alive. Also they have optimization constraints, so it is better, if possible, to store directly into them. So if var2 is ssa fixed, we will make the `def` store directly into it and hope var1 will end up dying. We want to do this during cprop stage so we can further propagate uses of the fixed var. We do this optimization only if both instructions are in the same bblock (we do small per-bblock liveness tracking for other ssa fixed vars related to var2, so we ensure we don't have conflicting liveness) During super instruction pass we have some additional information about vars that ended up dying, so we attempt the same optimization, this time using as a condition whether var1 has a ref count of 1, in which case we are sure we can kill it. This time we no longer care if var2 is ssa fixed, since we will always benefit. 
--- src/mono/mono/mini/interp/transform-opt.c | 131 ++++++++++++++++++++-- src/mono/mono/mini/interp/transform.h | 6 + 2 files changed, 127 insertions(+), 10 deletions(-) diff --git a/src/mono/mono/mini/interp/transform-opt.c b/src/mono/mono/mini/interp/transform-opt.c index 8c5ab61cbbd043..da3fdf86a95fc7 100644 --- a/src/mono/mono/mini/interp/transform-opt.c +++ b/src/mono/mono/mini/interp/transform-opt.c @@ -2221,29 +2221,66 @@ can_extend_var_liveness (TransformData *td, int var, guint32 liveness) return FALSE; } +static void +replace_svar_use (TransformData *td, int *pvar, gpointer data) +{ + int *var_pair = (int*)data; + int old_var = var_pair [0]; + if (*pvar == old_var) { + int new_var = var_pair [1]; + td->var_values [old_var].ref_count--; + td->var_values [new_var].ref_count++; + *pvar = new_var; + if (td->verbose_level) + g_print ("\treplace svar use: %d -> %d\n", old_var, new_var); + } +} + +static void +replace_svar_uses (TransformData *td, InterpInst *first, InterpInst *last, int old_var, int new_var) +{ + int *var_pair = alloca (2 * sizeof (int)); + var_pair [0] = old_var; + var_pair [1] = new_var; + for (InterpInst *ins = first; ins != last; ins = ins->next) + interp_foreach_ins_svar (td, ins, var_pair, replace_svar_use); +} + static void cprop_svar (TransformData *td, InterpInst *ins, int *pvar, guint32 current_liveness) { int var = *pvar; - if (!var_is_ssa_form (td, var)) return; InterpVarValue *val = &td->var_values [var]; g_assert (val->type >= 0 && val->type < VAR_VALUE_COUNT); - if (val->type == VAR_VALUE_OTHER_VAR && can_extend_var_liveness (td, val->var, current_liveness)) { + if (val->type == VAR_VALUE_OTHER_VAR) { int cprop_var = val->var; - - if (td->verbose_level) - g_print ("cprop %d -> %d:\n\t", var, cprop_var); - InterpVarValue *cprop_val = &td->var_values [cprop_var]; - cprop_val->ref_count++; - *pvar = cprop_var; - if (td->verbose_level) - interp_dump_ins (ins, td->data_items); + if (td->vars [var].renamed_ssa_fixed && !td->vars 
[cprop_var].renamed_ssa_fixed) { + // ssa fixed vars are likely to live, keep using them + val->ref_count++; + } else if (can_extend_var_liveness (td, cprop_var, current_liveness)) { + if (td->verbose_level) + g_print ("cprop %d -> %d:\n\t", var, cprop_var); + InterpVarValue *cprop_val = &td->var_values [cprop_var]; + cprop_val->ref_count++; + *pvar = cprop_var; + if (td->verbose_level) + interp_dump_ins (ins, td->data_items); + } else { + val->ref_count++; + } } else { val->ref_count++; } + + // Mark the last use for a renamable fixed var + var = *pvar; + if (td->vars [var].renamed_ssa_fixed) { + int ext_index = td->renamed_fixed_vars [td->vars [var].ext_index].renamable_var_ext_index; + td->renamable_vars [ext_index].last_use_liveness = current_liveness; + } } static void @@ -2315,6 +2352,7 @@ interp_cprop (TransformData *td) InterpVarValue *dval = &td->var_values [dreg]; dval->type = VAR_VALUE_NONE; dval->def = ins; + dval->liveness = current_liveness; } // We always store to the full i4, except as part of STIND opcodes. These opcodes can be @@ -2353,6 +2391,50 @@ interp_cprop (TransformData *td) g_print ("cprop loc %d -> ct :\n\t", sreg); interp_dump_ins (ins, td->data_items); } + } else if (td->vars [dreg].renamed_ssa_fixed && !td->vars [sreg].renamed_ssa_fixed && + td->vars [dreg].mt == td->vars [sreg].mt && // reordering moves might break conversions + td->var_values [sreg].def->opcode != MINT_DEF_ARG && + (td->var_values [sreg].liveness >> INTERP_LIVENESS_INS_INDEX_BITS) == bb->index) { + // dreg is a renamed ssa fixed var (likely to remain alive) and the definition of sreg + // is in this current bblock. 
+ int last_use_liveness = td->renamable_vars [td->renamed_fixed_vars [td->vars [dreg].ext_index].renamable_var_ext_index].last_use_liveness; + if ((last_use_liveness >> INTERP_LIVENESS_INS_INDEX_BITS) != bb->index || + td->var_values [sreg].liveness >= last_use_liveness) { + // No other conflicting renamed fixed vars are used in this bblock, or their last use + // predates the definition. This means we can tweak def of sreg to store directly + // into dreg and patch all intermediary instructions to use dreg instead. + int dreg_ref_count = td->var_values [dreg].ref_count; + td->var_values [dreg] = td->var_values [sreg]; + td->var_values [dreg].ref_count = dreg_ref_count; + td->var_values [dreg].def->dreg = dreg; + + if (td->verbose_level) { + g_print ("cprop fixed dreg %d:\n\t", dreg); + interp_dump_ins (td->var_values [dreg].def, td->data_items); + } + // Overwrite all uses of sreg with dreg up to this point + replace_svar_uses (td, td->var_values [dreg].def->next, ins, sreg, dreg); + + // Transform `mov dreg <- sreg` into `mov sreg <- dreg` in case sreg is still used + ins->dreg = sreg; + ins->sregs [0] = dreg; + td->var_values [dreg].ref_count++; + td->var_values [sreg].ref_count--; + + td->var_values [sreg].def = ins; + td->var_values [sreg].type = VAR_VALUE_OTHER_VAR; + td->var_values [sreg].var = dreg; + td->var_values [sreg].liveness = current_liveness; + if (td->verbose_level) { + g_print ("\t"); + interp_dump_ins (ins, td->data_items); + } + } else { + if (td->verbose_level) + g_print ("local copy %d <- %d\n", dreg, sreg); + td->var_values [dreg].type = VAR_VALUE_OTHER_VAR; + td->var_values [dreg].var = sreg; + } } else { if (td->verbose_level) g_print ("local copy %d <- %d\n", dreg, sreg); @@ -3186,6 +3268,35 @@ interp_super_instructions (TransformData *td) interp_dump_ins (new_inst, td->data_items); } } + } else if (opcode == MINT_MOV_4 || opcode == MINT_MOV_8 || opcode == MINT_MOV_VT) { + int sreg = ins->sregs [0]; + if (var_is_ssa_form (td, sreg) && 
td->var_values [sreg].ref_count == 1) { + // The svar is used only for this mov. Try to get the definition to store directly instead + InterpInst *def = td->var_values [sreg].def; + if (def->opcode != MINT_DEF_ARG && def->opcode != MINT_PHI) { + int dreg = ins->dreg; + // if var is not ssa or it is a renamed fixed, then we can't replace the dreg + // since there can be conflicting liveness, unless the instructions are adjacent + if ((var_is_ssa_form (td, dreg) && !td->vars [dreg].renamed_ssa_fixed) || + interp_prev_ins (ins) == def) { + def->dreg = dreg; + + // Copy var value, while keeping the ref count intact + int dreg_ref_count = td->var_values [dreg].ref_count; + td->var_values [dreg] = td->var_values [sreg]; + td->var_values [dreg].ref_count = dreg_ref_count; + + // clear the move + td->var_values [sreg].ref_count--; // 0 + interp_clear_ins (ins); + + if (td->verbose_level) { + g_print ("forward dreg: "); + interp_dump_ins (def, td->data_items); + } + } + } + } } noe += interp_get_ins_length (ins); } diff --git a/src/mono/mono/mini/interp/transform.h b/src/mono/mono/mini/interp/transform.h index f4d2e9b5ecc86c..6e3d7ae72faee8 100644 --- a/src/mono/mono/mini/interp/transform.h +++ b/src/mono/mono/mini/interp/transform.h @@ -63,6 +63,8 @@ typedef struct { }; // The instruction that writes this local. InterpInst *def; + // Liveness marker of the definition + guint32 liveness; // The number of times this var is referenced. After optimizations // this can become 0, in which case we can clear the def instruction. int ref_count; @@ -228,6 +230,10 @@ typedef struct { typedef struct { int var_index; GSList *ssa_stack; + // This liveness is bblock only. 
It is used during cprop to determine whether we + // can move the definition of a renamed fixed var earlier (if there are no conflicts with + // other renamed vars from the same var) + guint32 last_use_liveness; // Var that is global and might take part in phi opcodes guint ssa_global : 1; From 1eb16278b5da9186e8b6d4472d0a05b4cddc9446 Mon Sep 17 00:00:00 2001 From: Vlad Brezae Date: Fri, 10 Nov 2023 17:26:15 +0200 Subject: [PATCH 16/45] [mono][interp] Fix SSA form for vt field stores When storing into a vt we could end up using the instruction MINT_MOV_DST_OFF. This instruction was receiving as destination the vt and as source the value to store. For SSA this is incorrect since the new definition also depends on the old value of the vt. We fix this by adding a new svar to this opcode that holds the same var as the destination. Since following renaming and other optimizations, the sreg and dreg of this instruction can end up diverging, when we emit the final code for the instruction, we add a full valuetype copy if necessary. This commit also gets rid of MINT_NEWOBJ_VT_INLINED opcode which was silently producing a ref to a var that was not properly tracked. We replace it instead with an INITLOCAL followed by a LDLOCA, instructions that are likely to be optimized out. 
--- .../runtime/jiterpreter-trace-generator.ts | 12 -------- src/mono/mono/mini/interp/interp.c | 9 ------ .../mini/interp/jiterpreter-opcode-values.h | 1 - src/mono/mono/mini/interp/mintops.def | 3 +- src/mono/mono/mini/interp/transform-opt.c | 9 +++++- src/mono/mono/mini/interp/transform.c | 29 ++++++++++--------- 6 files changed, 24 insertions(+), 39 deletions(-) diff --git a/src/mono/browser/runtime/jiterpreter-trace-generator.ts b/src/mono/browser/runtime/jiterpreter-trace-generator.ts index 616792ffec25ba..f84bb6a662f4a0 100644 --- a/src/mono/browser/runtime/jiterpreter-trace-generator.ts +++ b/src/mono/browser/runtime/jiterpreter-trace-generator.ts @@ -1098,18 +1098,6 @@ export function generateWasmBody( break; } - case MintOpcode.MINT_NEWOBJ_VT_INLINED: { - const ret_size = getArgU16(ip, 3); - // memset (this_vt, 0, ret_size); - append_ldloca(builder, getArgU16(ip, 2), ret_size); - append_memset_dest(builder, 0, ret_size); - // LOCAL_VAR (ip [1], gpointer) = this_vt; - builder.local("pLocals"); - append_ldloca(builder, getArgU16(ip, 2), ret_size); - append_stloc_tail(builder, getArgU16(ip, 1), WasmOpcode.i32_store); - break; - } - case MintOpcode.MINT_NEWOBJ: case MintOpcode.MINT_NEWOBJ_VT: case MintOpcode.MINT_CALLVIRT_FAST: diff --git a/src/mono/mono/mini/interp/interp.c b/src/mono/mono/mini/interp/interp.c index 161af907dcd19a..0c6da62f172d43 100644 --- a/src/mono/mono/mini/interp/interp.c +++ b/src/mono/mono/mini/interp/interp.c @@ -5786,15 +5786,6 @@ MINT_IN_CASE(MINT_BRTRUE_I8_SP) ZEROP_SP(gint64, !=); MINT_IN_BREAK; cmethod = (InterpMethod*)frame->imethod->data_items [imethod_index]; goto jit_call; } - MINT_IN_CASE(MINT_NEWOBJ_VT_INLINED) { - guint16 ret_size = ip [3]; - gpointer this_vt = locals + ip [2]; - - memset (this_vt, 0, ret_size); - LOCAL_VAR (ip [1], gpointer) = this_vt; - ip += 4; - MINT_IN_BREAK; - } MINT_IN_CASE(MINT_NEWOBJ_SLOW) { guint32 const token = ip [3]; return_offset = ip [1]; diff --git 
a/src/mono/mono/mini/interp/jiterpreter-opcode-values.h b/src/mono/mono/mini/interp/jiterpreter-opcode-values.h index d109c70ba25bd0..db88ec1dd03ca9 100644 --- a/src/mono/mono/mini/interp/jiterpreter-opcode-values.h +++ b/src/mono/mono/mini/interp/jiterpreter-opcode-values.h @@ -135,7 +135,6 @@ OP(MINT_INTRINS_RUNTIMEHELPERS_OBJECT_HAS_COMPONENT_SIZE, HIGH) OP(MINT_INTRINS_ENUM_HASFLAG, HIGH) OP(MINT_INTRINS_ORDINAL_IGNORE_CASE_ASCII, HIGH) OP(MINT_NEWOBJ_INLINED, HIGH) -OP(MINT_NEWOBJ_VT_INLINED, MASSIVE) OP(MINT_CPBLK, HIGH) OP(MINT_INITBLK, HIGH) OP(MINT_ROL_I4_IMM, HIGH) diff --git a/src/mono/mono/mini/interp/mintops.def b/src/mono/mono/mini/interp/mintops.def index f5b85001172145..db603185546320 100644 --- a/src/mono/mono/mini/interp/mintops.def +++ b/src/mono/mono/mini/interp/mintops.def @@ -363,7 +363,6 @@ OPDEF(MINT_NEWOBJ_STRING, "newobj_string", 4, 1, 1, MintOpMethodToken) OPDEF(MINT_NEWOBJ, "newobj", 5, 1, 1, MintOpMethodToken) OPDEF(MINT_NEWOBJ_INLINED, "newobj_inlined", 3, 1, 0, MintOpVTableToken) OPDEF(MINT_NEWOBJ_VT, "newobj_vt", 5, 1, 1, MintOpMethodToken) -OPDEF(MINT_NEWOBJ_VT_INLINED, "newobj_vt_inlined", 4, 1, 1, MintOpShortInt) OPDEF(MINT_INITOBJ, "initobj", 3, 0, 1, MintOpShortInt) OPDEF(MINT_CASTCLASS, "castclass", 4, 1, 1, MintOpClassToken) OPDEF(MINT_ISINST, "isinst", 4, 1, 1, MintOpClassToken) @@ -841,7 +840,7 @@ IROPDEF(MINT_DUMMY_USE, "dummy_use", 2, 0, 1, MintOpNoArgs) IROPDEF(MINT_TIER_PATCHPOINT_DATA, "tier_patchpoint_data", 2, 0, 0, MintOpShortInt) // These two opcodes are resolved to a normal MINT_MOV when emitting compacted instructions IROPDEF(MINT_MOV_SRC_OFF, "mov.src.off", 6, 1, 1, MintOpTwoShorts) -IROPDEF(MINT_MOV_DST_OFF, "mov.dst.off", 6, 1, 1, MintOpTwoShorts) +IROPDEF(MINT_MOV_DST_OFF, "mov.dst.off", 8, 1, 2, MintOpTwoShorts) IROPDEF(MINT_PHI, "phi", 2, 1, 0, MintOpNoArgs) IROPDEF(MINT_INTRINS_MARVIN_BLOCK_SSA1, "intrins_marvin_block_ssa1", 4, 1, 2, MintOpNoArgs) IROPDEF(MINT_INTRINS_MARVIN_BLOCK_SSA2, 
"intrins_marvin_block_ssa2", 4, 1, 2, MintOpNoArgs) diff --git a/src/mono/mono/mini/interp/transform-opt.c b/src/mono/mono/mini/interp/transform-opt.c index da3fdf86a95fc7..dba43cfd6d0050 100644 --- a/src/mono/mono/mini/interp/transform-opt.c +++ b/src/mono/mono/mini/interp/transform-opt.c @@ -2689,12 +2689,19 @@ interp_cprop (TransformData *td) // This stores just to part of the dest valuetype ins = interp_insert_ins (td, ins, MINT_MOV_DST_OFF); interp_ins_set_dreg (ins, local); - interp_ins_set_sreg (ins, sregs [1]); + interp_ins_set_sregs2 (ins, sregs [1], local); ins->data [0] = GINT_TO_UINT16 (foffset); ins->data [1] = GINT_TO_UINT16 (mt); ins->data [2] = vtsize; interp_clear_ins (ins->prev); + + // MINT_MOV_DST_OFF doesn't work if dreg is allocated at the same location as the + // field value to be stored, because its behavior is not atomic in nature. We first + // copy the original whole vt, potentially overwritting the new field value. + ins = interp_insert_ins (td, ins, MINT_DUMMY_USE); + interp_ins_set_sreg (ins, sregs [1]); + td->var_values [sregs [1]].ref_count++; } if (td->verbose_level) { g_print ("Replace ldloca/stfld pair (off %p) :\n\t", (void *)(uintptr_t) ldloca->il_offset); diff --git a/src/mono/mono/mini/interp/transform.c b/src/mono/mono/mini/interp/transform.c index d6954ef97382c7..c6b86f6f392630 100644 --- a/src/mono/mono/mini/interp/transform.c +++ b/src/mono/mono/mini/interp/transform.c @@ -3042,11 +3042,9 @@ interp_inline_newobj (TransformData *td, MonoMethod *target_method, MonoMethodSi dreg = interp_create_var (td, get_type_from_stack (stack_type [ret_mt], klass)); - // For valuetypes, we need to control the lifetime of the valuetype. - // MINT_NEWOBJ_VT_INLINED takes the address of this reg and we should keep - // the vt alive until the inlining is completed. 
- interp_add_ins (td, MINT_DEF); + interp_add_ins (td, MINT_INITLOCAL); interp_ins_set_dreg (td->last_ins, dreg); + td->last_ins->data [0] = GINT_TO_UINT16 (vtsize); } else { dreg = interp_create_var (td, get_type_from_stack (stack_type [ret_mt], klass)); } @@ -3065,11 +3063,10 @@ interp_inline_newobj (TransformData *td, MonoMethod *target_method, MonoMethodSi td->sp += csignature->param_count; if (is_vt) { - // Receives the valuetype allocated with MINT_DEF, and returns its address - newobj_fast = interp_add_ins (td, MINT_NEWOBJ_VT_INLINED); + newobj_fast = interp_add_ins (td, MINT_LDLOCA_S); interp_ins_set_dreg (newobj_fast, this_reg); interp_ins_set_sreg (newobj_fast, dreg); - newobj_fast->data [0] = GUINTPTR_TO_UINT16 (ALIGN_TO (vtsize, MINT_STACK_SLOT_SIZE)); + td->vars [dreg].indirects++; } else { MonoVTable *vtable = mono_class_vtable_checked (klass, error); goto_if_nok (error, fail); @@ -3087,11 +3084,6 @@ interp_inline_newobj (TransformData *td, MonoMethod *target_method, MonoMethodSi if (!interp_inline_method (td, target_method, mheader, error)) goto fail; - if (is_vt) { - interp_add_ins (td, MINT_DUMMY_USE); - interp_ins_set_sreg (td->last_ins, dreg); - } - push_var (td, dreg); return TRUE; fail: @@ -8553,6 +8545,16 @@ emit_compacted_instruction (TransformData *td, guint16* start_ip, InterpInst *in guint16 foff = ins->data [0]; guint16 mt = ins->data [1]; guint16 fsize = ins->data [2]; + ip--; + + if (opcode == MINT_MOV_DST_OFF && get_local_offset (td, ins->dreg) != get_local_offset (td, ins->sregs [1])) { + // We are no longer storing a field into the same valuetype. Copy also the whole vt. 
+ *ip++ = MINT_MOV_VT; + + *ip++ = GINT_TO_UINT16 (get_local_offset (td, ins->dreg)); + *ip++ = GINT_TO_UINT16 (get_local_offset (td, ins->sregs [1])); + *ip++ = GINT_TO_UINT16 (td->vars [ins->dreg].size); + } int dest_off = get_local_offset (td, ins->dreg); int src_off = get_local_offset (td, ins->sregs [0]); @@ -8579,8 +8581,7 @@ emit_compacted_instruction (TransformData *td, guint16* start_ip, InterpInst *in } } } - // Replace MINT_MOV_OFF with the real instruction - ip [-1] = opcode; + *ip++ = opcode; *ip++ = GINT_TO_UINT16 (dest_off); *ip++ = GINT_TO_UINT16 (src_off); if (opcode == MINT_MOV_VT) From c88eeaedcec4d536b8b9f8dc84d0de04ad1f27e1 Mon Sep 17 00:00:00 2001 From: Vlad Brezae Date: Mon, 13 Nov 2023 11:24:47 +0200 Subject: [PATCH 17/45] [mono][interp] Redo optimizations if a var is no longer indirect Vars that have ldloca applied to them can't take part in ssa optimizations. During these optimizations the result of the ldloca might end up dying, making the var no longer indirect. In this case, after we exit ssa, we redo the whole transformation followed by optimizations. We should also redo optimizations if a phi node ends up removed. 
--- src/mono/mono/mini/interp/transform-opt.c | 45 ++++++++++++++++------- src/mono/mono/mini/interp/transform.h | 1 + 2 files changed, 33 insertions(+), 13 deletions(-) diff --git a/src/mono/mono/mini/interp/transform-opt.c b/src/mono/mono/mini/interp/transform-opt.c index dba43cfd6d0050..4edc6d1de965cb 100644 --- a/src/mono/mono/mini/interp/transform-opt.c +++ b/src/mono/mono/mini/interp/transform-opt.c @@ -1210,6 +1210,8 @@ interp_exit_ssa (TransformData *td) ins->opcode = MINT_NOP; else interp_foreach_ins_var (td, ins, NULL, revert_ssa_rename_cb); + + ins->flags &= ~INTERP_INST_FLAG_LIVENESS_MARKER; } } @@ -1219,6 +1221,7 @@ interp_exit_ssa (TransformData *td) g_slist_free (td->vars [i].declare_bbs); td->vars [i].declare_bbs = NULL; } + td->vars [i].ext_index = -1; } for (InterpBasicBlock *bb = td->entry_bb; bb != NULL; bb = bb->next_bb) { @@ -1226,6 +1229,11 @@ interp_exit_ssa (TransformData *td) g_slist_free (bb->dominated); bb->dominated = NULL; } + bb->dfs_index = -1; + bb->gen_set = NULL; + bb->kill_set = NULL; + bb->live_in_set = NULL; + bb->live_out_set = NULL; } for (unsigned int i = 0; i < td->renamable_vars_size; i++) { @@ -1234,6 +1242,7 @@ interp_exit_ssa (TransformData *td) td->renamable_vars [i].ssa_stack = NULL; } } + td->renamable_vars_size = 0; for (unsigned int i = 0; i < td->renamed_fixed_vars_size; i++) { if (td->renamed_fixed_vars [i].live_limit_bblocks) { @@ -1241,6 +1250,7 @@ interp_exit_ssa (TransformData *td) td->renamed_fixed_vars [i].live_limit_bblocks = NULL; } } + td->renamed_fixed_vars_size = 0; } /* @@ -1347,17 +1357,16 @@ interp_unlink_bblocks (InterpBasicBlock *from, InterpBasicBlock *to) to->in_count--; } -static gboolean +static void interp_remove_bblock (TransformData *td, InterpBasicBlock *bb, InterpBasicBlock *prev_bb) { - gboolean needs_cprop = FALSE; - for (InterpInst *ins = bb->first_ins; ins != NULL; ins = ins->next) { if (ins->opcode == MINT_LDLOCA_S) { td->vars [ins->sregs [0]].indirects--; if (!td->vars [ins->sregs 
[0]].indirects) { - // We can do cprop now through this local. Run cprop again. - needs_cprop = TRUE; + if (td->verbose_level) + g_print ("Remove bblock %d, var %d no longer indirect\n", bb->index, ins->sregs [0]); + td->need_ssa_retry = TRUE; } } } @@ -1367,8 +1376,6 @@ interp_remove_bblock (TransformData *td, InterpBasicBlock *bb, InterpBasicBlock interp_unlink_bblocks (bb, bb->out_bb [0]); prev_bb->next_bb = bb->next_bb; mark_bb_as_dead (td, bb, bb->next_bb); - - return needs_cprop; } void @@ -1639,11 +1646,10 @@ interp_reorder_bblocks (TransformData *td) } // Traverse the list of basic blocks and merge adjacent blocks -static gboolean +static void interp_optimize_bblocks (TransformData *td) { InterpBasicBlock *bb = td->entry_bb; - gboolean needs_cprop = FALSE; interp_reorder_bblocks (td); @@ -1656,20 +1662,18 @@ interp_optimize_bblocks (TransformData *td) if (!next_bb->reachable) { if (td->verbose_level) g_print ("Removed BB%d\n", next_bb->index); - needs_cprop |= interp_remove_bblock (td, next_bb, bb); + interp_remove_bblock (td, next_bb, bb); continue; } else if (bb->out_count == 1 && bb->out_bb [0] == next_bb && next_bb->in_count == 1 && !next_bb->eh_block && !next_bb->patchpoint_data) { g_assert (next_bb->in_bb [0] == bb); interp_merge_bblocks (td, bb, next_bb); if (td->verbose_level) g_print ("Merged BB%d and BB%d\n", bb->index, next_bb->index); - needs_cprop = TRUE; continue; } bb = next_bb; } - return needs_cprop; } static void @@ -1706,8 +1710,14 @@ interp_var_deadce (TransformData *td) g_print ("kill dead ins:\n\t"); interp_dump_ins (ins, td->data_items); } - if (ins->opcode == MINT_LDLOCA_S) + if (ins->opcode == MINT_LDLOCA_S) { td->vars [ins->sregs [0]].indirects--; + if (!td->vars [ins->sregs [0]].indirects) { + if (td->verbose_level) + g_print ("Kill ldloca, var %d no longer indirect\n", ins->sregs [0]); + td->need_ssa_retry = TRUE; + } + } interp_foreach_ins_svar (td, ins, &need_retry, decrement_ref_count); @@ -3323,6 +3333,9 @@ 
interp_optimize_code (TransformData *td) if (mono_interp_opt & INTERP_OPT_BBLOCKS) MONO_TIME_TRACK (mono_interp_stats.optimize_bblocks_time, interp_optimize_bblocks (td)); +ssa_retry: + td->need_ssa_retry = FALSE; + MONO_TIME_TRACK (mono_interp_stats.ssa_compute_time, interp_compute_ssa (td)); if (mono_interp_opt & INTERP_OPT_CPROP) @@ -3341,6 +3354,12 @@ interp_optimize_code (TransformData *td) if (mono_interp_opt & INTERP_OPT_BBLOCKS) MONO_TIME_TRACK (mono_interp_stats.optimize_bblocks_time, interp_optimize_bblocks (td)); + if (td->need_ssa_retry) { + if (td->verbose_level) + g_print ("Retry method %s\n", mono_method_full_name (td->method, 1)); + goto ssa_retry; + } + if (td->verbose_level) { g_print ("\nOptimized IR:\n"); mono_interp_print_td_code (td); diff --git a/src/mono/mono/mini/interp/transform.h b/src/mono/mono/mini/interp/transform.h index 6e3d7ae72faee8..42eadb62b28ca1 100644 --- a/src/mono/mono/mini/interp/transform.h +++ b/src/mono/mono/mini/interp/transform.h @@ -343,6 +343,7 @@ typedef struct guint optimized : 1; guint has_invalid_code : 1; guint has_inlined_one_call : 1; + guint need_ssa_retry : 1; } TransformData; #define STACK_TYPE_I4 0 From 6004987c734664883d7ed5c7f9d8b9e3d5a885d0 Mon Sep 17 00:00:00 2001 From: Vlad Brezae Date: Wed, 22 Nov 2023 14:03:43 +0200 Subject: [PATCH 18/45] [mono][interp] Disable ssa optimizations for methods with bad cfg structure We currently don't handle scenarios like: ``` try { throw } catch { leave exit; } exit: ``` The try block is not explicitly linked to exit, but it reaches this path by going through the exception handler. Skip SSA for code like this, since it doesn't seem worthwhile to handle. 
--- src/mono/mono/mini/interp/transform-opt.c | 3 +++ src/mono/mono/mini/interp/transform.c | 10 ++++++++++ src/mono/mono/mini/interp/transform.h | 1 + 3 files changed, 14 insertions(+) diff --git a/src/mono/mono/mini/interp/transform-opt.c b/src/mono/mono/mini/interp/transform-opt.c index 4edc6d1de965cb..0da829aefe6d8f 100644 --- a/src/mono/mono/mini/interp/transform-opt.c +++ b/src/mono/mono/mini/interp/transform-opt.c @@ -3326,6 +3326,9 @@ interp_optimize_code (TransformData *td) if (td->header->num_clauses) return; + if (td->disable_ssa) + return; + // Give up on huge methods. We can easily work around this if decide to care. if (td->bb_count > ((1 << INTERP_LIVENESS_BB_INDEX_BITS) - 1)) return; diff --git a/src/mono/mono/mini/interp/transform.c b/src/mono/mono/mini/interp/transform.c index c6b86f6f392630..35d8911b2e9f42 100644 --- a/src/mono/mono/mini/interp/transform.c +++ b/src/mono/mono/mini/interp/transform.c @@ -6525,6 +6525,16 @@ generate_code (TransformData *td, MonoMethod *method, MonoMethodHeader *header, case CEE_THROW: if (!td->aggressive_inlining) INLINE_FAILURE; + if (!inlining) { + guint32 il_offset = GINT_TO_UINT32(td->current_il_offset); + for (unsigned int i = 0; i < td->header->num_clauses; i++) { + MonoExceptionClause *clause = &td->header->clauses [i]; + // If we throw during try and then catch we don't have the bblocks + // properly linked, just disable ssa for now + if (clause->flags == MONO_EXCEPTION_CLAUSE_NONE && (clause->try_offset <= il_offset) && (il_offset < (clause->try_offset + clause->try_len))) + td->disable_ssa = TRUE; + } + } CHECK_STACK (td, 1); interp_add_ins (td, MINT_THROW); interp_ins_set_sreg (td->last_ins, td->sp [-1].var); diff --git a/src/mono/mono/mini/interp/transform.h b/src/mono/mono/mini/interp/transform.h index 42eadb62b28ca1..610d8bfc2857eb 100644 --- a/src/mono/mono/mini/interp/transform.h +++ b/src/mono/mono/mini/interp/transform.h @@ -344,6 +344,7 @@ typedef struct guint has_invalid_code : 1; guint 
has_inlined_one_call : 1; guint need_ssa_retry : 1; + guint disable_ssa : 1; } TransformData; #define STACK_TYPE_I4 0 From 0422ae5c9ffac253ccb4607d0b0c4e16bbce92d9 Mon Sep 17 00:00:00 2001 From: Vlad Brezae Date: Wed, 22 Nov 2023 12:50:55 +0200 Subject: [PATCH 19/45] [mono][interp] Scan also bblocks reachable from EH The CFG is now split into the no EH paths (on this subset we run all the SSA transformations) and the rest of the bblocks reachable from EH handlers. We ensure that bblocks from no EH path are not linked to bblocks from EH handlers. Enable SSA transformation on methods with clauses. --- src/mono/mono/mini/interp/transform-opt.c | 53 ++++++++++++++++++----- src/mono/mono/mini/interp/transform.c | 9 ++-- src/mono/mono/mini/interp/transform.h | 1 + 3 files changed, 50 insertions(+), 13 deletions(-) diff --git a/src/mono/mono/mini/interp/transform-opt.c b/src/mono/mono/mini/interp/transform-opt.c index 0da829aefe6d8f..92dcd1802e2640 100644 --- a/src/mono/mono/mini/interp/transform-opt.c +++ b/src/mono/mono/mini/interp/transform-opt.c @@ -534,8 +534,6 @@ dfs_visit (InterpBasicBlock *bb, int *pos, InterpBasicBlock **bb_array) bb_array [dfs_index] = bb; bb->dfs_index = dfs_index; - if (dfs_index != 0) - g_assert (bb->in_count); *pos = dfs_index + 1; for (int i = 0; i < bb->out_count; i++) { InterpBasicBlock *out_bb = bb->out_bb [i]; @@ -554,12 +552,26 @@ interp_compute_dfs_indexes (TransformData *td) dfs_visit (td->entry_bb, &dfs_index, td->bblocks); td->bblocks_count = dfs_index; + // Visit also bblocks reachable from eh handlers. 
These bblocks are not linked + // to the main cfg (where we do dominator computation, ssa transformation etc) + for (int i = 0; i < td->header->num_clauses; i++) { + MonoExceptionClause *c = td->header->clauses + i; + InterpBasicBlock *bb = td->offset_to_bb [c->handler_offset]; + dfs_visit (bb, &dfs_index, td->bblocks); + + if (c->flags == MONO_EXCEPTION_CLAUSE_FILTER) { + bb = td->offset_to_bb [c->data.filter_offset]; + dfs_visit (bb, &dfs_index, td->bblocks); + } + } + td->bblocks_count_eh = dfs_index; + if (td->verbose_level) { InterpBasicBlock *bb; g_print ("\nBASIC BLOCK GRAPH:\n"); for (bb = td->entry_bb; bb != NULL; bb = bb->next_bb) { GString* bb_info = interp_get_bb_links (bb); - g_print ("BB%d: DFS(%d), %s\n", bb->index, bb->dfs_index, bb_info->str); + g_print ("BB%d: DFS%s(%d), %s\n", bb->index, (bb->dfs_index >= td->bblocks_count) ? "_EH" : "" , bb->dfs_index, bb_info->str); g_string_free (bb_info, TRUE); } } @@ -577,6 +589,18 @@ dom_intersect (InterpBasicBlock **idoms, InterpBasicBlock *bb1, InterpBasicBlock return bb1; } +static gboolean +is_bblock_ssa_cfg (TransformData *td, InterpBasicBlock *bb) +{ + // FIXME Don't mark leave target as eh_block + // g_assert (bb->dfs_index != -1); + if (bb->dfs_index == -1) + return FALSE; + if (bb->dfs_index < td->bblocks_count) + return TRUE; + return FALSE; +} + static void interp_compute_dominators (TransformData *td) { @@ -594,7 +618,7 @@ interp_compute_dominators (TransformData *td) int j; for (j = 0; j < bb->in_count; j++) { InterpBasicBlock *in_bb = bb->in_bb [j]; - if (idoms [in_bb->dfs_index]) { + if (is_bblock_ssa_cfg (td, in_bb) && idoms [in_bb->dfs_index]) { new_idom = in_bb; break; } @@ -603,7 +627,7 @@ interp_compute_dominators (TransformData *td) // intersect new_idom with dominators from the other predecessors for (; j < bb->in_count; j++) { InterpBasicBlock *in_bb = bb->in_bb [j]; - if (idoms [in_bb->dfs_index]) + if (is_bblock_ssa_cfg (td, in_bb) && idoms [in_bb->dfs_index]) new_idom = 
dom_intersect (idoms, in_bb, new_idom); } @@ -629,14 +653,14 @@ interp_compute_dominators (TransformData *td) InterpBasicBlock *bb; g_print ("\nBASIC BLOCK IDOMS:\n"); for (bb = td->entry_bb; bb != NULL; bb = bb->next_bb) { - if (bb->dfs_index == -1) + if (!is_bblock_ssa_cfg (td, bb)) continue; g_print ("IDOM (BB%d) = BB%d\n", bb->index, td->idoms [bb->dfs_index]->index); } g_print ("\nBASIC BLOCK DOMINATED:\n"); for (bb = td->entry_bb; bb != NULL; bb = bb->next_bb) { - if (bb->dfs_index == -1) + if (!is_bblock_ssa_cfg (td, bb)) continue; if (bb->dominated) { g_print ("DOMINATED (BB%d) = {", bb->index); @@ -669,10 +693,13 @@ interp_compute_dominance_frontier (TransformData *td) if (bb->in_count > 1) { for (int j = 0; j < bb->in_count; ++j) { InterpBasicBlock *p = bb->in_bb [j]; + if (!is_bblock_ssa_cfg (td, p)) + continue; g_assert (p->dfs_index || p == td->entry_bb); while (p != td->idoms [bb->dfs_index]) { + g_assert (bb->dfs_index < td->bblocks_count); mono_bitset_set_fast (p->dfrontier, bb->dfs_index); p = td->idoms [p->dfs_index]; } @@ -684,7 +711,7 @@ interp_compute_dominance_frontier (TransformData *td) InterpBasicBlock *bb; g_print ("\nBASIC BLOCK DFRONTIERS:\n"); for (bb = td->entry_bb; bb != NULL; bb = bb->next_bb) { - if (bb->dfs_index == -1) + if (!is_bblock_ssa_cfg (td, bb)) continue; g_print ("DFRONTIER (BB%d) = {", bb->index); int i; @@ -759,6 +786,8 @@ interp_compute_global_vars (TransformData *td) InterpBasicBlock *bb; for (bb = td->entry_bb; bb != NULL; bb = bb->next_bb) { + if (!is_bblock_ssa_cfg (td, bb)) + continue; InterpInst *ins; for (ins = bb->first_ins; ins != NULL; ins = ins->next) { interp_foreach_ins_svar (td, ins, bb, compute_global_var_cb); @@ -887,6 +916,8 @@ interp_compute_pruned_ssa_liveness (TransformData *td) g_print ("\nBASIC BLOCK LIVENESS:\n"); for (bb = td->entry_bb; bb != NULL; bb = bb->next_bb) { unsigned int i; + if (!is_bblock_ssa_cfg (td, bb)) + continue; g_print ("BB%d\n\tLIVE_IN = {", bb->index); 
mono_bitset_foreach_bit (bb->live_in_set, i, td->renamable_vars_size) { g_print (" %d", td->renamable_vars [i].var_index); @@ -950,9 +981,11 @@ insert_phi_nodes (TransformData *td) InterpBasicBlock *bb = (InterpBasicBlock*)workset->data; workset = workset->next; g_free (old_head); + g_assert (is_bblock_ssa_cfg (td, bb)); int j; mono_bitset_foreach_bit (bb->dfrontier, j, td->bb_count) { InterpBasicBlock *bd = td->bblocks [j]; + g_assert (is_bblock_ssa_cfg (td, bb)); if (!bb_has_phi (bd, var) && mono_bitset_test_fast (bd->live_in_set, i)) { td->renamable_vars [i].ssa_fixed = TRUE; bb_insert_phi (td, bd, var); @@ -2307,7 +2340,7 @@ interp_cprop (TransformData *td) // Traverse in dfs order. This guarantees that we always reach the definition first before the // use of the var. Exception is only for phi nodes, where we don't care about the definition // anyway. - for (int bb_dfs_index = 0; bb_dfs_index < td->bblocks_count; bb_dfs_index++) { + for (int bb_dfs_index = 0; bb_dfs_index < td->bblocks_count_eh; bb_dfs_index++) { InterpBasicBlock *bb = td->bblocks [bb_dfs_index]; if (td->verbose_level) { @@ -2870,7 +2903,7 @@ interp_super_instructions (TransformData *td) interp_compute_native_offset_estimates (td); // Add some actual super instructions - for (int bb_dfs_index = 0; bb_dfs_index < td->bblocks_count; bb_dfs_index++) { + for (int bb_dfs_index = 0; bb_dfs_index < td->bblocks_count_eh; bb_dfs_index++) { InterpBasicBlock *bb = td->bblocks [bb_dfs_index]; // Set cbb since we do some instruction inserting below diff --git a/src/mono/mono/mini/interp/transform.c b/src/mono/mono/mini/interp/transform.c index 35d8911b2e9f42..4747a0a82e2c86 100644 --- a/src/mono/mono/mini/interp/transform.c +++ b/src/mono/mono/mini/interp/transform.c @@ -729,9 +729,12 @@ handle_branch (TransformData *td, int long_op, int offset) if (offset > 0) init_bb_stack_state (td, target_bb); - if (td->cbb->no_inlining && long_op != MINT_CALL_HANDLER) - target_bb->jump_targets--; - interp_link_bblocks 
(td, td->cbb, target_bb); + if (long_op != MINT_CALL_HANDLER) { + if (td->cbb->no_inlining) + target_bb->jump_targets--; + // We don't link finally blocks into the cfg (or other handler blocks for that matter) + interp_link_bblocks (td, td->cbb, target_bb); + } interp_add_ins (td, long_op); td->last_ins->info.target_bb = target_bb; diff --git a/src/mono/mono/mini/interp/transform.h b/src/mono/mono/mini/interp/transform.h index 610d8bfc2857eb..68f1399bee061c 100644 --- a/src/mono/mono/mini/interp/transform.h +++ b/src/mono/mono/mini/interp/transform.h @@ -319,6 +319,7 @@ typedef struct InterpBasicBlock *entry_bb, *cbb; InterpBasicBlock **bblocks; // ordering of bblocks in reverse postorder dfs int bblocks_count; + int bblocks_count_eh; InterpBasicBlock **idoms; // immediate dominator for each bblock, index from reverse postorder dfs int bb_count; MonoMemPool *mempool; From 5498668943235a9750f9193c1d845ddfcb3ce22c Mon Sep 17 00:00:00 2001 From: Vlad Brezae Date: Tue, 14 Nov 2023 18:03:15 +0200 Subject: [PATCH 20/45] [mono][interp] Mark vars used inside handlers so they are not transformed to ssa The exception handlers will not be linked to the CFG so all SSA algorithms will not work on vars used there. 
--- src/mono/mono/mini/interp/transform-opt.c | 75 ++++++++++++++++++++--- src/mono/mono/mini/interp/transform.c | 6 ++ src/mono/mono/mini/interp/transform.h | 2 + 3 files changed, 73 insertions(+), 10 deletions(-) diff --git a/src/mono/mono/mini/interp/transform-opt.c b/src/mono/mono/mini/interp/transform-opt.c index 92dcd1802e2640..669198ef83674f 100644 --- a/src/mono/mono/mini/interp/transform-opt.c +++ b/src/mono/mono/mini/interp/transform-opt.c @@ -746,6 +746,69 @@ interp_compute_dominance (TransformData *td) /* * SSA TRANSFORMATION */ +static void +compute_eh_var_cb (TransformData *td, int *pvar, gpointer data) +{ + int var = *pvar; + td->vars [var].eh_var = TRUE; +} + +static void +interp_compute_eh_vars (TransformData *td) +{ + // EH bblocks are stored separately and are not reachable from the non-EF control flow + // path. Any var reachable from EH bblocks will not be in SSA form. + for (int i = td->bblocks_count; i < td->bblocks_count_eh; i++) { + InterpBasicBlock *bb = td->bblocks [i]; + for (InterpInst *ins = bb->first_ins; ins != NULL; ins = ins->next) { + if (ins->opcode == MINT_LDLOCA_S) + td->vars [ins->sregs [0]].eh_var = TRUE; + interp_foreach_ins_var (td, ins, bb, compute_eh_var_cb); + } + } + + // If we have a try block that might catch exceptions, then we can't do any propagation + // of the values defined in the block since an exception could interrupt the normal control + // flow. All vars defined in this block will not be in SSA form. 
+ for (unsigned int i = 0; i < td->header->num_clauses; i++) { + MonoExceptionClause *c = &td->header->clauses [i]; + if (c->flags == MONO_EXCEPTION_CLAUSE_NONE || + c->flags == MONO_EXCEPTION_CLAUSE_FILTER) { + InterpBasicBlock *bb = td->offset_to_bb [c->try_offset]; + int try_end = c->try_offset + c->try_len; + g_assert (bb); + while (bb->il_offset != -1 && bb->il_offset < try_end) { + for (InterpInst *ins = bb->first_ins; ins != NULL; ins = ins->next) { + if (mono_interp_op_dregs [ins->opcode]) + td->vars [ins->dreg].eh_var = TRUE; + } + bb = bb->next_bb; + } + } + } + + td->eh_vars_computed = TRUE; +} + +static void +interp_compute_ssa_vars (TransformData *td) +{ + if (!td->eh_vars_computed) + interp_compute_eh_vars (td); + + for (unsigned int i = 0; i < td->vars_size; i++) { + if (td->vars [i].indirects > 0) { + td->vars [i].no_ssa = TRUE; + td->vars [i].has_indirects = TRUE; + } else { + td->vars [i].has_indirects = FALSE; + if (td->vars [i].eh_var) + td->vars [i].no_ssa = TRUE; + else + td->vars [i].no_ssa = FALSE; + } + } +} static gboolean var_is_ssa_form (TransformData *td, int var) @@ -777,13 +840,6 @@ compute_global_var_cb (TransformData *td, int *pvar, gpointer data) static void interp_compute_global_vars (TransformData *td) { - for (int i = 0; i < td->vars_size; i++) { - if (td->vars [i].indirects > 0) - td->vars [i].no_ssa = TRUE; - else - td->vars [i].no_ssa = FALSE; - } - InterpBasicBlock *bb; for (bb = td->entry_bb; bb != NULL; bb = bb->next_bb) { if (!is_bblock_ssa_cfg (td, bb)) @@ -1193,6 +1249,8 @@ interp_compute_ssa (TransformData *td) MONO_TIME_TRACK (mono_interp_stats.ssa_compute_dominance_time, interp_compute_dominance (td)); + interp_compute_ssa_vars (td); + MONO_TIME_TRACK (mono_interp_stats.ssa_compute_global_vars_time, interp_compute_global_vars (td)); MONO_TIME_TRACK (mono_interp_stats.ssa_compute_pruned_liveness_time, interp_compute_pruned_ssa_liveness (td)); @@ -3356,9 +3414,6 @@ interp_super_instructions (TransformData *td) void 
interp_optimize_code (TransformData *td) { - if (td->header->num_clauses) - return; - if (td->disable_ssa) return; diff --git a/src/mono/mono/mini/interp/transform.c b/src/mono/mono/mini/interp/transform.c index 4747a0a82e2c86..6f8eb40cdd8b41 100644 --- a/src/mono/mono/mini/interp/transform.c +++ b/src/mono/mono/mini/interp/transform.c @@ -1082,6 +1082,7 @@ store_local (TransformData *td, int local) { int mt = td->vars [local].mt; CHECK_STACK_RET_VOID (td, 1); + #if SIZEOF_VOID_P == 8 // nint and int32 can be used interchangeably. Add implicit conversions. if (td->sp [-1].type == STACK_TYPE_I4 && stack_type [mt] == STACK_TYPE_I8) @@ -2817,6 +2818,11 @@ interp_method_check_inlining (TransformData *td, MonoMethod *method, MonoMethodS if (td->cbb->no_inlining) return FALSE; + // Exception handlers are always uncommon, with the exception of finally. + int inner_clause = td->clause_indexes [td->current_il_offset]; + if (inner_clause != -1 && td->header->clauses [inner_clause].flags != MONO_EXCEPTION_CLAUSE_FINALLY) + return FALSE; + if (method->flags & METHOD_ATTRIBUTE_REQSECOBJ) /* Used to mark methods containing StackCrawlMark locals */ return FALSE; diff --git a/src/mono/mono/mini/interp/transform.h b/src/mono/mono/mini/interp/transform.h index 68f1399bee061c..d6e2a4f12cb837 100644 --- a/src/mono/mono/mini/interp/transform.h +++ b/src/mono/mono/mini/interp/transform.h @@ -221,6 +221,7 @@ typedef struct { guint global : 1; guint no_call_args : 1; guint simd : 1; // We use this flag to avoid addition of align field in InterpVar, for now + guint eh_var : 1; // This var is used inside a clause handler. It will not be in ssa form. 
guint no_ssa : 1; // Var is not in ssa form, not subject to optimizations guint il_global : 1; // Args and IL locals guint renamed_ssa_fixed : 1; // If true, ext_index points to InterpRenamedVar, otherwise to InterpRenamableVar @@ -346,6 +347,7 @@ typedef struct guint has_inlined_one_call : 1; guint need_ssa_retry : 1; guint disable_ssa : 1; + guint eh_vars_computed : 1; } TransformData; #define STACK_TYPE_I4 0 From df43e38e10385fdd69bfd882fb69ebc9273a3b05 Mon Sep 17 00:00:00 2001 From: Vlad Brezae Date: Mon, 27 Nov 2023 16:35:54 +0200 Subject: [PATCH 21/45] [mono][interp] Bring back optimizations for variables that are not in SSA form Before the SSA redesign we were doing value/copy propagation for all variables (except vars that had indirects) within a basic block. With SSA we can do such propagation that is not limited to a single basic block boundary. The problem is that SSA transformation might encounter some limitations and variables might not be in SSA form (such an example is locals referenced from exception handlers). Before this commit, these vars would not be in SSA form and therefore they would be ignored by optimizations. We aim to avoid any sort of regression of code quality. The way we generalize handling of both SSA vars and non-SSA vars during optimization passes, is by querying for the var value. For SSA vars, this is guaranteed to be set since we traverse the CFG in DFS order, so we always reach the dominating definition first. For non-SSA vars, we also set the defintion when the var is written to (overwritting it when the var is redefined) and this definition information can be used in the same way by the optimization passes. The only difference is that, while SSA definitions were unique, the non-SSA definitions aren't, so we can only make use of them if the liveness information of the definition matches the current bblock. 
We only prevent reordering stores to such variables and we only do value tracking within a single basic block, as we were doing before the SSA redesign. --- src/mono/mono/mini/interp/transform-opt.c | 237 +++++++++++++++------- src/mono/mono/mini/interp/transform.h | 3 +- 2 files changed, 168 insertions(+), 72 deletions(-) diff --git a/src/mono/mono/mini/interp/transform-opt.c b/src/mono/mono/mini/interp/transform-opt.c index 669198ef83674f..7ce04f4b7fd9b2 100644 --- a/src/mono/mono/mini/interp/transform-opt.c +++ b/src/mono/mono/mini/interp/transform-opt.c @@ -819,6 +819,51 @@ var_is_ssa_form (TransformData *td, int var) return TRUE; } +static gboolean +var_has_indirects (TransformData *td, int var) +{ + if (td->vars [var].has_indirects) + return TRUE; + + return FALSE; +} + +static InterpVarValue* +get_var_value (TransformData *td, int var) +{ + if (var_is_ssa_form (td, var)) + return &td->var_values [var]; + + if (var_has_indirects (td, var)) + return NULL; + + // No ssa var, check if we have a def set for the current bblock + if (td->var_values [var].def) { + if ((td->var_values [var].liveness >> INTERP_LIVENESS_INS_INDEX_BITS) == td->cbb->index) + return &td->var_values [var]; + } + return NULL; + +} + +static InterpInst* +get_var_value_def (TransformData *td, int var) +{ + InterpVarValue *val = get_var_value (td, var); + if (val) + return val->def; + return NULL; +} + +static int +get_var_value_type (TransformData *td, int var) +{ + InterpVarValue *val = get_var_value (td, var); + if (val) + return val->type; + return VAR_VALUE_NONE; +} + static void compute_global_var_cb (TransformData *td, int *pvar, gpointer data) { @@ -1771,8 +1816,6 @@ static void decrement_ref_count (TransformData *td, int *varp, gpointer data) { int var = *varp; - if (!var_is_ssa_form (td, var)) - return; td->var_values [var].ref_count--; // FIXME we could clear recursively if (!td->var_values [var].ref_count) @@ -1793,7 +1836,7 @@ interp_var_deadce (TransformData *td) if 
(MINT_NO_SIDE_EFFECTS (ins->opcode) || ins->opcode == MINT_LDLOCA_S) { int dreg = ins->dreg; - if (!var_is_ssa_form (td, dreg)) + if (var_has_indirects (td, dreg)) continue; if (!td->var_values [dreg].ref_count) { @@ -1895,9 +1938,11 @@ interp_fold_unop (TransformData *td, InterpInst *ins) // ins should be an unop, therefore it should have a single dreg and a single sreg int dreg = ins->dreg; int sreg = ins->sregs [0]; - InterpVarValue *val = &td->var_values [sreg]; + InterpVarValue *val = get_var_value (td, sreg); InterpVarValue result; + if (!val) + return ins; if (val->type != VAR_VALUE_I4 && val->type != VAR_VALUE_I8) return ins; @@ -1996,8 +2041,10 @@ interp_fold_unop_cond_br (TransformData *td, InterpBasicBlock *cbb, InterpInst * { // ins should be an unop conditional branch, therefore it should have a single sreg int sreg = ins->sregs [0]; - InterpVarValue *val = &td->var_values [sreg]; + InterpVarValue *val = get_var_value (td, sreg); + if (!val) + return ins; if (val->type != VAR_VALUE_I4 && val->type != VAR_VALUE_I8 && val->type != VAR_VALUE_NON_NULL) return ins; @@ -2066,12 +2113,14 @@ interp_fold_binop (TransformData *td, InterpInst *ins, gboolean *folded) int dreg = ins->dreg; int sreg1 = ins->sregs [0]; int sreg2 = ins->sregs [1]; - InterpVarValue *val1 = &td->var_values [sreg1]; - InterpVarValue *val2 = &td->var_values [sreg2]; + InterpVarValue *val1 = get_var_value (td, sreg1); + InterpVarValue *val2 = get_var_value (td, sreg2); InterpVarValue result; *folded = FALSE; + if (!val1 || !val2) + return ins; if (val1->type != VAR_VALUE_I4 && val1->type != VAR_VALUE_I8) return ins; if (val2->type != VAR_VALUE_I4 && val2->type != VAR_VALUE_I8) @@ -2188,9 +2237,11 @@ interp_fold_binop_cond_br (TransformData *td, InterpBasicBlock *cbb, InterpInst // ins should be a conditional binop, therefore it should have only two sregs int sreg1 = ins->sregs [0]; int sreg2 = ins->sregs [1]; - InterpVarValue *val1 = &td->var_values [sreg1]; - InterpVarValue *val2 = 
&td->var_values [sreg2]; + InterpVarValue *val1 = get_var_value (td, sreg1); + InterpVarValue *val2 = get_var_value (td, sreg2); + if (!val1 || !val2) + return ins; if (val1->type != VAR_VALUE_I4 && val1->type != VAR_VALUE_I8) return ins; if (val2->type != VAR_VALUE_I4 && val2->type != VAR_VALUE_I8) @@ -2254,7 +2305,9 @@ interp_fold_simd_create (TransformData *td, InterpBasicBlock *cbb, InterpInst *i int index = 0; int var = args [index]; while (var != -1) { - InterpVarValue *val = &td->var_values [var]; + InterpVarValue *val = get_var_value (td, var); + if (!val) + return ins; if (val->type != VAR_VALUE_I4 && val->type != VAR_VALUE_I8 && val->type != VAR_VALUE_R4) return ins; index++; @@ -2351,29 +2404,58 @@ static void cprop_svar (TransformData *td, InterpInst *ins, int *pvar, guint32 current_liveness) { int var = *pvar; - if (!var_is_ssa_form (td, var)) + if (var_has_indirects (td, var)) return; - InterpVarValue *val = &td->var_values [var]; - g_assert (val->type >= 0 && val->type < VAR_VALUE_COUNT); - if (val->type == VAR_VALUE_OTHER_VAR) { + InterpVarValue *val = get_var_value (td, var); + if (val && val->type == VAR_VALUE_OTHER_VAR) { + // var <- cprop_var; + // .... + // use var; int cprop_var = val->var; if (td->vars [var].renamed_ssa_fixed && !td->vars [cprop_var].renamed_ssa_fixed) { // ssa fixed vars are likely to live, keep using them val->ref_count++; - } else if (can_extend_var_liveness (td, cprop_var, current_liveness)) { - if (td->verbose_level) - g_print ("cprop %d -> %d:\n\t", var, cprop_var); - InterpVarValue *cprop_val = &td->var_values [cprop_var]; - cprop_val->ref_count++; - *pvar = cprop_var; - if (td->verbose_level) - interp_dump_ins (ins, td->data_items); } else { - val->ref_count++; + gboolean can_cprop = FALSE; + // If var is fixed ssa, we can extend liveness if it doesn't overlap with other renamed + // vars. If the var is not ssa, we do cprop only within the same bblock. 
+ if (var_is_ssa_form (td, cprop_var)) { + can_cprop = can_extend_var_liveness (td, cprop_var, current_liveness); + } else { + InterpVarValue *cprop_var_val = get_var_value (td, cprop_var); + gboolean var_def_in_cur_bb = (val->liveness >> INTERP_LIVENESS_INS_INDEX_BITS) == td->cbb->index; + if (!var_def_in_cur_bb) { + // var definition was not in current bblock so it might no longer contain + // the current value of cprop_var because cprop_var is not in ssa form and + // we don't keep track its value over multiple basic blocks + can_cprop = FALSE; + } else if (!cprop_var_val) { + // Previously in this bblock, var is recorded as having the value of cprop_var and + // cprop_var is not defined in the current bblock. This means that var will still + // contain the value of cprop_var + can_cprop = TRUE; + } else { + // Previously in this bblock, var is recorded as having the value of cprop_var and + // cprop_var is defined in the current bblock. This means that var will contain the + // value of cprop_var only if last known cprop_var redefinition was before the var definition. + can_cprop = cprop_var_val->liveness < val->liveness; + } + } + + if (can_cprop) { + if (td->verbose_level) + g_print ("cprop %d -> %d:\n\t", var, cprop_var); + td->var_values [cprop_var].ref_count++; + *pvar = cprop_var; + if (td->verbose_level) + interp_dump_ins (ins, td->data_items); + } else { + val->ref_count++; + } } } else { - val->ref_count++; + td->var_values [var].ref_count++; } // Mark the last use for a renamable fixed var @@ -2458,7 +2540,7 @@ interp_cprop (TransformData *td) // We always store to the full i4, except as part of STIND opcodes. 
These opcodes can be // applied to a local var only if that var has LDLOCA applied to it - if ((opcode >= MINT_MOV_I4_I1 && opcode <= MINT_MOV_I4_U2) && !td->vars [sregs [0]].indirects) { + if ((opcode >= MINT_MOV_I4_I1 && opcode <= MINT_MOV_I4_U2) && !var_has_indirects (td, sregs [0])) { ins->opcode = MINT_MOV_4; opcode = MINT_MOV_4; } @@ -2470,12 +2552,11 @@ interp_cprop (TransformData *td) g_print ("clear redundant mov\n"); interp_clear_ins (ins); td->var_values [sreg].ref_count--; - } else if (!var_is_ssa_form (td, sreg) || !var_is_ssa_form (td, dreg)) { + } else if (var_has_indirects (td, sreg) || var_has_indirects (td, dreg)) { // Don't bother with indirect locals - } else if (td->var_values [sreg].type == VAR_VALUE_I4 || td->var_values [sreg].type == VAR_VALUE_I8) { + } else if (get_var_value_type (td, sreg) == VAR_VALUE_I4 || get_var_value_type (td, sreg) == VAR_VALUE_I8) { // Replace mov with ldc gboolean is_i4 = td->var_values [sreg].type == VAR_VALUE_I4; - g_assert (!td->vars [sreg].indirects); td->var_values [dreg].type = td->var_values [sreg].type; if (is_i4) { int ct = td->var_values [sreg].i; @@ -2493,6 +2574,7 @@ interp_cprop (TransformData *td) interp_dump_ins (ins, td->data_items); } } else if (td->vars [dreg].renamed_ssa_fixed && !td->vars [sreg].renamed_ssa_fixed && + var_is_ssa_form (td, sreg) && td->vars [dreg].mt == td->vars [sreg].mt && // reordering moves might break conversions td->var_values [sreg].def->opcode != MINT_DEF_ARG && (td->var_values [sreg].liveness >> INTERP_LIVENESS_INS_INDEX_BITS) == bb->index) { @@ -2576,26 +2658,32 @@ interp_cprop (TransformData *td) if (!folded) { int sreg = -1; guint16 mov_op = 0; - if ((opcode == MINT_MUL_I4 || opcode == MINT_DIV_I4) && - td->var_values [ins->sregs [1]].type == VAR_VALUE_I4 && - td->var_values [ins->sregs [1]].i == 1) { - sreg = ins->sregs [0]; - mov_op = MINT_MOV_4; - } else if ((opcode == MINT_MUL_I8 || opcode == MINT_DIV_I8) && - td->var_values [ins->sregs [1]].type == VAR_VALUE_I8 
&& - td->var_values [ins->sregs [1]].l == 1) { - sreg = ins->sregs [0]; - mov_op = MINT_MOV_8; - } else if (opcode == MINT_MUL_I4 && - td->var_values [ins->sregs [0]].type == VAR_VALUE_I4 && - td->var_values [ins->sregs [0]].i == 1) { - sreg = ins->sregs [1]; - mov_op = MINT_MOV_4; - } else if (opcode == MINT_MUL_I8 && - td->var_values [ins->sregs [0]].type == VAR_VALUE_I8 && - td->var_values [ins->sregs [0]].l == 1) { - sreg = ins->sregs [1]; - mov_op = MINT_MOV_8; + InterpVarValue *vv0 = get_var_value (td, ins->sregs [0]); + InterpVarValue *vv1 = get_var_value (td, ins->sregs [1]); + if (vv1) { + if ((opcode == MINT_MUL_I4 || opcode == MINT_DIV_I4) && + vv1->type == VAR_VALUE_I4 && + vv1->i == 1) { + sreg = ins->sregs [0]; + mov_op = MINT_MOV_4; + } else if ((opcode == MINT_MUL_I8 || opcode == MINT_DIV_I8) && + vv1->type == VAR_VALUE_I8 && + vv1->l == 1) { + sreg = ins->sregs [0]; + mov_op = MINT_MOV_8; + } + } else if (vv0) { + if (opcode == MINT_MUL_I4 && + vv0->type == VAR_VALUE_I4 && + vv0->i == 1) { + sreg = ins->sregs [1]; + mov_op = MINT_MOV_4; + } else if (opcode == MINT_MUL_I8 && + vv0->type == VAR_VALUE_I8 && + vv0->l == 1) { + sreg = ins->sregs [1]; + mov_op = MINT_MOV_8; + } } if (sreg != -1) { ins->opcode = mov_op; @@ -2609,7 +2697,7 @@ interp_cprop (TransformData *td) } else if (MINT_IS_BINOP_CONDITIONAL_BRANCH (opcode)) { ins = interp_fold_binop_cond_br (td, bb, ins); } else if (MINT_IS_LDIND (opcode)) { - InterpInst *ldloca = td->var_values [sregs [0]].def; + InterpInst *ldloca = get_var_value_def (td, sregs [0]); if (ldloca != NULL && ldloca->opcode == MINT_LDLOCA_S) { int local = ldloca->sregs [0]; int mt = td->vars [local].mt; @@ -2637,7 +2725,7 @@ interp_cprop (TransformData *td) } } } else if (MINT_IS_LDFLD (opcode)) { - InterpInst *ldloca = td->var_values [sregs [0]].def; + InterpInst *ldloca = get_var_value_def (td, sregs [0]); if (ldloca != NULL && ldloca->opcode == MINT_LDLOCA_S) { int mt = ins->opcode - MINT_LDFLD_I1; int local = 
ldloca->sregs [0]; @@ -2676,7 +2764,7 @@ interp_cprop (TransformData *td) } } } else if (opcode == MINT_INITOBJ) { - InterpInst *ldloca = td->var_values [sregs [0]].def; + InterpInst *ldloca = get_var_value_def (td, sregs [0]); if (ldloca != NULL && ldloca->opcode == MINT_LDLOCA_S) { int size = ins->data [0]; int local = ldloca->sregs [0]; @@ -2696,7 +2784,7 @@ interp_cprop (TransformData *td) } } } else if (opcode == MINT_LDOBJ_VT) { - InterpInst *ldloca = td->var_values [sregs [0]].def; + InterpInst *ldloca = get_var_value_def (td, sregs [0]); if (ldloca != NULL && ldloca->opcode == MINT_LDLOCA_S) { int ldsize = ins->data [0]; int local = ldloca->sregs [0]; @@ -2723,7 +2811,7 @@ interp_cprop (TransformData *td) } } } else if (opcode == MINT_STOBJ_VT || opcode == MINT_STOBJ_VT_NOREF) { - InterpInst *ldloca = td->var_values [sregs [0]].def; + InterpInst *ldloca = get_var_value_def (td, sregs [0]); if (ldloca != NULL && ldloca->opcode == MINT_LDLOCA_S) { int stsize = ins->data [0]; int local = ldloca->sregs [0]; @@ -2742,7 +2830,7 @@ interp_cprop (TransformData *td) } } } else if (MINT_IS_STIND (opcode)) { - InterpInst *ldloca = td->var_values [sregs [0]].def; + InterpInst *ldloca = get_var_value_def (td, sregs [0]); if (ldloca != NULL && ldloca->opcode == MINT_LDLOCA_S) { int local = ldloca->sregs [0]; int mt = td->vars [local].mt; @@ -2761,7 +2849,7 @@ interp_cprop (TransformData *td) } } } else if (MINT_IS_STFLD (opcode)) { - InterpInst *ldloca = td->var_values [sregs [0]].def; + InterpInst *ldloca = get_var_value_def (td, sregs [0]); if (ldloca != NULL && ldloca->opcode == MINT_LDLOCA_S) { int mt = ins->opcode - MINT_STFLD_I1; int local = ldloca->sregs [0]; @@ -2810,7 +2898,7 @@ interp_cprop (TransformData *td) } } } else if (opcode == MINT_GETITEM_SPAN) { - InterpInst *ldloca = td->var_values [sregs [0]].def; + InterpInst *ldloca = get_var_value_def (td, sregs [0]); if (ldloca != NULL && ldloca->opcode == MINT_LDLOCA_S) { int local = ldloca->sregs [0]; // Allow 
ldloca instruction to be killed @@ -2820,7 +2908,7 @@ interp_cprop (TransformData *td) sregs [0] = local; } } else if (opcode == MINT_CKNULL) { - InterpInst *def = td->var_values [sregs [0]].def; + InterpInst *def = get_var_value_def (td, sregs [0]); if (def && def->opcode == MINT_LDLOCA_S) { // CKNULL on LDLOCA is a NOP ins->opcode = MINT_MOV_P; @@ -2842,11 +2930,12 @@ mono_test_interp_cprop (TransformData *td) static gboolean get_sreg_imm (TransformData *td, int sreg, gint16 *imm, int result_mt) { - if (!var_is_ssa_form (td, sreg)) + if (var_has_indirects (td, sreg)) + return FALSE; + InterpInst *def = get_var_value_def (td, sreg); + if (!def) return FALSE; InterpVarValue *sreg_val = &td->var_values [sreg]; - InterpInst *def = sreg_val->def; - g_assert (def); if (sreg_val->ref_count == 1) { gint64 ct; if (MINT_IS_LDC_I4 (def->opcode)) @@ -2972,6 +3061,12 @@ interp_super_instructions (TransformData *td) if (MINT_IS_NOP (opcode)) continue; + if (mono_interp_op_dregs [opcode] && !var_is_ssa_form (td, ins->dreg) && !var_has_indirects (td, ins->dreg)) { + InterpVarValue *dval = &td->var_values [ins->dreg]; + dval->type = VAR_VALUE_NONE; + dval->def = ins; + dval->liveness = bb->index << INTERP_LIVENESS_INS_INDEX_BITS; + } if (opcode == MINT_RET || (opcode >= MINT_RET_I1 && opcode <= MINT_RET_U2)) { // ldc + ret -> ret.imm int sreg = ins->sregs [0]; @@ -3044,7 +3139,7 @@ interp_super_instructions (TransformData *td) } } else if (opcode == MINT_MUL_I4_IMM || opcode == MINT_MUL_I8_IMM) { int sreg = ins->sregs [0]; - InterpInst *def = td->var_values [sreg].def; + InterpInst *def = get_var_value_def (td, sreg); if (def != NULL && td->var_values [sreg].ref_count == 1) { gboolean is_i4 = opcode == MINT_MUL_I4_IMM; if ((is_i4 && def->opcode == MINT_ADD_I4_IMM) || @@ -3084,7 +3179,7 @@ interp_super_instructions (TransformData *td) } } else if (opcode == MINT_SHL_I4 || opcode == MINT_SHL_I8) { int amount_var = ins->sregs [1]; - InterpInst *amount_def = td->var_values 
[amount_var].def; + InterpInst *amount_def = get_var_value_def (td, amount_var); if (amount_def != NULL && td->var_values [amount_var].ref_count == 1 && amount_def->opcode == MINT_AND_I4) { int mask_var = amount_def->sregs [1]; if (get_sreg_imm (td, mask_var, &imm, MINT_TYPE_I2)) { @@ -3119,7 +3214,7 @@ interp_super_instructions (TransformData *td) } else if (opcode == MINT_DIV_UN_I4 || opcode == MINT_DIV_UN_I8) { // ldc + div.un -> shr.imm int sreg_imm = ins->sregs [1]; - InterpInst *def = td->var_values [sreg_imm].def; + InterpInst *def = get_var_value_def (td, sreg_imm); if (def != NULL && td->var_values [sreg_imm].ref_count == 1) { int power2 = -1; if (MINT_IS_LDC_I4 (def->opcode)) { @@ -3152,7 +3247,7 @@ interp_super_instructions (TransformData *td) } } else if (MINT_IS_LDIND_INT (opcode)) { int sreg_base = ins->sregs [0]; - InterpInst *def = td->var_values [sreg_base].def; + InterpInst *def = get_var_value_def (td, sreg_base); if (def != NULL && td->var_values [sreg_base].ref_count == 1) { InterpInst *new_inst = NULL; if (def->opcode == MINT_ADD_P) { @@ -3181,7 +3276,7 @@ interp_super_instructions (TransformData *td) } } else if (MINT_IS_LDIND_OFFSET (opcode)) { int sreg_off = ins->sregs [1]; - InterpInst *def = td->var_values [sreg_off].def; + InterpInst *def = get_var_value_def (td, sreg_off); if (def != NULL && td->var_values [sreg_off].ref_count == 1) { if (def->opcode == MINT_MUL_P_IMM || def->opcode == MINT_ADD_P_IMM || def->opcode == MINT_ADD_MUL_P_IMM) { int ldind_offset_op = MINT_LDIND_OFFSET_ADD_MUL_IMM_I1 + (opcode - MINT_LDIND_OFFSET_I1); @@ -3218,7 +3313,7 @@ interp_super_instructions (TransformData *td) } } else if (MINT_IS_STIND_INT (opcode)) { int sreg_base = ins->sregs [0]; - InterpInst *def = td->var_values [sreg_base].def; + InterpInst *def = get_var_value_def (td, sreg_base); if (def != NULL && td->var_values [sreg_base].ref_count == 1) { InterpInst *new_inst = NULL; if (def->opcode == MINT_ADD_P) { @@ -3250,7 +3345,7 @@ 
interp_super_instructions (TransformData *td) // when inlining property accessors. We should have more advanced cknull removal // optimzations, so we can catch cases where instructions are not next to each other. int obj_sreg = ins->sregs [0]; - InterpInst *def = td->var_values [obj_sreg].def; + InterpInst *def = get_var_value_def (td, obj_sreg); if (def != NULL && def->opcode == MINT_CKNULL && interp_prev_ins (ins) == def && def->dreg == obj_sreg && td->var_values [obj_sreg].ref_count == 1) { if (td->verbose_level) { @@ -3301,7 +3396,7 @@ interp_super_instructions (TransformData *td) if (opcode == MINT_BRFALSE_I4 || opcode == MINT_BRTRUE_I4) { gboolean negate = opcode == MINT_BRFALSE_I4; int cond_sreg = ins->sregs [0]; - InterpInst *def = td->var_values [cond_sreg].def; + InterpInst *def = get_var_value_def (td, cond_sreg); if (def != NULL && td->var_values [cond_sreg].ref_count == 1) { int replace_opcode = -1; switch (def->opcode) { @@ -3361,7 +3456,7 @@ interp_super_instructions (TransformData *td) } } else if (opcode == MINT_STOBJ_VT_NOREF) { int sreg_src = ins->sregs [1]; - InterpInst *def = td->var_values [sreg_src].def; + InterpInst *def = get_var_value_def (td, sreg_src); if (def != NULL && interp_prev_ins (ins) == def && def->opcode == MINT_LDOBJ_VT && ins->data [0] == def->data [0] && td->var_values [sreg_src].ref_count == 1) { InterpInst *new_inst = interp_insert_ins (td, ins, MINT_CPOBJ_VT_NOREF); new_inst->sregs [0] = ins->sregs [0]; // dst @@ -3378,9 +3473,9 @@ interp_super_instructions (TransformData *td) } } else if (opcode == MINT_MOV_4 || opcode == MINT_MOV_8 || opcode == MINT_MOV_VT) { int sreg = ins->sregs [0]; - if (var_is_ssa_form (td, sreg) && td->var_values [sreg].ref_count == 1) { + InterpInst *def = get_var_value_def (td, sreg); + if (def && td->var_values [sreg].ref_count == 1) { // The svar is used only for this mov. 
Try to get the definition to store directly instead - InterpInst *def = td->var_values [sreg].def; if (def->opcode != MINT_DEF_ARG && def->opcode != MINT_PHI) { int dreg = ins->dreg; // if var is not ssa or it is a renamed fixed, then we can't replace the dreg diff --git a/src/mono/mono/mini/interp/transform.h b/src/mono/mono/mini/interp/transform.h index d6e2a4f12cb837..01793bc90d35b7 100644 --- a/src/mono/mono/mini/interp/transform.h +++ b/src/mono/mono/mini/interp/transform.h @@ -222,7 +222,8 @@ typedef struct { guint no_call_args : 1; guint simd : 1; // We use this flag to avoid addition of align field in InterpVar, for now guint eh_var : 1; // This var is used inside a clause handler. It will not be in ssa form. - guint no_ssa : 1; // Var is not in ssa form, not subject to optimizations + guint no_ssa : 1; // Var is not in ssa form, not subject to all optimizations + guint has_indirects : 1; // Var had ldloca applied to it, not subject to optimizations guint il_global : 1; // Args and IL locals guint renamed_ssa_fixed : 1; // If true, ext_index points to InterpRenamedVar, otherwise to InterpRenamableVar guint def_arg : 1; // Var is a result of MINT_DEF_ARG. This var will have to be renamed back to the original arg var From b312bb85deebcb67faacb0165dbbba1b1d6edefc Mon Sep 17 00:00:00 2001 From: Vlad Brezae Date: Wed, 29 Nov 2023 13:52:30 +0200 Subject: [PATCH 22/45] [mono][interp] Proper support for ssa disabled Given we will always end up with vars not in ssa form that we will still run optimizations on, we could just extend this to the whole method, not having any var in ssa form. This mode is equivalent with what we had before the SSA support, with all optimizations only operating on a basic block level. Instead of not running optimizations on methods with unsupported control flow, we can just avoid SSA transformations instead and still run code optimization. Add option to fully disable SSA as a configuration. 
--- src/mono/mono/mini/interp/interp.c | 2 + src/mono/mono/mini/interp/interp.h | 3 +- src/mono/mono/mini/interp/transform-opt.c | 49 +++++++++++++++++------ src/mono/mono/mini/interp/transform.h | 2 +- 4 files changed, 42 insertions(+), 14 deletions(-) diff --git a/src/mono/mono/mini/interp/interp.c b/src/mono/mono/mini/interp/interp.c index 0c6da62f172d43..eba7d1c0f57113 100644 --- a/src/mono/mono/mini/interp/interp.c +++ b/src/mono/mono/mini/interp/interp.c @@ -7927,6 +7927,8 @@ interp_parse_options (const char *options) else if (strncmp (arg, "jiterp", 6) == 0) opt = INTERP_OPT_JITERPRETER; #endif + else if (strncmp (arg, "ssa", 3) == 0) + opt = INTERP_OPT_SSA; else if (strncmp (arg, "all", 3) == 0) opt = ~INTERP_OPT_NONE; diff --git a/src/mono/mono/mini/interp/interp.h b/src/mono/mono/mini/interp/interp.h index 137b40e1dd49ec..742e93bf06e59a 100644 --- a/src/mono/mono/mini/interp/interp.h +++ b/src/mono/mono/mini/interp/interp.h @@ -41,7 +41,8 @@ enum { #if HOST_BROWSER INTERP_OPT_JITERPRETER = 64, #endif - INTERP_OPT_DEFAULT = INTERP_OPT_INLINE | INTERP_OPT_CPROP | INTERP_OPT_SUPER_INSTRUCTIONS | INTERP_OPT_BBLOCKS | INTERP_OPT_TIERING | INTERP_OPT_SIMD + INTERP_OPT_SSA = 128, + INTERP_OPT_DEFAULT = INTERP_OPT_INLINE | INTERP_OPT_CPROP | INTERP_OPT_SUPER_INSTRUCTIONS | INTERP_OPT_BBLOCKS | INTERP_OPT_TIERING | INTERP_OPT_SIMD | INTERP_OPT_SSA #if HOST_BROWSER | INTERP_OPT_JITERPRETER #endif diff --git a/src/mono/mono/mini/interp/transform-opt.c b/src/mono/mono/mini/interp/transform-opt.c index 7ce04f4b7fd9b2..e62469c832b376 100644 --- a/src/mono/mono/mini/interp/transform-opt.c +++ b/src/mono/mono/mini/interp/transform-opt.c @@ -1502,7 +1502,7 @@ interp_remove_bblock (TransformData *td, InterpBasicBlock *bb, InterpBasicBlock if (!td->vars [ins->sregs [0]].indirects) { if (td->verbose_level) g_print ("Remove bblock %d, var %d no longer indirect\n", bb->index, ins->sregs [0]); - td->need_ssa_retry = TRUE; + td->need_optimization_retry = TRUE; } } } @@ -1849,7 
+1849,7 @@ interp_var_deadce (TransformData *td) if (!td->vars [ins->sregs [0]].indirects) { if (td->verbose_level) g_print ("Kill ldloca, var %d no longer indirect\n", ins->sregs [0]); - td->need_ssa_retry = TRUE; + td->need_optimization_retry = TRUE; } } @@ -2493,7 +2493,8 @@ interp_cprop (TransformData *td) // Set cbb since we do some instruction inserting below td->cbb = bb; for (InterpInst *ins = bb->first_ins; ins != NULL; ins = ins->next) { - if (ins->flags & INTERP_INST_FLAG_LIVENESS_MARKER) + // LIVENESS_MARKER is set only for non-eh bblocks + if (bb->dfs_index >= td->bblocks_count || bb->dfs_index == -1 || (ins->flags & INTERP_INST_FLAG_LIVENESS_MARKER)) current_liveness++; if (interp_ins_is_nop (ins)) @@ -3506,12 +3507,29 @@ interp_super_instructions (TransformData *td) } } +static void +interp_prepare_no_ssa_opt (TransformData *td) +{ + for (unsigned int i = 0; i < td->vars_size; i++) { + td->vars [i].no_ssa = TRUE; + td->vars [i].has_indirects = (td->vars [i].indirects > 0) ? TRUE : FALSE; + } + + if (!td->bblocks) + td->bblocks = (InterpBasicBlock**)mono_mempool_alloc0 (td->mempool, sizeof (InterpBasicBlock*) * td->bb_count); + + int i = 0; + for (InterpBasicBlock *bb = td->entry_bb; bb != NULL; bb = bb->next_bb) { + td->bblocks [i] = bb; + i++; + } + td->bblocks_count = 0; + td->bblocks_count_eh = i; +} + void interp_optimize_code (TransformData *td) { - if (td->disable_ssa) - return; - // Give up on huge methods. We can easily work around this if decide to care. 
if (td->bb_count > ((1 << INTERP_LIVENESS_BB_INDEX_BITS) - 1)) return; @@ -3519,10 +3537,16 @@ interp_optimize_code (TransformData *td) if (mono_interp_opt & INTERP_OPT_BBLOCKS) MONO_TIME_TRACK (mono_interp_stats.optimize_bblocks_time, interp_optimize_bblocks (td)); -ssa_retry: - td->need_ssa_retry = FALSE; + if (!(mono_interp_opt & INTERP_OPT_SSA)) + td->disable_ssa = TRUE; + +optimization_retry: + td->need_optimization_retry = FALSE; - MONO_TIME_TRACK (mono_interp_stats.ssa_compute_time, interp_compute_ssa (td)); + if (td->disable_ssa) + interp_prepare_no_ssa_opt (td); + else + MONO_TIME_TRACK (mono_interp_stats.ssa_compute_time, interp_compute_ssa (td)); if (mono_interp_opt & INTERP_OPT_CPROP) MONO_TIME_TRACK (mono_interp_stats.cprop_time, interp_cprop (td)); @@ -3535,15 +3559,16 @@ interp_optimize_code (TransformData *td) (mono_interp_opt & INTERP_OPT_CPROP)) MONO_TIME_TRACK (mono_interp_stats.super_instructions_time, interp_super_instructions (td)); - interp_exit_ssa (td); + if (!td->disable_ssa) + interp_exit_ssa (td); if (mono_interp_opt & INTERP_OPT_BBLOCKS) MONO_TIME_TRACK (mono_interp_stats.optimize_bblocks_time, interp_optimize_bblocks (td)); - if (td->need_ssa_retry) { + if (td->need_optimization_retry) { if (td->verbose_level) g_print ("Retry method %s\n", mono_method_full_name (td->method, 1)); - goto ssa_retry; + goto optimization_retry; } if (td->verbose_level) { diff --git a/src/mono/mono/mini/interp/transform.h b/src/mono/mono/mini/interp/transform.h index 01793bc90d35b7..88352a457e93e6 100644 --- a/src/mono/mono/mini/interp/transform.h +++ b/src/mono/mono/mini/interp/transform.h @@ -346,7 +346,7 @@ typedef struct guint optimized : 1; guint has_invalid_code : 1; guint has_inlined_one_call : 1; - guint need_ssa_retry : 1; + guint need_optimization_retry : 1; guint disable_ssa : 1; guint eh_vars_computed : 1; } TransformData; From c30c8d5f5bd8c0a824906d965a2071a68e1e3cdd Mon Sep 17 00:00:00 2001 From: Vlad Brezae Date: Fri, 1 Dec 2023 19:10:33 +0200 
Subject: [PATCH 23/45] [mono][interp] Enable cprop dreg optimization for no-ssa vars This works in a limited fashion, only if the definition and the move are adjacent, as it was working before the SSA change. This is intended to improve codegen quality inside finally blocks. --- src/mono/mono/mini/interp/transform-opt.c | 112 +++++++++++++--------- 1 file changed, 69 insertions(+), 43 deletions(-) diff --git a/src/mono/mono/mini/interp/transform-opt.c b/src/mono/mono/mini/interp/transform-opt.c index e62469c832b376..666c8bca7f837f 100644 --- a/src/mono/mono/mini/interp/transform-opt.c +++ b/src/mono/mono/mini/interp/transform-opt.c @@ -2466,6 +2466,50 @@ cprop_svar (TransformData *td, InterpInst *ins, int *pvar, guint32 current_liven } } +static gboolean +can_cprop_dreg (TransformData *td, InterpInst *mov_ins) +{ + int dreg = mov_ins->dreg; + int sreg = mov_ins->sregs [0]; + + // sreg = def + // mov sreg -> dreg + + InterpVarValue *sreg_val = get_var_value (td, sreg); + if (!sreg_val) + return FALSE; + // We only apply this optimization if the definition is in the same bblock as this use + if ((sreg_val->liveness >> INTERP_LIVENESS_INS_INDEX_BITS) != td->cbb->index) + return FALSE; + if (td->var_values [sreg].def->opcode == MINT_DEF_ARG) + return FALSE; + // reordering moves might break conversions + if (td->vars [dreg].mt != td->vars [sreg].mt) + return FALSE; + + if (var_is_ssa_form (td, sreg)) { + // check if dreg is a renamed ssa fixed var (likely to remain alive) + if (td->vars [dreg].renamed_ssa_fixed && !td->vars [sreg].renamed_ssa_fixed) { + int last_use_liveness = td->renamable_vars [td->renamed_fixed_vars [td->vars [dreg].ext_index].renamable_var_ext_index].last_use_liveness; + if ((last_use_liveness >> INTERP_LIVENESS_INS_INDEX_BITS) != td->cbb->index || + sreg_val->liveness >= last_use_liveness) { + // No other conflicting renamed fixed vars (of dreg) are used in this bblock, or their + // last use predates the definition. 
This means we can tweak def of sreg to store directly + // into dreg and patch all intermediary instructions to use dreg instead. + return TRUE; + } + } + } else if (!var_is_ssa_form (td, dreg)) { + // Neither sreg nor dreg are in SSA form. IL globals are likely to remain alive + // We ensure that stores to no SSA vars, that are il globals, are not reordered. + // For simplicity, we apply the optimization only if the def and move are adjacent. + if (td->vars [dreg].il_global && !td->vars [sreg].il_global && mov_ins == interp_next_ins (sreg_val->def)) + return TRUE; + } + + return FALSE; +} + static void interp_cprop (TransformData *td) { @@ -2574,50 +2618,32 @@ interp_cprop (TransformData *td) g_print ("cprop loc %d -> ct :\n\t", sreg); interp_dump_ins (ins, td->data_items); } - } else if (td->vars [dreg].renamed_ssa_fixed && !td->vars [sreg].renamed_ssa_fixed && - var_is_ssa_form (td, sreg) && - td->vars [dreg].mt == td->vars [sreg].mt && // reordering moves might break conversions - td->var_values [sreg].def->opcode != MINT_DEF_ARG && - (td->var_values [sreg].liveness >> INTERP_LIVENESS_INS_INDEX_BITS) == bb->index) { - // dreg is a renamed ssa fixed var (likely to remain alive) and the definition of sreg - // is in this current bblock. - int last_use_liveness = td->renamable_vars [td->renamed_fixed_vars [td->vars [dreg].ext_index].renamable_var_ext_index].last_use_liveness; - if ((last_use_liveness >> INTERP_LIVENESS_INS_INDEX_BITS) != bb->index || - td->var_values [sreg].liveness >= last_use_liveness) { - // No other conflicting renamed fixed vars are used in this bblock, or their last use - // predates the definition. This means we can tweak def of sreg to store directly - // into dreg and patch all intermediary instructions to use dreg instead. 
- int dreg_ref_count = td->var_values [dreg].ref_count; - td->var_values [dreg] = td->var_values [sreg]; - td->var_values [dreg].ref_count = dreg_ref_count; - td->var_values [dreg].def->dreg = dreg; + } else if (can_cprop_dreg (td, ins)) { + int dreg_ref_count = td->var_values [dreg].ref_count; + td->var_values [dreg] = td->var_values [sreg]; + td->var_values [dreg].ref_count = dreg_ref_count; + td->var_values [dreg].def->dreg = dreg; - if (td->verbose_level) { - g_print ("cprop fixed dreg %d:\n\t", dreg); - interp_dump_ins (td->var_values [dreg].def, td->data_items); - } - // Overwrite all uses of sreg with dreg up to this point - replace_svar_uses (td, td->var_values [dreg].def->next, ins, sreg, dreg); - - // Transform `mov dreg <- sreg` into `mov sreg <- dreg` in case sreg is still used - ins->dreg = sreg; - ins->sregs [0] = dreg; - td->var_values [dreg].ref_count++; - td->var_values [sreg].ref_count--; - - td->var_values [sreg].def = ins; - td->var_values [sreg].type = VAR_VALUE_OTHER_VAR; - td->var_values [sreg].var = dreg; - td->var_values [sreg].liveness = current_liveness; - if (td->verbose_level) { - g_print ("\t"); - interp_dump_ins (ins, td->data_items); - } - } else { - if (td->verbose_level) - g_print ("local copy %d <- %d\n", dreg, sreg); - td->var_values [dreg].type = VAR_VALUE_OTHER_VAR; - td->var_values [dreg].var = sreg; + if (td->verbose_level) { + g_print ("cprop fixed dreg %d:\n\t", dreg); + interp_dump_ins (td->var_values [dreg].def, td->data_items); + } + // Overwrite all uses of sreg with dreg up to this point + replace_svar_uses (td, td->var_values [dreg].def->next, ins, sreg, dreg); + + // Transform `mov dreg <- sreg` into `mov sreg <- dreg` in case sreg is still used + ins->dreg = sreg; + ins->sregs [0] = dreg; + td->var_values [dreg].ref_count++; + td->var_values [sreg].ref_count--; + + td->var_values [sreg].def = ins; + td->var_values [sreg].type = VAR_VALUE_OTHER_VAR; + td->var_values [sreg].var = dreg; + td->var_values [sreg].liveness 
= current_liveness; + if (td->verbose_level) { + g_print ("\t"); + interp_dump_ins (ins, td->data_items); } } else { if (td->verbose_level) From c9d086f0f538bb952a08905d78473c726ac0bfdb Mon Sep 17 00:00:00 2001 From: Vlad Brezae Date: Tue, 5 Dec 2023 13:39:42 +0200 Subject: [PATCH 24/45] [mono][interp] Fix handling of BBs with patchpoint data during optimization Basic blocks that have patchpoint data need to remain alive in optimized method if the associated patchpoint is reachable from unoptimized method, so the tiering mechanism can resolve the new ip offset. We were handling this in a hacky fashion, by forcefully keeping these bblocks alive, even if they weren't reachable in the optimized CFG. This was disturbing for the SSA transformation algorithms. We stop hardcoding this during dead bblock detection and instead avoid bblock reordering optimizations if they can end up killing bblocks with patchpoint data. Turns out this scenario was very rare anyway. --- src/mono/mono/mini/interp/transform-opt.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/src/mono/mono/mini/interp/transform-opt.c b/src/mono/mono/mini/interp/transform-opt.c index 666c8bca7f837f..9df3111acb11d4 100644 --- a/src/mono/mono/mini/interp/transform-opt.c +++ b/src/mono/mono/mini/interp/transform-opt.c @@ -1563,7 +1563,7 @@ interp_mark_reachable_bblocks (TransformData *td) // FIXME There is no need to force eh bblocks to remain alive current = td->entry_bb; while (current != NULL) { - if (current->eh_block || current->patchpoint_data) { + if (current->eh_block) { queue [next_position++] = current; current->reachable = TRUE; } else { @@ -1678,6 +1678,12 @@ interp_reorder_bblocks (TransformData *td) for (bb = td->entry_bb; bb != NULL; bb = bb->next_bb) { if (bb->eh_block) continue; + // We do optimizations below where we reduce the in count of bb, but it is ideal to have + // this bblock remain alive so we can correctly resolve mapping from unoptimized method. 
+ // We could in theory address this and attempt to remove bb, but this scenario is extremely + // rare and doesn't seem worth the investment. + if (bb->patchpoint_data) + continue; InterpInst *first = interp_first_ins (bb); if (!first) continue; From 4ccb8be1373fc1a6469eb1b20422b2f0ce894d95 Mon Sep 17 00:00:00 2001 From: Vlad Brezae Date: Wed, 6 Dec 2023 12:51:58 +0200 Subject: [PATCH 25/45] [mono][interp] Fix diverging var offsets between tiered and untiered methods Patchpoints introduce some complications since some variables have to be accessed from same offset between unoptimized and optimized methods. Consider the following scenario BB0 -> BB2 BB0: TMP <- def; IL_VAR <- TMP | ^ _ BB1: Use IL_VAR v | /| BB2: Use IL_VAR BB1 BB1 is a basic block containing a patchpoint, BB0 dominates both BB1 and BB2. IL_VAR is used both in BB1 and BB2. In BB1, in optimized code, we could normally replace use of IL_VAR with use of TMP. However, this is incorrect, because TMP can be allocated at a different offset from IL_VAR and, if we enter the method from the patchpoint in BB1, the data at var TMP would not be initialized since we only copy the IL var space. Even if we prevent the copy propagation in BB1, then tiering is still broken. In BB2 we could replace use of IL_VAR with TMP, and we end up hitting the same problem. Optimized code will attempt to access value of IL_VAR from the offset of TMP_VAR, which is not initialized if we enter from the patchpoint in BB1. We solve these issues by inserting a MINT_DEF_TIER_VAR in BB1. This instruction prevents cprop of the IL_VAR in the patchpoint bblock since MINT_DEF_TIER_VAR is seen as a redefinition. In addition to that, in BB2 we now have 2 reaching definitions for IL_VAR, the original one from BB0 and the one from patchpoint bblock from BB1. This will force a phi definition in BB2 and we will once again be forced to access IL_VAR from the original offset that is equal to the one in unoptimized method. 
--- src/mono/mono/mini/interp/mintops.def | 1 + src/mono/mono/mini/interp/transform-opt.c | 55 ++++++++++++++++++++--- src/mono/mono/mini/interp/transform.c | 2 +- 3 files changed, 51 insertions(+), 7 deletions(-) diff --git a/src/mono/mono/mini/interp/mintops.def b/src/mono/mono/mini/interp/mintops.def index db603185546320..41da680ee47ba8 100644 --- a/src/mono/mono/mini/interp/mintops.def +++ b/src/mono/mono/mini/interp/mintops.def @@ -835,6 +835,7 @@ OPDEF(MINT_TIER_MONITOR_JITERPRETER, "tier_monitor_jiterpreter", 4, 0, 0, MintOp IROPDEF(MINT_NOP, "nop", 1, 0, 0, MintOpNoArgs) IROPDEF(MINT_DEF, "def", 2, 1, 0, MintOpNoArgs) IROPDEF(MINT_DEF_ARG, "def_arg", 2, 1, 0, MintOpNoArgs) +IROPDEF(MINT_DEF_TIER_VAR, "def_tier_var", 3, 1, 1, MintOpNoArgs) IROPDEF(MINT_IL_SEQ_POINT, "il_seq_point", 1, 0, 0, MintOpNoArgs) IROPDEF(MINT_DUMMY_USE, "dummy_use", 2, 0, 1, MintOpNoArgs) IROPDEF(MINT_TIER_PATCHPOINT_DATA, "tier_patchpoint_data", 2, 0, 0, MintOpShortInt) diff --git a/src/mono/mono/mini/interp/transform-opt.c b/src/mono/mono/mini/interp/transform-opt.c index 9df3111acb11d4..e28df00c797e39 100644 --- a/src/mono/mono/mini/interp/transform-opt.c +++ b/src/mono/mono/mini/interp/transform-opt.c @@ -1100,17 +1100,56 @@ insert_phi_nodes (TransformData *td) // Additional fixed vars, in addition to vars that are args to phi nodes static void -compute_fixed_vars (TransformData *td) +insert_tiering_defs (TransformData *td) { for (int i = 0; i < td->bblocks_count; i++) { InterpBasicBlock *bb = td->bblocks [i]; if (!bb->patchpoint_bb) continue; + // All IL locals live at entry to this bb have to be fixed for (unsigned int k = 0; k < td->renamable_vars_size; k++) { int var_index = td->renamable_vars [k].var_index; - if (td->vars [var_index].il_global && mono_bitset_test_fast (bb->live_in_set, k)) + if (td->vars [var_index].il_global && mono_bitset_test_fast (bb->live_in_set, k)) { td->renamable_vars [k].ssa_fixed = TRUE; + + // Patchpoints introduce some complications since some 
variables have to be + // accessed from same offset between unoptimized and optimized methods. + // + // Consider the following scenario + // BB0 -> BB2 BB0: TMP <- def; IL_VAR <- TMP + // | ^ BB1: Use IL_VAR + // v | BB2: Use IL_VAR + // BB1 + // + // BB1 is a basic block containing a patchpoint, BB0 dominates both BB1 and BB2. + // IL_VAR is used both in BB1 and BB2. In BB1, in optimized code, we could normally + // replace use of IL_VAR with use of TMP. However, this is incorrect, because TMP + // can be allocated at a different offset from IL_VAR and, if we enter the method + // from the patchpoint in BB1, the data at var TMP would not be initialized since + // we only copy the IL var space. + // Even if we prevent the copy propagation in BB1, then tiering is still broken. + // In BB2 we could replace use of IL_VAR with TMP, and we end up hitting the same problem. + // Optimized code will attempt to access value of IL_VAR from the offset of TMP_VAR, + // which is not initialized if we enter from the patchpoint in BB1. + // We solve these issues by inserting a MINT_DEF_TIER_VAR in BB1. This instruction + // prevents cprop of the IL_VAR in the patchpoint bblock since MINT_DEF_TIER_VAR is seen + // as a redefinition. In addition to that, in BB2 we now have 2 reaching definitions for + // IL_VAR, the original one from BB0 and the one from patchpoint bblock from BB1. This + // will force a phi definition in BB2 and we will once again be forced to access IL_VAR + // from the original offset that is equal to the one in unoptimized method. + InterpInst *def = interp_insert_ins_bb (td, bb, NULL, MINT_DEF_TIER_VAR); + def->sregs [0] = var_index; + def->dreg = var_index; + InterpVar *var_data = &td->vars [var_index]; + // Record the new declaration for this var. 
Phi nodes insertion phase will account for this + if (!g_slist_find (var_data->declare_bbs, bb)) + var_data->declare_bbs = g_slist_prepend (var_data->declare_bbs, bb); + if (td->verbose_level) { + g_print ("insert patchpoint var define in BB%d:\n\t", bb->index); + interp_dump_ins (def, td->data_items); + } + } } } } @@ -1300,9 +1339,9 @@ interp_compute_ssa (TransformData *td) MONO_TIME_TRACK (mono_interp_stats.ssa_compute_pruned_liveness_time, interp_compute_pruned_ssa_liveness (td)); - insert_phi_nodes (td); + insert_tiering_defs (td); - compute_fixed_vars (td); + insert_phi_nodes (td); MONO_TIME_TRACK (mono_interp_stats.ssa_rename_vars_time, rename_vars (td)); @@ -2559,7 +2598,11 @@ interp_cprop (TransformData *td) if (td->verbose_level) interp_dump_ins (ins, td->data_items); - if (num_sregs) { + if (opcode == MINT_DEF_TIER_VAR) { + // We can't do any var propagation into this instruction since it will be deleted + // dreg and sreg should always be identical, a ssa fixed var. + td->var_values [sregs [0]].ref_count++; + } else if (num_sregs) { for (int i = 0; i < num_sregs; i++) { if (sregs [i] == MINT_CALL_ARGS_SREG) { if (ins->info.call_info && ins->info.call_info->call_args) { @@ -3509,7 +3552,7 @@ interp_super_instructions (TransformData *td) InterpInst *def = get_var_value_def (td, sreg); if (def && td->var_values [sreg].ref_count == 1) { // The svar is used only for this mov. 
Try to get the definition to store directly instead - if (def->opcode != MINT_DEF_ARG && def->opcode != MINT_PHI) { + if (def->opcode != MINT_DEF_ARG && def->opcode != MINT_PHI && def->opcode != MINT_DEF_TIER_VAR) { int dreg = ins->dreg; // if var is not ssa or it is a renamed fixed, then we can't replace the dreg // since there can be conflicting liveness, unless the instructions are adjacent diff --git a/src/mono/mono/mini/interp/transform.c b/src/mono/mono/mini/interp/transform.c index 6f8eb40cdd8b41..a646a0307420fd 100644 --- a/src/mono/mono/mini/interp/transform.c +++ b/src/mono/mono/mini/interp/transform.c @@ -8463,7 +8463,7 @@ emit_compacted_instruction (TransformData *td, guint16* start_ip, InterpInst *in g_array_append_val (td->line_numbers, lne); } - if (opcode == MINT_NOP || opcode == MINT_DEF || opcode == MINT_DEF_ARG || opcode == MINT_DUMMY_USE) + if (opcode == MINT_NOP || opcode == MINT_DEF || opcode == MINT_DEF_TIER_VAR || opcode == MINT_DEF_ARG || opcode == MINT_DUMMY_USE) return ip; *ip++ = opcode; From 82e51243f647b16e6ebfb1503290cbb82a5fcdb4 Mon Sep 17 00:00:00 2001 From: Vlad Brezae Date: Wed, 13 Dec 2023 12:11:32 +0200 Subject: [PATCH 26/45] [mono][interp] Retry instruction during cprop when replacing with MOV Consider the example of: x = ldloca a; y = cknull x; z = ldind y. The cknull instruction was replaced with: y <- x but we didn't handle the cprop implication of this. We now redo the cprop checks on the replaced instruction. This will mark y as a copy of x so `z = ldind y` gets replaced with `z = ldind x`, enabling us to detect that ldind is applied to a ldloca def and optimize out all instructions. 
--- src/mono/mono/mini/interp/transform-opt.c | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/src/mono/mono/mini/interp/transform-opt.c b/src/mono/mono/mini/interp/transform-opt.c index e28df00c797e39..5d4fb25bdbd154 100644 --- a/src/mono/mono/mini/interp/transform-opt.c +++ b/src/mono/mono/mini/interp/transform-opt.c @@ -2582,6 +2582,9 @@ interp_cprop (TransformData *td) // Set cbb since we do some instruction inserting below td->cbb = bb; for (InterpInst *ins = bb->first_ins; ins != NULL; ins = ins->next) { + int opcode, num_sregs, num_dregs; + gint32 *sregs; + gint32 dreg; // LIVENESS_MARKER is set only for non-eh bblocks if (bb->dfs_index >= td->bblocks_count || bb->dfs_index == -1 || (ins->flags & INTERP_INST_FLAG_LIVENESS_MARKER)) current_liveness++; @@ -2589,11 +2592,12 @@ interp_cprop (TransformData *td) if (interp_ins_is_nop (ins)) continue; - int opcode = ins->opcode; - int num_sregs = mono_interp_op_sregs [opcode]; - int num_dregs = mono_interp_op_dregs [opcode]; - gint32 *sregs = &ins->sregs [0]; - gint32 dreg = ins->dreg; +retry_instruction: + opcode = ins->opcode; + num_sregs = mono_interp_op_sregs [opcode]; + num_dregs = mono_interp_op_dregs [opcode]; + sregs = &ins->sregs [0]; + dreg = ins->dreg; if (td->verbose_level) interp_dump_ins (ins, td->data_items); @@ -2762,12 +2766,15 @@ interp_cprop (TransformData *td) } } if (sreg != -1) { + td->var_values [ins->sregs [0]].ref_count--; + td->var_values [ins->sregs [1]].ref_count--; ins->opcode = mov_op; ins->sregs [0] = sreg; if (td->verbose_level) { g_print ("Replace idempotent binop :\n\t"); interp_dump_ins (ins, td->data_items); } + goto retry_instruction; } } } else if (MINT_IS_BINOP_CONDITIONAL_BRANCH (opcode)) { @@ -2988,6 +2995,8 @@ interp_cprop (TransformData *td) if (def && def->opcode == MINT_LDLOCA_S) { // CKNULL on LDLOCA is a NOP ins->opcode = MINT_MOV_P; + td->var_values [ins->sregs [0]].ref_count--; + goto retry_instruction; } } else if (opcode == 
MINT_BOX) { // TODO Add more relevant opcodes From 0c576b835b4f1b61a28c09c235962c0059f0678c Mon Sep 17 00:00:00 2001 From: Vlad Brezae Date: Wed, 13 Dec 2023 17:18:42 +0200 Subject: [PATCH 27/45] [mono][interp] Rename get_local_offset to get_var_offset --- src/mono/mono/mini/interp/transform.c | 55 +++++++++++++-------------- 1 file changed, 27 insertions(+), 28 deletions(-) diff --git a/src/mono/mono/mini/interp/transform.c b/src/mono/mono/mini/interp/transform.c index a646a0307420fd..9a9187e97ccfc4 100644 --- a/src/mono/mono/mini/interp/transform.c +++ b/src/mono/mono/mini/interp/transform.c @@ -8424,10 +8424,10 @@ get_short_brop (int opcode) } static int -get_local_offset (TransformData *td, int local) +get_var_offset (TransformData *td, int var) { - if (td->vars [local].offset != -1) - return td->vars [local].offset; + if (td->vars [var].offset != -1) + return td->vars [var].offset; // FIXME Some vars might end up with unitialized offset because they are not declared at all in the code. 
// This can happen if the bblock declaring the var gets removed, while other unreachable bblocks, that access @@ -8439,10 +8439,10 @@ get_local_offset (TransformData *td, int local) // If we use the optimized offset allocator, all locals should have had their offsets already allocated g_assert (!td->optimized); // The only remaining locals to allocate are the ones from the execution stack - g_assert (td->vars [local].execution_stack); + g_assert (td->vars [var].execution_stack); - td->vars [local].offset = td->total_locals_size + td->vars [local].stack_offset; - return td->vars [local].offset; + td->vars [var].offset = td->total_locals_size + td->vars [var].stack_offset; + return td->vars [var].offset; } static guint16* @@ -8469,7 +8469,7 @@ emit_compacted_instruction (TransformData *td, guint16* start_ip, InterpInst *in *ip++ = opcode; if (opcode == MINT_SWITCH) { int labels = READ32 (&ins->data [0]); - *ip++ = GINT_TO_UINT16 (get_local_offset (td, ins->sregs [0])); + *ip++ = GINT_TO_UINT16 (get_var_offset (td, ins->sregs [0])); // Write number of switch labels *ip++ = ins->data [0]; *ip++ = ins->data [1]; @@ -8497,7 +8497,7 @@ emit_compacted_instruction (TransformData *td, guint16* start_ip, InterpInst *in const int br_offset = GPTRDIFF_TO_INT (start_ip - td->new_code); gboolean has_imm = opcode >= MINT_BEQ_I4_IMM_SP && opcode <= MINT_BLT_UN_I8_IMM_SP; for (int i = 0; i < mono_interp_op_sregs [opcode]; i++) - *ip++ = GINT_TO_UINT16 (get_local_offset (td, ins->sregs [i])); + *ip++ = GINT_TO_UINT16 (get_var_offset (td, ins->sregs [i])); if (has_imm) *ip++ = ins->data [0]; @@ -8566,17 +8566,16 @@ emit_compacted_instruction (TransformData *td, guint16* start_ip, InterpInst *in guint16 fsize = ins->data [2]; ip--; - if (opcode == MINT_MOV_DST_OFF && get_local_offset (td, ins->dreg) != get_local_offset (td, ins->sregs [1])) { + if (opcode == MINT_MOV_DST_OFF && get_var_offset (td, ins->dreg) != get_var_offset (td, ins->sregs [1])) { // We are no longer storing a field 
into the same valuetype. Copy also the whole vt. *ip++ = MINT_MOV_VT; - - *ip++ = GINT_TO_UINT16 (get_local_offset (td, ins->dreg)); - *ip++ = GINT_TO_UINT16 (get_local_offset (td, ins->sregs [1])); + *ip++ = GINT_TO_UINT16 (get_var_offset (td, ins->dreg)); + *ip++ = GINT_TO_UINT16 (get_var_offset (td, ins->sregs [1])); *ip++ = GINT_TO_UINT16 (td->vars [ins->dreg].size); } - int dest_off = get_local_offset (td, ins->dreg); - int src_off = get_local_offset (td, ins->sregs [0]); + int dest_off = get_var_offset (td, ins->dreg); + int src_off = get_var_offset (td, ins->sregs [0]); if (opcode == MINT_MOV_SRC_OFF) src_off += foff; else @@ -8634,7 +8633,7 @@ emit_compacted_instruction (TransformData *td, guint16* start_ip, InterpInst *in // actually vars. Resolve their offset int num_vars = mono_interp_oplen [opcode] - 1; for (int i = 0; i < num_vars; i++) - *ip++ = GINT_TO_UINT16 (get_local_offset (td, ins->data [i])); + *ip++ = GINT_TO_UINT16 (get_var_offset (td, ins->data [i])); } else if (opcode == MINT_MOV_STACK_UNOPT) { g_assert (!td->optimized); // ins->data [0] represents the stack offset of the call args (within the execution stack) @@ -8648,10 +8647,10 @@ emit_compacted_instruction (TransformData *td, guint16* start_ip, InterpInst *in g_assert (var0 == ins->data [0]); g_assert (var1 == ins->data [1]); - *ip++ = GINT_TO_UINT16 (get_local_offset (td, var0)); - *ip++ = GINT_TO_UINT16 (get_local_offset (td, var1)); - *ip++ = GINT_TO_UINT16 (get_local_offset (td, var0)); - *ip++ = GINT_TO_UINT16 (get_local_offset (td, var1)); + *ip++ = GINT_TO_UINT16 (get_var_offset (td, var0)); + *ip++ = GINT_TO_UINT16 (get_var_offset (td, var1)); + *ip++ = GINT_TO_UINT16 (get_var_offset (td, var0)); + *ip++ = GINT_TO_UINT16 (get_var_offset (td, var1)); } else if (opcode == MINT_INTRINS_MARVIN_BLOCK_SSA1) { int var0 = ins->sregs [0]; int var1 = ins->sregs [1]; @@ -8661,27 +8660,27 @@ emit_compacted_instruction (TransformData *td, guint16* start_ip, InterpInst *in int dvar0 = 
ins->dreg; int dvar1 = ins->next->dreg; ip [-1] = MINT_INTRINS_MARVIN_BLOCK; - *ip++ = GINT_TO_UINT16 (get_local_offset (td, var0)); - *ip++ = GINT_TO_UINT16 (get_local_offset (td, var1)); - *ip++ = GINT_TO_UINT16 (get_local_offset (td, dvar0)); - *ip++ = GINT_TO_UINT16 (get_local_offset (td, dvar1)); + *ip++ = GINT_TO_UINT16 (get_var_offset (td, var0)); + *ip++ = GINT_TO_UINT16 (get_var_offset (td, var1)); + *ip++ = GINT_TO_UINT16 (get_var_offset (td, dvar0)); + *ip++ = GINT_TO_UINT16 (get_var_offset (td, dvar1)); ins->next->opcode = MINT_NOP; InterpInst *next = interp_next_ins (ins); // We ensure that next->sregs [0] is not used again, it will no longer be set by intrinsic if (next->opcode == MINT_MOV_4 && td->var_values && td->var_values [next->sregs [0]].ref_count == 1) { if (next->sregs [0] == dvar0) { - ip [-2] = GINT_TO_UINT16 (get_local_offset (td, next->dreg)); + ip [-2] = GINT_TO_UINT16 (get_var_offset (td, next->dreg)); next->opcode = MINT_NOP; } else if (next->sregs [0] == dvar1) { - ip [-1] = GINT_TO_UINT16 (get_local_offset (td, next->dreg)); + ip [-1] = GINT_TO_UINT16 (get_var_offset (td, next->dreg)); next->opcode = MINT_NOP; } } } else { opcode_emit: if (mono_interp_op_dregs [opcode]) - *ip++ = GINT_TO_UINT16 (get_local_offset (td, ins->dreg)); + *ip++ = GINT_TO_UINT16 (get_var_offset (td, ins->dreg)); if (mono_interp_op_sregs [opcode]) { for (int i = 0; i < mono_interp_op_sregs [opcode]; i++) { @@ -8689,12 +8688,12 @@ emit_compacted_instruction (TransformData *td, guint16* start_ip, InterpInst *in int offset = td->param_area_offset + ins->info.call_info->call_offset; *ip++ = GINT_TO_UINT16 (offset); } else { - *ip++ = GINT_TO_UINT16 (get_local_offset (td, ins->sregs [i])); + *ip++ = GINT_TO_UINT16 (get_var_offset (td, ins->sregs [i])); } } } else if (opcode == MINT_LDLOCA_S) { // This opcode receives a local but it is not viewed as a sreg since we don't load the value - *ip++ = GINT_TO_UINT16 (get_local_offset (td, ins->sregs [0])); + *ip++ = 
GINT_TO_UINT16 (get_var_offset (td, ins->sregs [0])); } int left = interp_get_ins_length (ins) - GPTRDIFF_TO_INT(ip - start_ip); From 5d4737385874392c4db588ab26fadacf4a581b37 Mon Sep 17 00:00:00 2001 From: Vlad Brezae Date: Wed, 13 Dec 2023 17:20:26 +0200 Subject: [PATCH 28/45] [mono][interp] Squash multiple INITLOCAL into single INITLOCALS In the old non-ssa implementation, we were initializing the entire IL locals space with INITLOCALS. With the SSA redesign, we generate an explicit INITLOCAL for each IL local so we can have proper definitions for vars and optimize these unused defs out. Most of the time all these INITLOCAL instructions are optimized out, but sometimes, some of them might linger around, if the associated vars are not converted to ssa. If we have adjacent INITLOCAL we squash them together into a single INITLOCALS instruction that memsets more chunks at once. --- src/mono/mono/mini/interp/transform.c | 35 +++++++++++++++++++++++++++ 1 file changed, 35 insertions(+) diff --git a/src/mono/mono/mini/interp/transform.c b/src/mono/mono/mini/interp/transform.c index 9a9187e97ccfc4..fdbf257ee58cdb 100644 --- a/src/mono/mono/mini/interp/transform.c +++ b/src/mono/mono/mini/interp/transform.c @@ -8851,6 +8851,40 @@ interp_fix_localloc_ret (TransformData *td) } } +static void +interp_squash_initlocals (TransformData *td) +{ + InterpInst *last_initlocal = NULL; + int last_start = 0, last_end = 0; + + for (InterpInst *ins = td->entry_bb->first_ins; ins != NULL; ins = ins->next) { + // Once we reach the real method code, we are finished with this pass + if (ins->il_offset != -1) + break; + if (ins->opcode == MINT_INITLOCAL) { + if (!last_initlocal) { + last_initlocal = ins; + last_start = get_var_offset (td, ins->dreg); + last_end = last_start + (int)ins->data [0]; + } else { + int new_start = get_var_offset (td, ins->dreg); + // We allow a maximum of 64 bytes of redundant memset when squashing initlocals + if (new_start >= last_end && new_start <= (last_end + 64)) { + 
last_initlocal->opcode = MINT_INITLOCALS; + last_initlocal->data [0] = GINT_TO_UINT16 (last_start); + last_end = new_start + ins->data [0]; + last_initlocal->data [1] = GINT_TO_UINT16 (last_end - last_start); + interp_clear_ins (ins); + } else { + last_initlocal = ins; + last_start = get_var_offset (td, ins->dreg); + last_end = last_start + ins->data [0]; + } + } + } + } +} + static int get_native_offset (TransformData *td, int il_offset) { @@ -8972,6 +9006,7 @@ generate (MonoMethod *method, MonoMethodHeader *header, InterpMethod *rtm, MonoG if (mono_interp_opt & INTERP_OPT_JITERPRETER) jiterp_insert_entry_points (rtm, td); #endif + interp_squash_initlocals (td); } generate_compacted_code (rtm, td); From 80e189b16f8372969f6bd7e94979fa4084e00179 Mon Sep 17 00:00:00 2001 From: Vlad Brezae Date: Thu, 14 Dec 2023 16:23:47 +0200 Subject: [PATCH 29/45] [mono][interp] Fix cprop dreg with newobj MINT_NEWOBJ publishes the object before the ctor is actually run. If newobj is guarded, make sure we don't publish the object to a global var, before the ctor ran successfully. --- src/mono/mono/mini/interp/transform-opt.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/mono/mono/mini/interp/transform-opt.c b/src/mono/mono/mini/interp/transform-opt.c index 5d4fb25bdbd154..7f27180bea425e 100644 --- a/src/mono/mono/mini/interp/transform-opt.c +++ b/src/mono/mono/mini/interp/transform-opt.c @@ -2528,6 +2528,8 @@ can_cprop_dreg (TransformData *td, InterpInst *mov_ins) return FALSE; if (td->var_values [sreg].def->opcode == MINT_DEF_ARG) return FALSE; + if (sreg_val->def->flags & INTERP_INST_FLAG_PROTECTED_NEWOBJ) + return FALSE; // reordering moves might break conversions if (td->vars [dreg].mt != td->vars [sreg].mt) return FALSE; @@ -3561,7 +3563,8 @@ interp_super_instructions (TransformData *td) InterpInst *def = get_var_value_def (td, sreg); if (def && td->var_values [sreg].ref_count == 1) { // The svar is used only for this mov. 
Try to get the definition to store directly instead - if (def->opcode != MINT_DEF_ARG && def->opcode != MINT_PHI && def->opcode != MINT_DEF_TIER_VAR) { + if (def->opcode != MINT_DEF_ARG && def->opcode != MINT_PHI && def->opcode != MINT_DEF_TIER_VAR && + !(def->flags & INTERP_INST_FLAG_PROTECTED_NEWOBJ)) { int dreg = ins->dreg; // if var is not ssa or it is a renamed fixed, then we can't replace the dreg // since there can be conflicting liveness, unless the instructions are adjacent From c34c1d9184112fdab8de47f9853075ab37168d3d Mon Sep 17 00:00:00 2001 From: Vlad Brezae Date: Wed, 20 Dec 2023 12:36:40 +0200 Subject: [PATCH 30/45] [mono][interp] Fix u1 narrow simd intrinsic The opcode was overly complicated in an attempt to avoid redundant copying. The implementation was however missing the simple case where result and both arguments are allocated at the same offset. Just compute the result in a separate variable and then do the whole copying. --- src/mono/mono/mini/interp/interp-simd.c | 44 ++++--------------------- 1 file changed, 6 insertions(+), 38 deletions(-) diff --git a/src/mono/mono/mini/interp/interp-simd.c b/src/mono/mono/mini/interp/interp-simd.c index edaa7615652e52..cdc6a951c5f92a 100644 --- a/src/mono/mono/mini/interp/interp-simd.c +++ b/src/mono/mono/mini/interp/interp-simd.c @@ -315,47 +315,15 @@ interp_v128_u2_widen_upper (gpointer res, gpointer v1) static void interp_v128_u1_narrow (gpointer res, gpointer v1, gpointer v2) { - guint8 *res_typed = (guint8*)res; + guint8 res_typed [SIZEOF_V128]; guint16 *v1_typed = (guint16*)v1; guint16 *v2_typed = (guint16*)v2; - if (res != v2) { - res_typed [0] = v1_typed [0]; - res_typed [1] = v1_typed [1]; - res_typed [2] = v1_typed [2]; - res_typed [3] = v1_typed [3]; - res_typed [4] = v1_typed [4]; - res_typed [5] = v1_typed [5]; - res_typed [6] = v1_typed [6]; - res_typed [7] = v1_typed [7]; - - res_typed [8] = v2_typed [0]; - res_typed [9] = v2_typed [1]; - res_typed [10] = v2_typed [2]; - res_typed [11] = 
v2_typed [3]; - res_typed [12] = v2_typed [4]; - res_typed [13] = v2_typed [5]; - res_typed [14] = v2_typed [6]; - res_typed [15] = v2_typed [7]; - } else { - res_typed [15] = v2_typed [7]; - res_typed [14] = v2_typed [6]; - res_typed [13] = v2_typed [5]; - res_typed [12] = v2_typed [4]; - res_typed [11] = v2_typed [3]; - res_typed [10] = v2_typed [2]; - res_typed [9] = v2_typed [1]; - res_typed [8] = v2_typed [0]; - - res_typed [0] = v1_typed [0]; - res_typed [1] = v1_typed [1]; - res_typed [2] = v1_typed [2]; - res_typed [3] = v1_typed [3]; - res_typed [4] = v1_typed [4]; - res_typed [5] = v1_typed [5]; - res_typed [6] = v1_typed [6]; - res_typed [7] = v1_typed [7]; - } + for (int i = 0; i < 8; i++) + res_typed [i] = v1_typed [i]; + for (int i = 0; i < 8; i++) + res_typed [i + 8] = v2_typed [i]; + memcpy (res, res_typed, SIZEOF_V128); } // GreaterThan From ec7b3a20387b84aa649682e103e3ca3e16f02840 Mon Sep 17 00:00:00 2001 From: Vlad Brezae Date: Wed, 20 Dec 2023 20:33:05 +0200 Subject: [PATCH 31/45] [mono][interp] Attempt to remove bblocks from EH We normally can't remove bblocks coming from EH. Before this change all try, handler and leave target bblocks were treated as roots. One reason for keeping them alive is that we need to map the bblock to the native offset required by the EH infrastructure. This can be revisited later but this commit uses a less invasive approach. We introduce the condition that EH handlers are live only if the try bblock is live. If a bblock with EH implications is not reachable, we still keep it alive but we clear all instructions from it. The bblock will just serve as a marker and will be ignored by any optimization passes. This will reduce the code size and also potentially improve performance by not having vars referenced from unreachable code. 
--- src/mono/mono/mini/interp/transform-opt.c | 78 ++++++++++++++++------- src/mono/mono/mini/interp/transform.c | 17 +++-- src/mono/mono/mini/interp/transform.h | 4 +- 3 files changed, 67 insertions(+), 32 deletions(-) diff --git a/src/mono/mono/mini/interp/transform-opt.c b/src/mono/mono/mini/interp/transform-opt.c index 7f27180bea425e..8110b4e91984c1 100644 --- a/src/mono/mono/mini/interp/transform-opt.c +++ b/src/mono/mono/mini/interp/transform-opt.c @@ -554,14 +554,15 @@ interp_compute_dfs_indexes (TransformData *td) // Visit also bblocks reachable from eh handlers. These bblocks are not linked // to the main cfg (where we do dominator computation, ssa transformation etc) - for (int i = 0; i < td->header->num_clauses; i++) { - MonoExceptionClause *c = td->header->clauses + i; - InterpBasicBlock *bb = td->offset_to_bb [c->handler_offset]; - dfs_visit (bb, &dfs_index, td->bblocks); - - if (c->flags == MONO_EXCEPTION_CLAUSE_FILTER) { - bb = td->offset_to_bb [c->data.filter_offset]; - dfs_visit (bb, &dfs_index, td->bblocks); + if (td->header->num_clauses > 0) { + InterpBasicBlock *current = td->entry_bb; + while (current != NULL) { + if (current->reachable && current->dfs_index == -1) { + current->dfs_index = dfs_index; + td->bblocks [dfs_index] = current; + dfs_index++; + } + current = current->next_bb; } } td->bblocks_count_eh = dfs_index; @@ -592,8 +593,7 @@ dom_intersect (InterpBasicBlock **idoms, InterpBasicBlock *bb1, InterpBasicBlock static gboolean is_bblock_ssa_cfg (TransformData *td, InterpBasicBlock *bb) { - // FIXME Don't mark leave target as eh_block - // g_assert (bb->dfs_index != -1); + // bblocks with uninitialized dfs_index are unreachable if (bb->dfs_index == -1) return FALSE; if (bb->dfs_index < td->bblocks_count) @@ -756,6 +756,8 @@ compute_eh_var_cb (TransformData *td, int *pvar, gpointer data) static void interp_compute_eh_vars (TransformData *td) { + // FIXME we can now remove EH bblocks. 
This means some vars can stop being EH vars + // EH bblocks are stored separately and are not reachable from the non-EF control flow // path. Any var reachable from EH bblocks will not be in SSA form. for (int i = td->bblocks_count; i < td->bblocks_count_eh; i++) { @@ -1533,7 +1535,7 @@ interp_unlink_bblocks (InterpBasicBlock *from, InterpBasicBlock *to) } static void -interp_remove_bblock (TransformData *td, InterpBasicBlock *bb, InterpBasicBlock *prev_bb) +interp_handle_unreachable_bblock (TransformData *td, InterpBasicBlock *bb) { for (InterpInst *ins = bb->first_ins; ins != NULL; ins = ins->next) { if (ins->opcode == MINT_LDLOCA_S) { @@ -1544,7 +1546,17 @@ interp_remove_bblock (TransformData *td, InterpBasicBlock *bb, InterpBasicBlock td->need_optimization_retry = TRUE; } } + + // If preserve is set, even if we know this bblock is unreachable, we still have to keep + // it alive (for now at least). We just remove all instructions from it in this case. + if (bb->preserve) + interp_clear_ins (ins); } +} + +static void +interp_remove_bblock (TransformData *td, InterpBasicBlock *bb, InterpBasicBlock *prev_bb) +{ while (bb->in_count) interp_unlink_bblocks (bb->in_bb [0], bb); while (bb->out_count) @@ -1599,21 +1611,16 @@ interp_mark_reachable_bblocks (TransformData *td) int cur_index = 0; int next_position = 0; - // FIXME There is no need to force eh bblocks to remain alive current = td->entry_bb; while (current != NULL) { - if (current->eh_block) { - queue [next_position++] = current; - current->reachable = TRUE; - } else { - current->reachable = FALSE; - } + current->reachable = FALSE; current = current->next_bb; } queue [next_position++] = td->entry_bb; td->entry_bb->reachable = TRUE; +retry: // We have the roots, traverse everything else while (cur_index < next_position) { current = queue [cur_index++]; @@ -1625,6 +1632,23 @@ interp_mark_reachable_bblocks (TransformData *td) } } } + + if (td->header->num_clauses) { + gboolean needs_retry = FALSE; + current = 
td->entry_bb; + while (current != NULL) { + if (current->try_bblock && !current->reachable && current->try_bblock->reachable) { + // Try bblock is reachable and the handler is not yet marked + queue [next_position++] = current; + current->reachable = TRUE; + needs_retry = TRUE; + } + current = current->next_bb; + } + + if (needs_retry) + goto retry; + } } /** @@ -1715,7 +1739,7 @@ interp_reorder_bblocks (TransformData *td) { InterpBasicBlock *bb; for (bb = td->entry_bb; bb != NULL; bb = bb->next_bb) { - if (bb->eh_block) + if (bb->preserve) continue; // We do optimizations below where we reduce the in count of bb, but it is ideal to have // this bblock remain alive so we can correctly resolve mapping from unoptimized method. @@ -1841,11 +1865,17 @@ interp_optimize_bblocks (TransformData *td) if (!next_bb) break; if (!next_bb->reachable) { - if (td->verbose_level) - g_print ("Removed BB%d\n", next_bb->index); - interp_remove_bblock (td, next_bb, bb); - continue; - } else if (bb->out_count == 1 && bb->out_bb [0] == next_bb && next_bb->in_count == 1 && !next_bb->eh_block && !next_bb->patchpoint_data) { + interp_handle_unreachable_bblock (td, next_bb); + if (next_bb->preserve) { + if (td->verbose_level) + g_print ("Removed BB%d, cleared instructions only\n", next_bb->index); + } else { + if (td->verbose_level) + g_print ("Removed BB%d\n", next_bb->index); + interp_remove_bblock (td, next_bb, bb); + continue; + } + } else if (bb->out_count == 1 && bb->out_bb [0] == next_bb && next_bb->in_count == 1 && !next_bb->preserve && !next_bb->patchpoint_data) { g_assert (next_bb->in_bb [0] == bb); interp_merge_bblocks (td, bb, next_bb); if (td->verbose_level) diff --git a/src/mono/mono/mini/interp/transform.c b/src/mono/mono/mini/interp/transform.c index fdbf257ee58cdb..f08036042b3982 100644 --- a/src/mono/mono/mini/interp/transform.c +++ b/src/mono/mono/mini/interp/transform.c @@ -4622,21 +4622,22 @@ initialize_clause_bblocks (TransformData *td) for (guint i = 0; i < 
header->num_clauses; i++) { MonoExceptionClause *c = header->clauses + i; - InterpBasicBlock *bb; + InterpBasicBlock *try_bb, *bb; for (uint32_t j = c->handler_offset; j < c->handler_offset + c->handler_len; j++) { if (td->clause_indexes [j] == -1) td->clause_indexes [j] = i; } - bb = td->offset_to_bb [c->try_offset]; - g_assert (bb); - bb->eh_block = TRUE; + try_bb = td->offset_to_bb [c->try_offset]; + g_assert (try_bb); + try_bb->preserve = TRUE; /* We never inline methods with clauses, so we can hard code stack heights */ bb = td->offset_to_bb [c->handler_offset]; g_assert (bb); - bb->eh_block = TRUE; + bb->preserve = TRUE; + bb->try_bblock = try_bb; if (c->flags == MONO_EXCEPTION_CLAUSE_FINALLY) { bb->stack_height = 0; @@ -4652,7 +4653,9 @@ initialize_clause_bblocks (TransformData *td) if (c->flags == MONO_EXCEPTION_CLAUSE_FILTER) { bb = td->offset_to_bb [c->data.filter_offset]; g_assert (bb); - bb->eh_block = TRUE; + bb->preserve = TRUE; + bb->try_bblock = try_bb; + bb->stack_height = 1; bb->stack_state = (StackInfo*) mono_mempool_alloc0 (td->mempool, sizeof (StackInfo)); bb->stack_state [0].type = STACK_TYPE_O; @@ -7549,7 +7552,7 @@ generate_code (TransformData *td, MonoMethod *method, MonoMethodHeader *header, } else { handle_branch (td, MINT_BR, target_offset); } - td->last_ins->info.target_bb->eh_block = TRUE; + td->last_ins->info.target_bb->preserve = TRUE; if (*td->ip == CEE_LEAVE) td->ip += 5; diff --git a/src/mono/mono/mini/interp/transform.h b/src/mono/mono/mini/interp/transform.h index 88352a457e93e6..e418632dd87947 100644 --- a/src/mono/mono/mini/interp/transform.h +++ b/src/mono/mono/mini/interp/transform.h @@ -144,13 +144,15 @@ struct _InterpBasicBlock { int index; int jump_targets; + InterpBasicBlock *try_bblock; + // This will hold a list of last sequence points of incoming basic blocks SeqPoint **pred_seq_points; guint num_pred_seq_points; guint reachable : 1; // This block has special semantics and it shouldn't be optimized away - guint 
eh_block : 1; + guint preserve : 1; guint dead: 1; // This bblock is detectead early as being dead, we don't inline into it guint no_inlining: 1; From cc660c4f25c36941f3d81c53957032ba312d273e Mon Sep 17 00:00:00 2001 From: Vlad Brezae Date: Thu, 21 Dec 2023 12:34:54 +0200 Subject: [PATCH 32/45] [mono][interp] Remove bblock count limit for optimization support We were stuffing both bblock index and instruction index inside a guint32. Turns out this is naive, since even some hot code from our BCL can exceed this limit (e.g. SpanHelpers.IndexOfAny). Remove this limit by using a struct with a dedicated int32 for each index. It is unfortunate that we now need to allocate the index pair from the mempool separately, if we want to store it in a GSList, but this is relatively uncommon. --- src/mono/mono/mini/interp/transform-opt.c | 58 +++++++++++------------ src/mono/mono/mini/interp/transform.h | 16 +++---- 2 files changed, 36 insertions(+), 38 deletions(-) diff --git a/src/mono/mono/mini/interp/transform-opt.c b/src/mono/mono/mini/interp/transform-opt.c index 8110b4e91984c1..d81c3c12304ac9 100644 --- a/src/mono/mono/mini/interp/transform-opt.c +++ b/src/mono/mono/mini/interp/transform-opt.c @@ -841,7 +841,7 @@ get_var_value (TransformData *td, int var) // No ssa var, check if we have a def set for the current bblock if (td->var_values [var].def) { - if ((td->var_values [var].liveness >> INTERP_LIVENESS_INS_INDEX_BITS) == td->cbb->index) + if (td->var_values [var].liveness.bb_index == td->cbb->index) return &td->var_values [var]; } return NULL; @@ -1221,14 +1221,16 @@ rename_vars_in_bb (TransformData *td, InterpBasicBlock *bb) break; } - guint32 current_liveness = bb->index << INTERP_LIVENESS_INS_INDEX_BITS; + InterpLivenessPosition current_liveness; + current_liveness.bb_index = bb->index; + current_liveness.ins_index = 0; // Use renamed definition for sources for (; ins != NULL; ins = ins->next) { if (interp_ins_is_nop (ins)) continue; ins->flags |= 
INTERP_INST_FLAG_LIVENESS_MARKER; - current_liveness++; + current_liveness.ins_index++; interp_foreach_ins_svar (td, ins, NULL, rename_ins_var_cb); if (!mono_interp_op_dregs [ins->opcode] || td->vars [ins->dreg].ext_index == -1) @@ -1245,7 +1247,9 @@ rename_vars_in_bb (TransformData *td, InterpBasicBlock *bb) int renamed_var = (int)(gsize)td->renamable_vars [renamable_ext_index].ssa_stack->data; g_assert (td->vars [renamed_var].renamed_ssa_fixed); int renamed_var_ext = td->vars [renamed_var].ext_index; - td->renamed_fixed_vars [renamed_var_ext].live_limit_bblocks = g_slist_prepend (td->renamed_fixed_vars [renamed_var_ext].live_limit_bblocks, (gpointer)(gsize)current_liveness); + InterpLivenessPosition *liveness_ptr = (InterpLivenessPosition*)mono_mempool_alloc (td->mempool, sizeof (InterpLivenessPosition)); + *liveness_ptr = current_liveness; + td->renamed_fixed_vars [renamed_var_ext].live_limit_bblocks = g_slist_prepend (td->renamed_fixed_vars [renamed_var_ext].live_limit_bblocks, liveness_ptr); } ins->dreg = get_renamed_var (td, ins->dreg, FALSE); } @@ -1312,12 +1316,8 @@ rename_vars (TransformData *td) g_print ("\tLIVE LIMIT BBLOCKS: {"); GSList *live_limit_bblocks = td->renamed_fixed_vars [i].live_limit_bblocks; while (live_limit_bblocks) { - guint32 live_limit = (guint32)(gsize)live_limit_bblocks->data; - int bb_index = live_limit >> INTERP_LIVENESS_INS_INDEX_BITS; - int inst_index = live_limit & INTERP_LIVENESS_INS_INDEX_MASK; - - g_print (" (BB%d, %d)", bb_index, inst_index); - + InterpLivenessPosition *live_limit = (InterpLivenessPosition*)live_limit_bblocks->data; + g_print (" (BB%d, %d)", live_limit->bb_index, live_limit->ins_index); live_limit_bblocks = live_limit_bblocks->next; } g_print (" }\n"); @@ -2422,23 +2422,22 @@ interp_fold_simd_create (TransformData *td, InterpBasicBlock *cbb, InterpInst *i } static gboolean -can_extend_var_liveness (TransformData *td, int var, guint32 liveness) +can_extend_var_liveness (TransformData *td, int var, 
InterpLivenessPosition cur_liveness) { if (!td->vars [var].renamed_ssa_fixed) return TRUE; InterpRenamedFixedVar *fixed_var_ext = &td->renamed_fixed_vars [td->vars [var].ext_index]; - int cur_bb = liveness >> INTERP_LIVENESS_INS_INDEX_BITS; // If var was already live at the end of this bblocks, there is no liveness extension happening - if (fixed_var_ext->live_out_bblocks && mono_bitset_test_fast (fixed_var_ext->live_out_bblocks, cur_bb)) + if (fixed_var_ext->live_out_bblocks && mono_bitset_test_fast (fixed_var_ext->live_out_bblocks, cur_liveness.bb_index)) return TRUE; GSList *bb_liveness = fixed_var_ext->live_limit_bblocks; while (bb_liveness) { - guint32 liveness_limit = (guint32)(gsize)bb_liveness->data; - if (cur_bb == (liveness_limit >> INTERP_LIVENESS_INS_INDEX_BITS)) { - if (liveness <= liveness_limit) + InterpLivenessPosition *liveness_limit = (InterpLivenessPosition*)bb_liveness->data; + if (cur_liveness.bb_index == liveness_limit->bb_index) { + if (cur_liveness.ins_index <= liveness_limit->ins_index) return TRUE; else return FALSE; @@ -2476,7 +2475,7 @@ replace_svar_uses (TransformData *td, InterpInst *first, InterpInst *last, int o } static void -cprop_svar (TransformData *td, InterpInst *ins, int *pvar, guint32 current_liveness) +cprop_svar (TransformData *td, InterpInst *ins, int *pvar, InterpLivenessPosition current_liveness) { int var = *pvar; if (var_has_indirects (td, var)) @@ -2499,7 +2498,7 @@ cprop_svar (TransformData *td, InterpInst *ins, int *pvar, guint32 current_liven can_cprop = can_extend_var_liveness (td, cprop_var, current_liveness); } else { InterpVarValue *cprop_var_val = get_var_value (td, cprop_var); - gboolean var_def_in_cur_bb = (val->liveness >> INTERP_LIVENESS_INS_INDEX_BITS) == td->cbb->index; + gboolean var_def_in_cur_bb = val->liveness.bb_index == td->cbb->index; if (!var_def_in_cur_bb) { // var definition was not in current bblock so it might no longer contain // the current value of cprop_var because cprop_var is not in ssa 
form and @@ -2514,7 +2513,8 @@ cprop_svar (TransformData *td, InterpInst *ins, int *pvar, guint32 current_liven // Previously in this bblock, var is recorded as having the value of cprop_var and // cprop_var is defined in the current bblock. This means that var will contain the // value of cprop_var only if last known cprop_var redefinition was before the var definition. - can_cprop = cprop_var_val->liveness < val->liveness; + g_assert (cprop_var_val->liveness.bb_index == val->liveness.bb_index); + can_cprop = cprop_var_val->liveness.ins_index < val->liveness.ins_index; } } @@ -2554,7 +2554,7 @@ can_cprop_dreg (TransformData *td, InterpInst *mov_ins) if (!sreg_val) return FALSE; // We only apply this optimization if the definition is in the same bblock as this use - if ((sreg_val->liveness >> INTERP_LIVENESS_INS_INDEX_BITS) != td->cbb->index) + if (sreg_val->liveness.bb_index != td->cbb->index) return FALSE; if (td->var_values [sreg].def->opcode == MINT_DEF_ARG) return FALSE; @@ -2567,9 +2567,9 @@ can_cprop_dreg (TransformData *td, InterpInst *mov_ins) if (var_is_ssa_form (td, sreg)) { // check if dreg is a renamed ssa fixed var (likely to remain alive) if (td->vars [dreg].renamed_ssa_fixed && !td->vars [sreg].renamed_ssa_fixed) { - int last_use_liveness = td->renamable_vars [td->renamed_fixed_vars [td->vars [dreg].ext_index].renamable_var_ext_index].last_use_liveness; - if ((last_use_liveness >> INTERP_LIVENESS_INS_INDEX_BITS) != td->cbb->index || - sreg_val->liveness >= last_use_liveness) { + InterpLivenessPosition last_use_liveness = td->renamable_vars [td->renamed_fixed_vars [td->vars [dreg].ext_index].renamable_var_ext_index].last_use_liveness; + if (last_use_liveness.bb_index != td->cbb->index || + sreg_val->liveness.ins_index >= last_use_liveness.ins_index) { // No other conflicting renamed fixed vars (of dreg) are used in this bblock, or their // last use predates the definition. 
This means we can tweak def of sreg to store directly // into dreg and patch all intermediary instructions to use dreg instead. @@ -2610,7 +2610,9 @@ interp_cprop (TransformData *td) g_string_free (bb_info, TRUE); } - guint32 current_liveness = bb->index << INTERP_LIVENESS_INS_INDEX_BITS; + InterpLivenessPosition current_liveness; + current_liveness.bb_index = bb->index; + current_liveness.ins_index = 0; // Set cbb since we do some instruction inserting below td->cbb = bb; for (InterpInst *ins = bb->first_ins; ins != NULL; ins = ins->next) { @@ -2619,7 +2621,7 @@ interp_cprop (TransformData *td) gint32 dreg; // LIVENESS_MARKER is set only for non-eh bblocks if (bb->dfs_index >= td->bblocks_count || bb->dfs_index == -1 || (ins->flags & INTERP_INST_FLAG_LIVENESS_MARKER)) - current_liveness++; + current_liveness.ins_index++; if (interp_ins_is_nop (ins)) continue; @@ -3182,7 +3184,7 @@ interp_super_instructions (TransformData *td) InterpVarValue *dval = &td->var_values [ins->dreg]; dval->type = VAR_VALUE_NONE; dval->def = ins; - dval->liveness = bb->index << INTERP_LIVENESS_INS_INDEX_BITS; + dval->liveness.bb_index = bb->index; // only to check if defined in current bblock } if (opcode == MINT_RET || (opcode >= MINT_RET_I1 && opcode <= MINT_RET_U2)) { // ldc + ret -> ret.imm @@ -3647,10 +3649,6 @@ interp_prepare_no_ssa_opt (TransformData *td) void interp_optimize_code (TransformData *td) { - // Give up on huge methods. We can easily work around this if decide to care. 
- if (td->bb_count > ((1 << INTERP_LIVENESS_BB_INDEX_BITS) - 1)) - return; - if (mono_interp_opt & INTERP_OPT_BBLOCKS) MONO_TIME_TRACK (mono_interp_stats.optimize_bblocks_time, interp_optimize_bblocks (td)); diff --git a/src/mono/mono/mini/interp/transform.h b/src/mono/mono/mini/interp/transform.h index e418632dd87947..bf7d982d935d0a 100644 --- a/src/mono/mono/mini/interp/transform.h +++ b/src/mono/mono/mini/interp/transform.h @@ -18,11 +18,6 @@ // are added in the code, since new instructions won't have this flag set. #define INTERP_INST_FLAG_LIVENESS_MARKER 256 -#define INTERP_LIVENESS_INS_INDEX_BITS 18 -#define INTERP_LIVENESS_BB_INDEX_BITS (8 * sizeof (gint32) - INTERP_LIVENESS_INS_INDEX_BITS) -#define INTERP_LIVENESS_INS_INDEX_MASK ((1 << INTERP_LIVENESS_INS_INDEX_BITS) - 1) -#define INTERP_LIVENESS_BB_INDEX_MASK (((1 << INTERP_LIVENESS_BB_INDEX_BITS) - 1) << INTERP_LIVENESS_INS_INDEX_BITS) - typedef struct _InterpInst InterpInst; typedef struct _InterpBasicBlock InterpBasicBlock; typedef struct _InterpCallInfo InterpCallInfo; @@ -51,6 +46,11 @@ typedef struct #define VAR_VALUE_NON_NULL 5 #define VAR_VALUE_COUNT 6 +typedef struct { + guint32 bb_index; + guint32 ins_index; +} InterpLivenessPosition; + typedef struct { // Indicates the type of the stored information. It can be another var or a constant int type; @@ -64,7 +64,7 @@ typedef struct { // The instruction that writes this local. InterpInst *def; // Liveness marker of the definition - guint32 liveness; + InterpLivenessPosition liveness; // The number of times this var is referenced. After optimizations // this can become 0, in which case we can clear the def instruction. int ref_count; @@ -237,7 +237,7 @@ typedef struct { // This liveness is bblock only. 
It is used during cprop to determine whether we // can move the definition of a renamed fixed var earlier (if there are no conflicts with // other renamed vars from the same var) - guint32 last_use_liveness; + InterpLivenessPosition last_use_liveness; // Var that is global and might take part in phi opcodes guint ssa_global : 1; @@ -255,7 +255,7 @@ typedef struct { // Bit set of bblocks where the renamed var is live at the bb end // This means that within these bblocks we can freely increase the var liveness MonoBitSet *live_out_bblocks; - // This is a list of (bb_index, inst_index), that indicates that in bblock with + // This is a list of InterpLivenessPosition*, that indicates that in bblock with // index bb_index, the var can have its liveness extended to at most inst_index GSList *live_limit_bblocks; } InterpRenamedFixedVar; From 3e2ee711b9693c161fe5b6a13ff23815325c30c6 Mon Sep 17 00:00:00 2001 From: Vlad Brezae Date: Mon, 25 Dec 2023 21:37:12 +0200 Subject: [PATCH 33/45] [mono][interp] Improve native offset estimation The condition for native offset estimation is for it to be conservative, meaning it is bigger or equal than the real code size that is generated. When checking if we need to introduce a long opcode, we were comparing the real offset of the bblock with the estimation of the target bblock. The problem is that in huge methods, the estimation error could accumulate and we would end up with long opcodes. We make the estimation more precise by accounting for the accumulated error. This actually uncovered an existing bug. For branch superinstructions we are missing the long version, so adding a long offset relocation leads to broken code. 
--- src/mono/mono/mini/interp/mintops.h | 1 + src/mono/mono/mini/interp/transform.c | 12 +++++++++--- 2 files changed, 10 insertions(+), 3 deletions(-) diff --git a/src/mono/mono/mini/interp/mintops.h b/src/mono/mono/mini/interp/mintops.h index b75d321479111d..73b767daccb094 100644 --- a/src/mono/mono/mini/interp/mintops.h +++ b/src/mono/mono/mini/interp/mintops.h @@ -209,6 +209,7 @@ typedef enum { #define MINT_SWITCH_LEN(n) (4 + (n) * 2) #define MINT_IS_NOP(op) ((op) == MINT_NOP || (op) == MINT_DEF || (op) == MINT_DEF_ARG || (op) == MINT_DUMMY_USE || (op) == MINT_IL_SEQ_POINT) +#define MINT_IS_EMIT_NOP(op) ((op) == MINT_NOP || (op) == MINT_DEF || (op) == MINT_DEF_ARG || (op) == MINT_DEF_TIER_VAR || (op) == MINT_DUMMY_USE) #define MINT_IS_MOV(op) ((op) >= MINT_MOV_I4_I1 && (op) <= MINT_MOV_VT) #define MINT_IS_UNCONDITIONAL_BRANCH(op) ((op) >= MINT_BR && (op) <= MINT_CALL_HANDLER_S) #define MINT_IS_CONDITIONAL_BRANCH(op) ((op) >= MINT_BRFALSE_I4 && (op) <= MINT_BLT_UN_R8_S) diff --git a/src/mono/mono/mini/interp/transform.c b/src/mono/mono/mini/interp/transform.c index f08036042b3982..28522da0228542 100644 --- a/src/mono/mono/mini/interp/transform.c +++ b/src/mono/mono/mini/interp/transform.c @@ -8378,7 +8378,7 @@ interp_compute_native_offset_estimates (TransformData *td) for (ins = bb->first_ins; ins != NULL; ins = ins->next) { int opcode = ins->opcode; // Skip dummy opcodes for more precise offset computation - if (MINT_IS_NOP (opcode)) + if (MINT_IS_EMIT_NOP (opcode)) continue; noe += interp_get_ins_length (ins); if (!td->optimized) @@ -8466,7 +8466,7 @@ emit_compacted_instruction (TransformData *td, guint16* start_ip, InterpInst *in g_array_append_val (td->line_numbers, lne); } - if (opcode == MINT_NOP || opcode == MINT_DEF || opcode == MINT_DEF_TIER_VAR || opcode == MINT_DEF_ARG || opcode == MINT_DUMMY_USE) + if (MINT_IS_EMIT_NOP (opcode)) return ip; *ip++ = opcode; @@ -8519,9 +8519,14 @@ emit_compacted_instruction (TransformData *td, guint16* start_ip, 
InterpInst *in ip--; } else { // If the estimate offset is short, then surely the real offset is short - gboolean is_short = interp_is_short_offset (br_offset, ins->info.target_bb->native_offset_estimate); + // otherwise we conservatively have to use long branch opcodes + int cur_estimation_error = td->cbb->native_offset_estimate - td->cbb->native_offset; + int target_bb_estimated_offset = ins->info.target_bb->native_offset_estimate - cur_estimation_error; + gboolean is_short = interp_is_short_offset (br_offset, target_bb_estimated_offset); if (is_short) *start_ip = GINT_TO_OPCODE (get_short_brop (opcode)); + else + g_assert (!MINT_IS_SUPER_BRANCH (opcode)); // FIXME missing handling for long branch // We don't know the in_offset of the target, add a reloc Reloc *reloc = (Reloc*)mono_mempool_alloc0 (td->mempool, sizeof (Reloc)); @@ -8752,6 +8757,7 @@ generate_compacted_code (InterpMethod *rtm, TransformData *td) for (bb = td->entry_bb; bb != NULL; bb = bb->next_bb) { InterpInst *ins = bb->first_ins; bb->native_offset = GPTRDIFF_TO_INT (ip - td->new_code); + g_assert (bb->native_offset <= bb->native_offset_estimate); td->cbb = bb; #if HOST_BROWSER From 10851ef6704670ad6499c5fb4108f9712fd9b51f Mon Sep 17 00:00:00 2001 From: Vlad Brezae Date: Sat, 23 Dec 2023 18:45:09 +0200 Subject: [PATCH 34/45] [mono][interp] Fix live limit bblocks computation for fixed ssa vars Consider the following code pattern: BB1: x0 = v1 () y = x0 condbr BB3 BB2: x1 = v2 () BB3: print y; Assume that there is some other use of x var such that it has to be a ssa fixed var. BB1 dominates both BB2 and BB3. BB3 is part of the dfrontier of BB2, but since x is not live in BB3 and we generate pruned ssa, there will be no PHI node added in BB3. In BB2, we will have a liveness limit for var x0 up until the point where x1 is set. When renaming vars in BB3, we have x0 as the current renamed var for x. 
Since x is not redefined in BB3, we were naively considering that the value of x0, which was currently on the ssa stack, can be readily used. This would lead to the propagation of value x0 into BB3 which is obviously incorrect, since if we enter from BB2 the value of x0 is overwritten by the x1 value. This problem is resolved by introducing a new type of phi opcode. While in BB3 we weren't inserting any phi node because x is not live, we now do insert a dead_phi. This is a lighter weight phi node, which is only used for the computation of liveness limit. When renaming vars in BB3, we detect a dead_phi for var x, which basically means that if we were to propagate the use of x in this bblock, then we would actually need to insert a real phi node. Therefore no propagation of value x0 is allowed in BB3. Unlike normal phi nodes, dead_phi nodes don't force a var to be ssa fixed. In the above example, if var x is not actually ssa fixed, even though we insert a dead_phi node, we will still end up with two separated x0 and x1 vars and we will be able to propagate var x0 directly into BB3. 
--- src/mono/mono/mini/interp/mintops.def | 1 + src/mono/mono/mini/interp/transform-opt.c | 84 ++++++++++++++++------- 2 files changed, 61 insertions(+), 24 deletions(-) diff --git a/src/mono/mono/mini/interp/mintops.def b/src/mono/mono/mini/interp/mintops.def index 41da680ee47ba8..d45579ce40fe39 100644 --- a/src/mono/mono/mini/interp/mintops.def +++ b/src/mono/mono/mini/interp/mintops.def @@ -843,6 +843,7 @@ IROPDEF(MINT_TIER_PATCHPOINT_DATA, "tier_patchpoint_data", 2, 0, 0, MintOpShortI IROPDEF(MINT_MOV_SRC_OFF, "mov.src.off", 6, 1, 1, MintOpTwoShorts) IROPDEF(MINT_MOV_DST_OFF, "mov.dst.off", 8, 1, 2, MintOpTwoShorts) IROPDEF(MINT_PHI, "phi", 2, 1, 0, MintOpNoArgs) +IROPDEF(MINT_DEAD_PHI, "dead_phi", 1, 0, 0, MintOpNoArgs) IROPDEF(MINT_INTRINS_MARVIN_BLOCK_SSA1, "intrins_marvin_block_ssa1", 4, 1, 2, MintOpNoArgs) IROPDEF(MINT_INTRINS_MARVIN_BLOCK_SSA2, "intrins_marvin_block_ssa2", 4, 1, 2, MintOpNoArgs) diff --git a/src/mono/mono/mini/interp/transform-opt.c b/src/mono/mono/mini/interp/transform-opt.c index d81c3c12304ac9..626809b078b07d 100644 --- a/src/mono/mono/mini/interp/transform-opt.c +++ b/src/mono/mono/mini/interp/transform-opt.c @@ -1039,7 +1039,7 @@ bb_has_phi (InterpBasicBlock *bb, int var) { InterpInst *ins = bb->first_ins; while (ins) { - if (ins->opcode == MINT_PHI) { + if (ins->opcode == MINT_PHI || ins->opcode == MINT_DEAD_PHI) { if (ins->dreg == var) return TRUE; } else { @@ -1089,9 +1089,19 @@ insert_phi_nodes (TransformData *td) mono_bitset_foreach_bit (bb->dfrontier, j, td->bb_count) { InterpBasicBlock *bd = td->bblocks [j]; g_assert (is_bblock_ssa_cfg (td, bb)); - if (!bb_has_phi (bd, var) && mono_bitset_test_fast (bd->live_in_set, i)) { - td->renamable_vars [i].ssa_fixed = TRUE; - bb_insert_phi (td, bd, var); + if (!bb_has_phi (bd, var)) { + if (mono_bitset_test_fast (bd->live_in_set, i)) { + td->renamable_vars [i].ssa_fixed = TRUE; + bb_insert_phi (td, bd, var); + } else { + // We need this only for vars that are ssa fixed, but it is not 
clear + // if the current var is fixed or not. We will ignore these opcodes if + // the var is not actually ssa fixed. + InterpInst *phi = interp_insert_ins_bb (td, bd, NULL, MINT_DEAD_PHI); + if (td->verbose_level) + g_print ("BB%d NEW_DEAD_PHI %d, def from BB%d\n", bd->index, var, bb->index); + phi->dreg = var; + } if (!g_slist_find (workset, bd)) workset = g_slist_prepend (workset, bd); } @@ -1180,8 +1190,11 @@ rename_ins_var_cb (TransformData *td, int *pvar, gpointer data) { int var = *pvar; int ext_index = td->vars [var].ext_index; - if (ext_index != -1) - *pvar = (int)(gsize)td->renamable_vars [ext_index].ssa_stack->data; + if (ext_index != -1) { + int renamed_var = (int)(gsize)td->renamable_vars [ext_index].ssa_stack->data; + g_assert (renamed_var != -1); + *pvar = renamed_var; + } } static void @@ -1201,7 +1214,9 @@ rename_phi_args_in_out_bbs (TransformData *td, InterpBasicBlock *bb) int ext_index = td->vars [var].ext_index; GSList *stack = td->renamable_vars [ext_index].ssa_stack; ins->info.args [aindex] = (int)(gsize)stack->data; - } else { + } else if (ins->opcode == MINT_DEAD_PHI) { + continue; + } else if (ins->opcode != MINT_NOP) { break; } } @@ -1215,10 +1230,21 @@ rename_vars_in_bb (TransformData *td, InterpBasicBlock *bb) // Rename vars defined with MINT_PHI for (ins = bb->first_ins; ins != NULL; ins = ins->next) { - if (ins->opcode == MINT_PHI) + if (ins->opcode == MINT_PHI) { ins->dreg = get_renamed_var (td, ins->dreg, FALSE); - else + } else if (ins->opcode == MINT_DEAD_PHI) { + int ext_index = td->vars [ins->dreg].ext_index; + if (td->renamable_vars [ext_index].ssa_fixed) { + // we push an invalid var that will be just a marker for marking var live limits + td->renamable_vars [ext_index].ssa_stack = g_slist_prepend (td->renamable_vars [ext_index].ssa_stack, (gpointer)(gsize)-1); + } else { + if (td->verbose_level) + g_print ("BB%d CLEAR_DEAD_PHI %d\n", bb->index, ins->dreg); + interp_clear_ins (ins); + } + } else { break; + } } 
InterpLivenessPosition current_liveness; @@ -1227,7 +1253,7 @@ rename_vars_in_bb (TransformData *td, InterpBasicBlock *bb) // Use renamed definition for sources for (; ins != NULL; ins = ins->next) { - if (interp_ins_is_nop (ins)) + if (interp_ins_is_nop (ins) || ins->opcode == MINT_DEAD_PHI) continue; ins->flags |= INTERP_INST_FLAG_LIVENESS_MARKER; current_liveness.ins_index++; @@ -1245,11 +1271,13 @@ rename_vars_in_bb (TransformData *td, InterpBasicBlock *bb) td->renamable_vars [renamable_ext_index].ssa_stack) { // Mark the exact liveness end limit for the ssa fixed var that is overwritten (the old entry on the stack) int renamed_var = (int)(gsize)td->renamable_vars [renamable_ext_index].ssa_stack->data; - g_assert (td->vars [renamed_var].renamed_ssa_fixed); - int renamed_var_ext = td->vars [renamed_var].ext_index; - InterpLivenessPosition *liveness_ptr = (InterpLivenessPosition*)mono_mempool_alloc (td->mempool, sizeof (InterpLivenessPosition)); - *liveness_ptr = current_liveness; - td->renamed_fixed_vars [renamed_var_ext].live_limit_bblocks = g_slist_prepend (td->renamed_fixed_vars [renamed_var_ext].live_limit_bblocks, liveness_ptr); + if (renamed_var != -1) { + g_assert (td->vars [renamed_var].renamed_ssa_fixed); + int renamed_var_ext = td->vars [renamed_var].ext_index; + InterpLivenessPosition *liveness_ptr = (InterpLivenessPosition*)mono_mempool_alloc (td->mempool, sizeof (InterpLivenessPosition)); + *liveness_ptr = current_liveness; + td->renamed_fixed_vars [renamed_var_ext].live_limit_bblocks = g_slist_prepend (td->renamed_fixed_vars [renamed_var_ext].live_limit_bblocks, liveness_ptr); + } } ins->dreg = get_renamed_var (td, ins->dreg, FALSE); } @@ -1269,26 +1297,34 @@ rename_vars_in_bb (TransformData *td, InterpBasicBlock *bb) for (unsigned int i = 0; i < td->renamable_vars_size; i++) { if (td->renamable_vars [i].ssa_fixed && td->renamable_vars [i].ssa_stack) { int renamed_var = (int)(gsize)td->renamable_vars [i].ssa_stack->data; - g_assert (td->vars 
[renamed_var].renamed_ssa_fixed); - int renamed_var_ext = td->vars [renamed_var].ext_index; - if (!td->renamed_fixed_vars [renamed_var_ext].live_out_bblocks) { - gpointer mem = mono_mempool_alloc0 (td->mempool, mono_bitset_alloc_size (td->bb_count, 0)); - td->renamed_fixed_vars [renamed_var_ext].live_out_bblocks = mono_bitset_mem_new (mem, td->bb_count, 0); - } + if (renamed_var != -1) { + g_assert (td->vars [renamed_var].renamed_ssa_fixed); + int renamed_var_ext = td->vars [renamed_var].ext_index; + if (!td->renamed_fixed_vars [renamed_var_ext].live_out_bblocks) { + gpointer mem = mono_mempool_alloc0 (td->mempool, mono_bitset_alloc_size (td->bb_count, 0)); + td->renamed_fixed_vars [renamed_var_ext].live_out_bblocks = mono_bitset_mem_new (mem, td->bb_count, 0); + } - mono_bitset_set_fast (td->renamed_fixed_vars [renamed_var_ext].live_out_bblocks, bb->index); + mono_bitset_set_fast (td->renamed_fixed_vars [renamed_var_ext].live_out_bblocks, bb->index); + } } } // Pop from the stack any new vars defined in this bblock for (ins = bb->first_ins; ins != NULL; ins = ins->next) { + int ext_index = -1; if (mono_interp_op_dregs [ins->opcode]) { - int ext_index = td->vars [ins->dreg].ext_index; + ext_index = td->vars [ins->dreg].ext_index; if (ext_index == -1) continue; if (td->vars [ins->dreg].renamed_ssa_fixed) ext_index = td->renamed_fixed_vars [ext_index].renamable_var_ext_index; + } else if (ins->opcode == MINT_DEAD_PHI) { + ext_index = td->vars [ins->dreg].ext_index; + interp_clear_ins (ins); + } + if (ext_index != -1) { GSList *prev_head = td->renamable_vars [ext_index].ssa_stack; td->renamable_vars [ext_index].ssa_stack = prev_head->next; g_free (prev_head); @@ -1383,7 +1419,7 @@ interp_exit_ssa (TransformData *td) for (InterpBasicBlock *bb = td->entry_bb; bb != NULL; bb = bb->next_bb) { InterpInst *ins; for (ins = bb->first_ins; ins != NULL; ins = ins->next) { - if (ins->opcode == MINT_PHI) + if (ins->opcode == MINT_PHI || ins->opcode == MINT_DEAD_PHI) ins->opcode = 
MINT_NOP; else interp_foreach_ins_var (td, ins, NULL, revert_ssa_rename_cb); From 0ab313b0c5cd628cd6b0e52b58d3eaef187a68c7 Mon Sep 17 00:00:00 2001 From: Vlad Brezae Date: Wed, 27 Dec 2023 12:55:19 +0200 Subject: [PATCH 35/45] [mono][interp] Reduce number of renamable vars Renamable vars are vars that have multiple definitions. SSA global vars are renamable vars that are used in multiple bblocks, that require additional computations (for inserting phi for example). Before this commit we were computing both renamable vars and ssa globals in one code iteration. We were naively marking vars as renamable if the use bblock was different from the define bblock. We now do it only if there are actually multiple definitions of the var. This reduces total number of renamable vars to about half, significantly improving compilation speed for massive methods. In order to mark ssa global vars, we need to do another simple iteration. On SRT suite about 85% of renamable vars are ssa global. It is not obvious how useful it is to have special handling for them, but computing them is also very cheap. 
--- src/mono/mono/mini/interp/transform-opt.c | 26 +++++++++++++++-------- 1 file changed, 17 insertions(+), 9 deletions(-) diff --git a/src/mono/mono/mini/interp/transform-opt.c b/src/mono/mono/mini/interp/transform-opt.c index 626809b078b07d..e6be76d398cd2b 100644 --- a/src/mono/mono/mini/interp/transform-opt.c +++ b/src/mono/mono/mini/interp/transform-opt.c @@ -872,14 +872,12 @@ compute_global_var_cb (TransformData *td, int *pvar, gpointer data) int var = *pvar; InterpBasicBlock *bb = (InterpBasicBlock*)data; InterpVar *var_data = &td->vars [var]; - if (!var_is_ssa_form (td, var)) + if (!var_is_ssa_form (td, var) || td->vars [var].ext_index == -1) return; // If var is used in another block than the one that it is declared then mark it as global - if (var_data->declare_bbs) { - if (var_data->declare_bbs->data != bb || var_data->declare_bbs->next) { - int ext_index = interp_create_renamable_var (td, var); - td->renamable_vars [ext_index].ssa_global = TRUE; - } + if (var_data->declare_bbs && var_data->declare_bbs->data != bb) { + int ext_index = td->vars [var].ext_index; + td->renamable_vars [ext_index].ssa_global = TRUE; } } @@ -893,21 +891,31 @@ interp_compute_global_vars (TransformData *td) continue; InterpInst *ins; for (ins = bb->first_ins; ins != NULL; ins = ins->next) { - interp_foreach_ins_svar (td, ins, bb, compute_global_var_cb); if (mono_interp_op_dregs [ins->opcode] && var_is_ssa_form (td, ins->dreg)) { // Save the list of bblocks where a global var is defined in InterpVar *var_data = &td->vars [ins->dreg]; if (!var_data->declare_bbs) { var_data->declare_bbs = g_slist_prepend (NULL, bb); } else { - interp_create_renamable_var (td, ins->dreg); - if (!g_slist_find (var_data->declare_bbs, bb)) + int ext_index = interp_create_renamable_var (td, ins->dreg); + if (!g_slist_find (var_data->declare_bbs, bb)) { + // Var defined in multiple bblocks, it is ssa global var_data->declare_bbs = g_slist_prepend (var_data->declare_bbs, bb); + td->renamable_vars 
[ext_index].ssa_global = TRUE; + } } } } } + for (bb = td->entry_bb; bb != NULL; bb = bb->next_bb) { + if (!is_bblock_ssa_cfg (td, bb)) + continue; + InterpInst *ins; + for (ins = bb->first_ins; ins != NULL; ins = ins->next) + interp_foreach_ins_svar (td, ins, bb, compute_global_var_cb); + } + if (td->verbose_level) { g_print ("\nSSA GLOBALS:\n"); for (unsigned int i = 0; i < td->renamable_vars_size; i++) { From 3fcf0db007e56645af9c0312c0310cafde3ced5e Mon Sep 17 00:00:00 2001 From: Vlad Brezae Date: Thu, 28 Dec 2023 12:53:51 +0200 Subject: [PATCH 36/45] [mono][interp] Remove MINT_NOPS after each round of optimizations Since it was easier and we still needed to have some instructions around (instruction with il_offset marker), instead of unlinking an instruction when it is optimized out we were just replacing it with a MINT_NOP. After a round of optimization, most of the instructions become nops, so make sure to unlink them so we don't pay the cost of iterating in future passes. --- src/mono/mono/mini/interp/transform-opt.c | 34 +++++++++++++++++++++++ 1 file changed, 34 insertions(+) diff --git a/src/mono/mono/mini/interp/transform-opt.c b/src/mono/mono/mini/interp/transform-opt.c index e6be76d398cd2b..9c9931ca71c514 100644 --- a/src/mono/mono/mini/interp/transform-opt.c +++ b/src/mono/mono/mini/interp/transform-opt.c @@ -3690,6 +3690,38 @@ interp_prepare_no_ssa_opt (TransformData *td) td->bblocks_count_eh = i; } +static void +interp_remove_ins (InterpBasicBlock *bb, InterpInst *ins) +{ + if (ins->next) + ins->next->prev = ins->prev; + else + bb->last_ins = ins->prev; + + if (ins->prev) + ins->prev->next = ins->next; + else + bb->first_ins = ins->next; +} + +static void +interp_remove_nops (TransformData *td) +{ + InterpBasicBlock *bb; + for (bb = td->entry_bb; bb != NULL; bb = bb->next_bb) { + InterpInst *ins; + for (ins = bb->first_ins; ins != NULL; ins = ins->next) { + if (ins->opcode == MINT_NOP && ins->prev && + (ins->il_offset == -1 || + ins->prev->il_offset 
== ins->il_offset)) { + // This is a NOP instruction that has no relevant il_offset, actually remove it + interp_remove_ins (bb, ins); + } + + } + } +} + void interp_optimize_code (TransformData *td) { @@ -3721,6 +3753,8 @@ interp_optimize_code (TransformData *td) if (!td->disable_ssa) interp_exit_ssa (td); + interp_remove_nops (td); + if (mono_interp_opt & INTERP_OPT_BBLOCKS) MONO_TIME_TRACK (mono_interp_stats.optimize_bblocks_time, interp_optimize_bblocks (td)); From 22b644480a12665860cdea076e675e792ed458bd Mon Sep 17 00:00:00 2001 From: Vlad Brezae Date: Thu, 28 Dec 2023 16:13:53 +0200 Subject: [PATCH 37/45] [mono][interp] Optimize generation of MINT_DEAD_PHI In complex methods we tend to generate massive amounts of these opcodes, this being very slow and memory consuming. Account for this by generating a single instruction with a compact bit set instead. --- src/mono/mono/mini/interp/transform-opt.c | 73 +++++++++++++++-------- src/mono/mono/mini/interp/transform.h | 1 + 2 files changed, 50 insertions(+), 24 deletions(-) diff --git a/src/mono/mono/mini/interp/transform-opt.c b/src/mono/mono/mini/interp/transform-opt.c index 9c9931ca71c514..949e5621d263d8 100644 --- a/src/mono/mono/mini/interp/transform-opt.c +++ b/src/mono/mono/mini/interp/transform-opt.c @@ -1043,13 +1043,18 @@ interp_compute_pruned_ssa_liveness (TransformData *td) } static gboolean -bb_has_phi (InterpBasicBlock *bb, int var) +bb_has_phi (TransformData *td, InterpBasicBlock *bb, int var) { InterpInst *ins = bb->first_ins; while (ins) { - if (ins->opcode == MINT_PHI || ins->opcode == MINT_DEAD_PHI) { + if (ins->opcode == MINT_PHI) { if (ins->dreg == var) return TRUE; + } else if (ins->opcode == MINT_DEAD_PHI) { + MonoBitSet *bitset = ins->info.dead_phi_vars; + int ext_index = td->vars [var].ext_index; + if (mono_bitset_test_fast (bitset, ext_index)) + return TRUE; } else { // if we have a phi it is at the start of the bb return FALSE; @@ -1062,7 +1067,11 @@ bb_has_phi (InterpBasicBlock *bb, int 
var) static void bb_insert_phi (TransformData *td, InterpBasicBlock *bb, int var) { - InterpInst *phi = interp_insert_ins_bb (td, bb, NULL, MINT_PHI); + InterpInst *first_ins = NULL; + // We keep dead phi as first instruction so we can find it quickly + if (bb->first_ins && bb->first_ins->opcode == MINT_DEAD_PHI) + first_ins = bb->first_ins; + InterpInst *phi = interp_insert_ins_bb (td, bb, first_ins, MINT_PHI); if (td->verbose_level) g_print ("BB%d NEW_PHI %d\n", bb->index, var); @@ -1074,6 +1083,24 @@ bb_insert_phi (TransformData *td, InterpBasicBlock *bb, int var) phi->info.args [i] = -1; } +static void +bb_insert_dead_phi (TransformData *td, InterpBasicBlock *bb, int var) +{ + MonoBitSet *bitset; + if (bb->first_ins && bb->first_ins->opcode == MINT_DEAD_PHI) { + bitset = bb->first_ins->info.dead_phi_vars; + } else { + InterpInst *phi = interp_insert_ins_bb (td, bb, NULL, MINT_DEAD_PHI); + gpointer mem = mono_mempool_alloc0 (td->mempool, mono_bitset_alloc_size (td->renamable_vars_size, 0)); + phi->info.dead_phi_vars = bitset = mono_bitset_mem_new (mem, td->renamable_vars_size, 0); + } + int ext_index = td->vars [var].ext_index; + mono_bitset_set_fast (bitset, ext_index); + if (td->verbose_level) + g_print ("BB%d NEW_DEAD_PHI %d\n", bb->index, var); + +} + static void insert_phi_nodes (TransformData *td) { @@ -1097,7 +1124,7 @@ insert_phi_nodes (TransformData *td) mono_bitset_foreach_bit (bb->dfrontier, j, td->bb_count) { InterpBasicBlock *bd = td->bblocks [j]; g_assert (is_bblock_ssa_cfg (td, bb)); - if (!bb_has_phi (bd, var)) { + if (!bb_has_phi (td, bd, var)) { if (mono_bitset_test_fast (bd->live_in_set, i)) { td->renamable_vars [i].ssa_fixed = TRUE; bb_insert_phi (td, bd, var); @@ -1105,10 +1132,7 @@ insert_phi_nodes (TransformData *td) // We need this only for vars that are ssa fixed, but it is not clear // if the current var is fixed or not. We will ignore these opcodes if // the var is not actually ssa fixed. 
- InterpInst *phi = interp_insert_ins_bb (td, bd, NULL, MINT_DEAD_PHI); - if (td->verbose_level) - g_print ("BB%d NEW_DEAD_PHI %d, def from BB%d\n", bd->index, var, bb->index); - phi->dreg = var; + bb_insert_dead_phi (td, bd, var); } if (!g_slist_find (workset, bd)) workset = g_slist_prepend (workset, bd); @@ -1241,14 +1265,12 @@ rename_vars_in_bb (TransformData *td, InterpBasicBlock *bb) if (ins->opcode == MINT_PHI) { ins->dreg = get_renamed_var (td, ins->dreg, FALSE); } else if (ins->opcode == MINT_DEAD_PHI) { - int ext_index = td->vars [ins->dreg].ext_index; - if (td->renamable_vars [ext_index].ssa_fixed) { - // we push an invalid var that will be just a marker for marking var live limits - td->renamable_vars [ext_index].ssa_stack = g_slist_prepend (td->renamable_vars [ext_index].ssa_stack, (gpointer)(gsize)-1); - } else { - if (td->verbose_level) - g_print ("BB%d CLEAR_DEAD_PHI %d\n", bb->index, ins->dreg); - interp_clear_ins (ins); + unsigned int ext_index; + mono_bitset_foreach_bit (ins->info.dead_phi_vars, ext_index, td->renamable_vars_size) { + if (td->renamable_vars [ext_index].ssa_fixed) { + // we push an invalid var that will be just a marker for marking var live limits + td->renamable_vars [ext_index].ssa_stack = g_slist_prepend (td->renamable_vars [ext_index].ssa_stack, (gpointer)(gsize)-1); + } } } else { break; @@ -1320,22 +1342,25 @@ rename_vars_in_bb (TransformData *td, InterpBasicBlock *bb) // Pop from the stack any new vars defined in this bblock for (ins = bb->first_ins; ins != NULL; ins = ins->next) { - int ext_index = -1; if (mono_interp_op_dregs [ins->opcode]) { - ext_index = td->vars [ins->dreg].ext_index; + int ext_index = td->vars [ins->dreg].ext_index; if (ext_index == -1) continue; if (td->vars [ins->dreg].renamed_ssa_fixed) ext_index = td->renamed_fixed_vars [ext_index].renamable_var_ext_index; - } else if (ins->opcode == MINT_DEAD_PHI) { - ext_index = td->vars [ins->dreg].ext_index; - interp_clear_ins (ins); - } - - if (ext_index != 
-1) { GSList *prev_head = td->renamable_vars [ext_index].ssa_stack; td->renamable_vars [ext_index].ssa_stack = prev_head->next; g_free (prev_head); + } else if (ins->opcode == MINT_DEAD_PHI) { + unsigned int ext_index; + mono_bitset_foreach_bit (ins->info.dead_phi_vars, ext_index, td->renamable_vars_size) { + if (td->renamable_vars [ext_index].ssa_fixed) { + GSList *prev_head = td->renamable_vars [ext_index].ssa_stack; + td->renamable_vars [ext_index].ssa_stack = prev_head->next; + g_free (prev_head); + } + } + interp_clear_ins (ins); } } } diff --git a/src/mono/mono/mini/interp/transform.h b/src/mono/mono/mini/interp/transform.h index bf7d982d935d0a..a49c53a7f6f2bb 100644 --- a/src/mono/mono/mini/interp/transform.h +++ b/src/mono/mono/mini/interp/transform.h @@ -88,6 +88,7 @@ struct _InterpInst { InterpBasicBlock **target_bb_table; InterpCallInfo *call_info; int *args; // for variable number of args, used only for phi + MonoBitSet *dead_phi_vars; // only for MINT_DEAD_PHI } info; // Variable data immediately following the dreg/sreg information. This is represented exactly // in the final code stream as in this array. From fcb91d090118aea0d219e918be89c6f71fa81683 Mon Sep 17 00:00:00 2001 From: Vlad Brezae Date: Thu, 28 Dec 2023 16:38:01 +0200 Subject: [PATCH 38/45] [mono][interp] Disable SSA for first optimization iteration of huge methods SSA transformations are very expensive if we have many bblocks / vars. Run optimizations first with ssa disabled, so when we run SSA transformation, the code is already significantly simplified. 
--- src/mono/mono/mini/interp/transform-opt.c | 22 ++++++++++++++++++++-- 1 file changed, 20 insertions(+), 2 deletions(-) diff --git a/src/mono/mono/mini/interp/transform-opt.c b/src/mono/mono/mini/interp/transform-opt.c index 949e5621d263d8..a832a058c8db1e 100644 --- a/src/mono/mono/mini/interp/transform-opt.c +++ b/src/mono/mono/mini/interp/transform-opt.c @@ -3756,6 +3756,18 @@ interp_optimize_code (TransformData *td) if (!(mono_interp_opt & INTERP_OPT_SSA)) td->disable_ssa = TRUE; + gboolean ssa_enabled_retry = FALSE; + + if (!td->disable_ssa && td->bb_count > 1000) { + // We have ssa enabled but we are compiling a huge method. Do the first iteration + // in ssa disabled mode. This should greatly simplify the CFG and the code, so the + // following iteration with SSA transformation enabled is much faster. In general, + // for huge methods we end up doing multiple optimization iterations anyway. + ssa_enabled_retry = TRUE; + td->disable_ssa = TRUE; + if (td->verbose_level) + g_print ("Huge method. 
SSA disabled for first iteration\n"); + } optimization_retry: td->need_optimization_retry = FALSE; @@ -3783,9 +3795,15 @@ interp_optimize_code (TransformData *td) if (mono_interp_opt & INTERP_OPT_BBLOCKS) MONO_TIME_TRACK (mono_interp_stats.optimize_bblocks_time, interp_optimize_bblocks (td)); - if (td->need_optimization_retry) { + if (ssa_enabled_retry) { + ssa_enabled_retry = FALSE; + td->disable_ssa = FALSE; + if (td->verbose_level) + g_print ("Retry optimization with SSA enabled\n"); + goto optimization_retry; + } else if (td->need_optimization_retry) { if (td->verbose_level) - g_print ("Retry method %s\n", mono_method_full_name (td->method, 1)); + g_print ("Retry optimization\n"); goto optimization_retry; } From f4629663baa4ae51cb01069a4a38a84369c2a4bc Mon Sep 17 00:00:00 2001 From: Vlad Brezae Date: Tue, 9 Jan 2024 18:17:28 +0200 Subject: [PATCH 39/45] [mono][interp] Fix adding of super instructions Super-instructions combine two instructions into one, if the first instruction has a ref-count of 1, generating a single instruction that does more operations. The problem is that the source vars of the first instruction have their liveness extended to the point of the second instruction. This can be incorrect if, for example, one of the vars is redefined between the 2 instructions. Problems can also arise if the vars are ssa-fixed, in which case we can't freely extend the liveness. We now do the same liveness extension checks as in the cprop scenario.
--- src/mono/mono/mini/interp/transform-opt.c | 142 ++++++++++++++-------- 1 file changed, 93 insertions(+), 49 deletions(-) diff --git a/src/mono/mono/mini/interp/transform-opt.c b/src/mono/mono/mini/interp/transform-opt.c index a832a058c8db1e..f67aafb7dd521a 100644 --- a/src/mono/mono/mini/interp/transform-opt.c +++ b/src/mono/mono/mini/interp/transform-opt.c @@ -2491,7 +2491,7 @@ interp_fold_simd_create (TransformData *td, InterpBasicBlock *cbb, InterpInst *i } static gboolean -can_extend_var_liveness (TransformData *td, int var, InterpLivenessPosition cur_liveness) +can_extend_ssa_var_liveness (TransformData *td, int var, InterpLivenessPosition cur_liveness) { if (!td->vars [var].renamed_ssa_fixed) return TRUE; @@ -2518,6 +2518,41 @@ can_extend_var_liveness (TransformData *td, int var, InterpLivenessPosition cur_ return FALSE; } +// We are attempting to extend liveness of var to cur_liveness (propagate its use). +// We know that var was still alive at the point of original_liveness. +// cur_liveness is in td->cbb +static gboolean +can_extend_var_liveness (TransformData *td, int var, InterpLivenessPosition original_liveness, InterpLivenessPosition cur_liveness) +{ + if (var_is_ssa_form (td, var)) { + // If var is fixed ssa, we can extend liveness if it doesn't overlap with other renamed + // vars. If var is normal ssa, we can extend its liveness with no constraints + return can_extend_ssa_var_liveness (td, var, cur_liveness); + } else { + gboolean original_in_curbb = original_liveness.bb_index == td->cbb->index; + if (!original_in_curbb) { + // var is not in ssa form and we only track its value within a single bblock. + // The original liveness information is not in cbb and, by the time we get to cbb, + // its value could be different so we can't use it. 
+ return FALSE; + } else { + InterpVarValue *var_val = get_var_value (td, var); + if (!var_val) { + // We know that var is alive at original_liveness, which is in cbb, and that + // the var has not been defined yet in cbb, meaning its value was not overwritten + // and we can use it. + return TRUE; + } else { + // We know that var is alive at original_liveness, which is in cbb, and that + // the var has been redefined in cbb. We can extend its liveness to cur_liveness, + // only if it hasn't been redefined between original and cur liveness. + g_assert (var_val->liveness.bb_index == original_liveness.bb_index); + return var_val->liveness.ins_index < original_liveness.ins_index; + } + } + } +} + static void replace_svar_use (TransformData *td, int *pvar, gpointer data) { @@ -2559,44 +2594,15 @@ cprop_svar (TransformData *td, InterpInst *ins, int *pvar, InterpLivenessPositio if (td->vars [var].renamed_ssa_fixed && !td->vars [cprop_var].renamed_ssa_fixed) { // ssa fixed vars are likely to live, keep using them val->ref_count++; + } else if (can_extend_var_liveness (td, cprop_var, val->liveness, current_liveness)) { + if (td->verbose_level) + g_print ("cprop %d -> %d:\n\t", var, cprop_var); + td->var_values [cprop_var].ref_count++; + *pvar = cprop_var; + if (td->verbose_level) + interp_dump_ins (ins, td->data_items); } else { - gboolean can_cprop = FALSE; - // If var is fixed ssa, we can extend liveness if it doesn't overlap with other renamed - // vars. If the var is not ssa, we do cprop only within the same bblock. 
- if (var_is_ssa_form (td, cprop_var)) { - can_cprop = can_extend_var_liveness (td, cprop_var, current_liveness); - } else { - InterpVarValue *cprop_var_val = get_var_value (td, cprop_var); - gboolean var_def_in_cur_bb = val->liveness.bb_index == td->cbb->index; - if (!var_def_in_cur_bb) { - // var definition was not in current bblock so it might no longer contain - // the current value of cprop_var because cprop_var is not in ssa form and - // we don't keep track its value over multiple basic blocks - can_cprop = FALSE; - } else if (!cprop_var_val) { - // Previously in this bblock, var is recorded as having the value of cprop_var and - // cprop_var is not defined in the current bblock. This means that var will still - // contain the value of cprop_var - can_cprop = TRUE; - } else { - // Previously in this bblock, var is recorded as having the value of cprop_var and - // cprop_var is defined in the current bblock. This means that var will contain the - // value of cprop_var only if last known cprop_var redefinition was before the var definition. - g_assert (cprop_var_val->liveness.bb_index == val->liveness.bb_index); - can_cprop = cprop_var_val->liveness.ins_index < val->liveness.ins_index; - } - } - - if (can_cprop) { - if (td->verbose_level) - g_print ("cprop %d -> %d:\n\t", var, cprop_var); - td->var_values [cprop_var].ref_count++; - *pvar = cprop_var; - if (td->verbose_level) - interp_dump_ins (ins, td->data_items); - } else { - val->ref_count++; - } + val->ref_count++; } } else { td->var_values [var].ref_count++; @@ -3232,6 +3238,39 @@ get_unop_condbr_sp (int opcode) } } +// We have the pattern of: +// +// var <- def (v1, v2, ..) +// ... +// use var +// +// We want to optimize out `var <- def` and replace `use var` with `use v1, v2, ...` in a super instruction. +// This can be done only if var is used only once (otherwise `var <- def` will remain alive and in the +// superinstruction we duplicate the calculation of var) and v1, v2, .. 
can have their liveness extended +// to the current liveness +static gboolean +can_propagate_var_def (TransformData *td, int var, InterpLivenessPosition cur_liveness) +{ + InterpVarValue *val = get_var_value (td, var); + if (!val) + return FALSE; + if (val->ref_count != 1) + return FALSE; + + InterpInst *def = val->def; + int num_sregs = mono_interp_op_sregs [def->opcode]; + + for (int i = 0; i < num_sregs; i++) { + int svar = def->sregs [i]; + if (svar == MINT_CALL_ARGS_SREG) + return FALSE; // We don't care for these in super instructions + + if (!can_extend_var_liveness (td, svar, val->liveness, cur_liveness)) + return FALSE; + } + return TRUE; +} + static void interp_super_instructions (TransformData *td) { @@ -3244,8 +3283,13 @@ interp_super_instructions (TransformData *td) // Set cbb since we do some instruction inserting below td->cbb = bb; int noe = bb->native_offset_estimate; + InterpLivenessPosition current_liveness; + current_liveness.bb_index = bb->index; + current_liveness.ins_index = 0; for (InterpInst *ins = bb->first_ins; ins != NULL; ins = ins->next) { int opcode = ins->opcode; + if (bb->dfs_index >= td->bblocks_count || bb->dfs_index == -1 || (ins->flags & INTERP_INST_FLAG_LIVENESS_MARKER)) + current_liveness.ins_index++; if (MINT_IS_NOP (opcode)) continue; @@ -3253,7 +3297,7 @@ interp_super_instructions (TransformData *td) InterpVarValue *dval = &td->var_values [ins->dreg]; dval->type = VAR_VALUE_NONE; dval->def = ins; - dval->liveness.bb_index = bb->index; // only to check if defined in current bblock + dval->liveness = current_liveness; } if (opcode == MINT_RET || (opcode >= MINT_RET_I1 && opcode <= MINT_RET_U2)) { // ldc + ret -> ret.imm @@ -3327,8 +3371,8 @@ interp_super_instructions (TransformData *td) } } else if (opcode == MINT_MUL_I4_IMM || opcode == MINT_MUL_I8_IMM) { int sreg = ins->sregs [0]; - InterpInst *def = get_var_value_def (td, sreg); - if (def != NULL && td->var_values [sreg].ref_count == 1) { + if (can_propagate_var_def (td, 
sreg, current_liveness)) { + InterpInst *def = get_var_value_def (td, sreg); gboolean is_i4 = opcode == MINT_MUL_I4_IMM; if ((is_i4 && def->opcode == MINT_ADD_I4_IMM) || (!is_i4 && def->opcode == MINT_ADD_I8_IMM)) { @@ -3435,8 +3479,8 @@ interp_super_instructions (TransformData *td) } } else if (MINT_IS_LDIND_INT (opcode)) { int sreg_base = ins->sregs [0]; - InterpInst *def = get_var_value_def (td, sreg_base); - if (def != NULL && td->var_values [sreg_base].ref_count == 1) { + if (can_propagate_var_def (td, sreg_base, current_liveness)) { + InterpInst *def = get_var_value_def (td, sreg_base); InterpInst *new_inst = NULL; if (def->opcode == MINT_ADD_P) { int ldind_offset_op = MINT_LDIND_OFFSET_I1 + (opcode - MINT_LDIND_I1); @@ -3464,8 +3508,8 @@ interp_super_instructions (TransformData *td) } } else if (MINT_IS_LDIND_OFFSET (opcode)) { int sreg_off = ins->sregs [1]; - InterpInst *def = get_var_value_def (td, sreg_off); - if (def != NULL && td->var_values [sreg_off].ref_count == 1) { + if (can_propagate_var_def (td, sreg_off, current_liveness)) { + InterpInst *def = get_var_value_def (td, sreg_off); if (def->opcode == MINT_MUL_P_IMM || def->opcode == MINT_ADD_P_IMM || def->opcode == MINT_ADD_MUL_P_IMM) { int ldind_offset_op = MINT_LDIND_OFFSET_ADD_MUL_IMM_I1 + (opcode - MINT_LDIND_OFFSET_I1); InterpInst *new_inst = interp_insert_ins (td, ins, ldind_offset_op); @@ -3501,8 +3545,8 @@ interp_super_instructions (TransformData *td) } } else if (MINT_IS_STIND_INT (opcode)) { int sreg_base = ins->sregs [0]; - InterpInst *def = get_var_value_def (td, sreg_base); - if (def != NULL && td->var_values [sreg_base].ref_count == 1) { + if (can_propagate_var_def (td, sreg_base, current_liveness)) { + InterpInst *def = get_var_value_def (td, sreg_base); InterpInst *new_inst = NULL; if (def->opcode == MINT_ADD_P) { int stind_offset_op = MINT_STIND_OFFSET_I1 + (opcode - MINT_STIND_I1); @@ -3584,8 +3628,8 @@ interp_super_instructions (TransformData *td) if (opcode == MINT_BRFALSE_I4 || 
opcode == MINT_BRTRUE_I4) { gboolean negate = opcode == MINT_BRFALSE_I4; int cond_sreg = ins->sregs [0]; - InterpInst *def = get_var_value_def (td, cond_sreg); - if (def != NULL && td->var_values [cond_sreg].ref_count == 1) { + if (can_propagate_var_def (td, cond_sreg, current_liveness)) { + InterpInst *def = get_var_value_def (td, cond_sreg); int replace_opcode = -1; switch (def->opcode) { case MINT_CEQ_I4: replace_opcode = negate ? MINT_BNE_UN_I4 : MINT_BEQ_I4; break; From 4f776d362901fbf7544a41b1cf50cee685bb7dd0 Mon Sep 17 00:00:00 2001 From: Vlad Brezae Date: Tue, 9 Jan 2024 20:26:15 +0200 Subject: [PATCH 40/45] [mono][interp] Refactor dfs traversal to be iterative This doesn't really generate the same ordering, but we preserve the rule that the root bblock is marked before all successors, unless back edges are involved. --- src/mono/mono/mini/interp/transform-opt.c | 44 +++++++++++++++-------- 1 file changed, 30 insertions(+), 14 deletions(-) diff --git a/src/mono/mono/mini/interp/transform-opt.c b/src/mono/mono/mini/interp/transform-opt.c index f67aafb7dd521a..e1f162468c8bbd 100644 --- a/src/mono/mono/mini/interp/transform-opt.c +++ b/src/mono/mono/mini/interp/transform-opt.c @@ -527,29 +527,45 @@ interp_get_bb_links (InterpBasicBlock *bb) return str; } -static void -dfs_visit (InterpBasicBlock *bb, int *pos, InterpBasicBlock **bb_array) +static int +dfs_visit (TransformData *td) { - int dfs_index = *pos; + int dfs_index = 0; + int next_stack_index = 0; + td->bblocks = (InterpBasicBlock**)mono_mempool_alloc0 (td->mempool, sizeof (InterpBasicBlock*) * td->bb_count); + InterpBasicBlock **stack = (InterpBasicBlock**)mono_mempool_alloc0 (td->mempool, sizeof (InterpBasicBlock*) * td->bb_count); - bb_array [dfs_index] = bb; - bb->dfs_index = dfs_index; - *pos = dfs_index + 1; - for (int i = 0; i < bb->out_count; i++) { - InterpBasicBlock *out_bb = bb->out_bb [i]; - if (out_bb->dfs_index == -1) - dfs_visit (out_bb, pos, bb_array); + g_assert 
(!td->entry_bb->in_count); + stack [next_stack_index++] = td->entry_bb; + + while (next_stack_index > 0) { + // Pop last added element + next_stack_index--; + InterpBasicBlock *bb = stack [next_stack_index]; + + // Process current bblock + td->bblocks [dfs_index] = bb; + bb->dfs_index = dfs_index++; + + // Push all nodes to process next + for (int i = 0; i < bb->out_count; i++) { + InterpBasicBlock *out_bb = bb->out_bb [i]; + if (out_bb->dfs_index == -1) { + stack [next_stack_index++] = out_bb; + // Mark node as gray so it is not pushed again + out_bb->dfs_index = -2; + } + } } + + return dfs_index; } static void interp_compute_dfs_indexes (TransformData *td) { - int dfs_index = 0; // Sort bblocks in reverse postorder - td->bblocks = (InterpBasicBlock**)mono_mempool_alloc0 (td->mempool, sizeof (InterpBasicBlock*) * td->bb_count); - g_assert (!td->entry_bb->in_count); - dfs_visit (td->entry_bb, &dfs_index, td->bblocks); + int dfs_index = dfs_visit (td); td->bblocks_count = dfs_index; // Visit also bblocks reachable from eh handlers. These bblocks are not linked From ff8ef69c8b07195b54e9d16af0d78f2591c0a7c1 Mon Sep 17 00:00:00 2001 From: Vlad Brezae Date: Wed, 10 Jan 2024 12:20:45 +0200 Subject: [PATCH 41/45] [mono][interp] Make rename vars pass non-recursive The recursive algorithm was doing preliminary operations on a bblock, then recursively processed all successor subtrees, and then doing finishing work on the bblock. We implement this iteratively by using a stack where we push each bblock twice. After we finish with the preliminary operation on the bblock, we push it again on the stack and then push for the first time each of the successors. This second push entry will only be reached once the stack is consumed, aka all successors are completely processed. 
--- src/mono/mono/mini/interp/transform-opt.c | 44 ++++++++++++++++++----- 1 file changed, 35 insertions(+), 9 deletions(-) diff --git a/src/mono/mono/mini/interp/transform-opt.c b/src/mono/mono/mini/interp/transform-opt.c index e1f162468c8bbd..0a9ab2a2db21f2 100644 --- a/src/mono/mono/mini/interp/transform-opt.c +++ b/src/mono/mono/mini/interp/transform-opt.c @@ -1272,7 +1272,7 @@ rename_phi_args_in_out_bbs (TransformData *td, InterpBasicBlock *bb) } static void -rename_vars_in_bb (TransformData *td, InterpBasicBlock *bb) +rename_vars_in_bb_start (TransformData *td, InterpBasicBlock *bb) { InterpInst *ins; @@ -1330,14 +1330,12 @@ rename_vars_in_bb (TransformData *td, InterpBasicBlock *bb) } rename_phi_args_in_out_bbs (td, bb); +} - // Rename recursively every successor of bb in the dominator tree - GSList *dominated = bb->dominated; - while (dominated) { - InterpBasicBlock *dominated_bb = (InterpBasicBlock*)dominated->data; - rename_vars_in_bb (td, dominated_bb); - dominated = dominated->next; - } +static void +rename_vars_in_bb_end (TransformData *td, InterpBasicBlock *bb) +{ + InterpInst *ins; // All vars currently on the ssa stack are live until the end of the bblock for (unsigned int i = 0; i < td->renamable_vars_size; i++) { @@ -1379,12 +1377,40 @@ rename_vars_in_bb (TransformData *td, InterpBasicBlock *bb) interp_clear_ins (ins); } } + } static void rename_vars (TransformData *td) { - rename_vars_in_bb (td, td->entry_bb); + int next_stack_index = 0; + InterpBasicBlock **stack = (InterpBasicBlock**)mono_mempool_alloc0 (td->mempool, sizeof (InterpBasicBlock*) * td->bblocks_count); + gboolean *bb_status = (gboolean*)mono_mempool_alloc0 (td->mempool, sizeof (InterpBasicBlock*) * td->bblocks_count); + + stack [next_stack_index++] = td->entry_bb; + + while (next_stack_index > 0) { + next_stack_index--; + InterpBasicBlock *bb = stack [next_stack_index]; + + if (!bb_status [bb->dfs_index]) { + rename_vars_in_bb_start (td, bb); + bb_status [bb->dfs_index] = TRUE; + 
stack [next_stack_index++] = bb; + + // Rename recursively every successor of bb in the dominator tree + GSList *dominated = bb->dominated; + while (dominated) { + InterpBasicBlock *dominated_bb = (InterpBasicBlock*)dominated->data; + g_assert (!bb_status [dominated_bb->dfs_index]); + stack [next_stack_index++] = dominated_bb; + dominated = dominated->next; + } + } else { + // We reach this entry after all the successors have been processed + rename_vars_in_bb_end (td, bb); + } + } if (td->verbose_level) { g_print ("\nFIXED SSA VARS LIVENESS LIMIT:\n"); From ee0e1480ad4864833e23cfc88c0dd9d690d3cf51 Mon Sep 17 00:00:00 2001 From: Vlad Brezae Date: Wed, 10 Jan 2024 18:49:50 +0200 Subject: [PATCH 42/45] [mono][interp] Make use of dfs_index in more places We have the global `td->bb_count` index, which is an ever increasing counter for each allocated bblock. `bb->dfs_index` and `td->bblocks_count` only take into account reachable basic blocks. A lot of SSA algorithms still used the `td->bb_count` limit, which is excessively high. Further reduce mem use of bitsets by using `dfs_index` in the liveness position. This requires setting `dfs_index` for bblocks even in the no-ssa case. As an example, in one of the massive methods in our test suite, this reduces the mempool size from 2.4GB down to 100MB.
--- src/mono/mono/mini/interp/transform-opt.c | 46 ++++++++++++----------- src/mono/mono/mini/interp/transform.c | 1 - src/mono/mono/mini/interp/transform.h | 2 +- 3 files changed, 25 insertions(+), 24 deletions(-) diff --git a/src/mono/mono/mini/interp/transform-opt.c b/src/mono/mono/mini/interp/transform-opt.c index 0a9ab2a2db21f2..a70b377b84786a 100644 --- a/src/mono/mono/mini/interp/transform-opt.c +++ b/src/mono/mono/mini/interp/transform-opt.c @@ -564,6 +564,8 @@ dfs_visit (TransformData *td) static void interp_compute_dfs_indexes (TransformData *td) { + for (InterpBasicBlock *bb = td->entry_bb; bb != NULL; bb = bb->next_bb) + bb->dfs_index = -1; // Sort bblocks in reverse postorder int dfs_index = dfs_visit (td); td->bblocks_count = dfs_index; @@ -620,7 +622,7 @@ is_bblock_ssa_cfg (TransformData *td, InterpBasicBlock *bb) static void interp_compute_dominators (TransformData *td) { - InterpBasicBlock **idoms = (InterpBasicBlock**)mono_mempool_alloc0 (td->mempool, sizeof (InterpBasicBlock*) * td->bb_count); + InterpBasicBlock **idoms = (InterpBasicBlock**)mono_mempool_alloc0 (td->mempool, sizeof (InterpBasicBlock*) * td->bblocks_count); idoms [0] = td->entry_bb; gboolean changed = TRUE; @@ -695,11 +697,11 @@ interp_compute_dominators (TransformData *td) static void interp_compute_dominance_frontier (TransformData *td) { - int bitsize = mono_bitset_alloc_size (td->bb_count, 0); - char *mem = (char *)mono_mempool_alloc0 (td->mempool, bitsize * td->bb_count); + int bitsize = mono_bitset_alloc_size (td->bblocks_count, 0); + char *mem = (char *)mono_mempool_alloc0 (td->mempool, bitsize * td->bblocks_count); for (int i = 0; i < td->bblocks_count; i++) { - td->bblocks [i]->dfrontier = mono_bitset_mem_new (mem, td->bb_count, 0); + td->bblocks [i]->dfrontier = mono_bitset_mem_new (mem, td->bblocks_count, 0); mem += bitsize; } @@ -857,7 +859,7 @@ get_var_value (TransformData *td, int var) // No ssa var, check if we have a def set for the current bblock if 
(td->var_values [var].def) { - if (td->var_values [var].liveness.bb_index == td->cbb->index) + if (td->var_values [var].liveness.bb_dfs_index == td->cbb->dfs_index) return &td->var_values [var]; } return NULL; @@ -973,7 +975,7 @@ static void compute_gen_kill_sets (TransformData *td) { int bitsize = mono_bitset_alloc_size (td->renamable_vars_size, 0); - char *mem = (char *)mono_mempool_alloc0 (td->mempool, bitsize * td->bb_count * 4); + char *mem = (char *)mono_mempool_alloc0 (td->mempool, bitsize * td->bblocks_count * 4); for (int i = 0; i < td->bblocks_count; i++) { InterpBasicBlock *bb = td->bblocks [i]; @@ -1294,7 +1296,7 @@ rename_vars_in_bb_start (TransformData *td, InterpBasicBlock *bb) } InterpLivenessPosition current_liveness; - current_liveness.bb_index = bb->index; + current_liveness.bb_dfs_index = bb->dfs_index; current_liveness.ins_index = 0; // Use renamed definition for sources @@ -1345,11 +1347,11 @@ rename_vars_in_bb_end (TransformData *td, InterpBasicBlock *bb) g_assert (td->vars [renamed_var].renamed_ssa_fixed); int renamed_var_ext = td->vars [renamed_var].ext_index; if (!td->renamed_fixed_vars [renamed_var_ext].live_out_bblocks) { - gpointer mem = mono_mempool_alloc0 (td->mempool, mono_bitset_alloc_size (td->bb_count, 0)); - td->renamed_fixed_vars [renamed_var_ext].live_out_bblocks = mono_bitset_mem_new (mem, td->bb_count, 0); + gpointer mem = mono_mempool_alloc0 (td->mempool, mono_bitset_alloc_size (td->bblocks_count, 0)); + td->renamed_fixed_vars [renamed_var_ext].live_out_bblocks = mono_bitset_mem_new (mem, td->bblocks_count, 0); } - mono_bitset_set_fast (td->renamed_fixed_vars [renamed_var_ext].live_out_bblocks, bb->index); + mono_bitset_set_fast (td->renamed_fixed_vars [renamed_var_ext].live_out_bblocks, bb->dfs_index); } } } @@ -1419,8 +1421,8 @@ rename_vars (TransformData *td) MonoBitSet *live_out_bblocks = td->renamed_fixed_vars [i].live_out_bblocks; if (live_out_bblocks) { int j; - mono_bitset_foreach_bit (live_out_bblocks, j, 
td->bb_count) { - g_print (" BB%d", j); + mono_bitset_foreach_bit (live_out_bblocks, j, td->bblocks_count) { + g_print (" BB%d", td->bblocks [j]->index); } } g_print (" }\n"); @@ -1428,7 +1430,7 @@ rename_vars (TransformData *td) GSList *live_limit_bblocks = td->renamed_fixed_vars [i].live_limit_bblocks; while (live_limit_bblocks) { InterpLivenessPosition *live_limit = (InterpLivenessPosition*)live_limit_bblocks->data; - g_print (" (BB%d, %d)", live_limit->bb_index, live_limit->ins_index); + g_print (" (BB%d, %d)", td->bblocks [live_limit->bb_dfs_index]->index, live_limit->ins_index); live_limit_bblocks = live_limit_bblocks->next; } g_print (" }\n"); @@ -1517,7 +1519,6 @@ interp_exit_ssa (TransformData *td) g_slist_free (bb->dominated); bb->dominated = NULL; } - bb->dfs_index = -1; bb->gen_set = NULL; bb->kill_set = NULL; bb->live_in_set = NULL; @@ -2541,13 +2542,13 @@ can_extend_ssa_var_liveness (TransformData *td, int var, InterpLivenessPosition InterpRenamedFixedVar *fixed_var_ext = &td->renamed_fixed_vars [td->vars [var].ext_index]; // If var was already live at the end of this bblocks, there is no liveness extension happening - if (fixed_var_ext->live_out_bblocks && mono_bitset_test_fast (fixed_var_ext->live_out_bblocks, cur_liveness.bb_index)) + if (fixed_var_ext->live_out_bblocks && mono_bitset_test_fast (fixed_var_ext->live_out_bblocks, cur_liveness.bb_dfs_index)) return TRUE; GSList *bb_liveness = fixed_var_ext->live_limit_bblocks; while (bb_liveness) { InterpLivenessPosition *liveness_limit = (InterpLivenessPosition*)bb_liveness->data; - if (cur_liveness.bb_index == liveness_limit->bb_index) { + if (cur_liveness.bb_dfs_index == liveness_limit->bb_dfs_index) { if (cur_liveness.ins_index <= liveness_limit->ins_index) return TRUE; else @@ -2571,7 +2572,7 @@ can_extend_var_liveness (TransformData *td, int var, InterpLivenessPosition orig // vars. 
If var is normal ssa, we can extend its liveness with no constraints return can_extend_ssa_var_liveness (td, var, cur_liveness); } else { - gboolean original_in_curbb = original_liveness.bb_index == td->cbb->index; + gboolean original_in_curbb = original_liveness.bb_dfs_index == td->cbb->dfs_index; if (!original_in_curbb) { // var is not in ssa form and we only track its value within a single bblock. // The original liveness information is not in cbb and, by the time we get to cbb, @@ -2588,7 +2589,7 @@ can_extend_var_liveness (TransformData *td, int var, InterpLivenessPosition orig // We know that var is alive at original_liveness, which is in cbb, and that // the var has been redefined in cbb. We can extend its liveness to cur_liveness, // only if it hasn't been redefined between original and cur liveness. - g_assert (var_val->liveness.bb_index == original_liveness.bb_index); + g_assert (var_val->liveness.bb_dfs_index == original_liveness.bb_dfs_index); return var_val->liveness.ins_index < original_liveness.ins_index; } } @@ -2671,7 +2672,7 @@ can_cprop_dreg (TransformData *td, InterpInst *mov_ins) if (!sreg_val) return FALSE; // We only apply this optimization if the definition is in the same bblock as this use - if (sreg_val->liveness.bb_index != td->cbb->index) + if (sreg_val->liveness.bb_dfs_index != td->cbb->dfs_index) return FALSE; if (td->var_values [sreg].def->opcode == MINT_DEF_ARG) return FALSE; @@ -2685,7 +2686,7 @@ can_cprop_dreg (TransformData *td, InterpInst *mov_ins) // check if dreg is a renamed ssa fixed var (likely to remain alive) if (td->vars [dreg].renamed_ssa_fixed && !td->vars [sreg].renamed_ssa_fixed) { InterpLivenessPosition last_use_liveness = td->renamable_vars [td->renamed_fixed_vars [td->vars [dreg].ext_index].renamable_var_ext_index].last_use_liveness; - if (last_use_liveness.bb_index != td->cbb->index || + if (last_use_liveness.bb_dfs_index != td->cbb->dfs_index || sreg_val->liveness.ins_index >= last_use_liveness.ins_index) { // No 
other conflicting renamed fixed vars (of dreg) are used in this bblock, or their // last use predates the definition. This means we can tweak def of sreg to store directly @@ -2728,7 +2729,7 @@ interp_cprop (TransformData *td) } InterpLivenessPosition current_liveness; - current_liveness.bb_index = bb->index; + current_liveness.bb_dfs_index = bb->dfs_index; current_liveness.ins_index = 0; // Set cbb since we do some instruction inserting below td->cbb = bb; @@ -3326,7 +3327,7 @@ interp_super_instructions (TransformData *td) td->cbb = bb; int noe = bb->native_offset_estimate; InterpLivenessPosition current_liveness; - current_liveness.bb_index = bb->index; + current_liveness.bb_dfs_index = bb->dfs_index; current_liveness.ins_index = 0; for (InterpInst *ins = bb->first_ins; ins != NULL; ins = ins->next) { int opcode = ins->opcode; @@ -3795,6 +3796,7 @@ interp_prepare_no_ssa_opt (TransformData *td) int i = 0; for (InterpBasicBlock *bb = td->entry_bb; bb != NULL; bb = bb->next_bb) { td->bblocks [i] = bb; + bb->dfs_index = i; i++; } td->bblocks_count = 0; diff --git a/src/mono/mono/mini/interp/transform.c b/src/mono/mono/mini/interp/transform.c index 28522da0228542..0cbba8a2ff680d 100644 --- a/src/mono/mono/mini/interp/transform.c +++ b/src/mono/mono/mini/interp/transform.c @@ -3881,7 +3881,6 @@ interp_alloc_bb (TransformData *td) bb->native_offset = -1; bb->stack_height = -1; bb->index = td->bb_count++; - bb->dfs_index = -1; return bb; } diff --git a/src/mono/mono/mini/interp/transform.h b/src/mono/mono/mini/interp/transform.h index a49c53a7f6f2bb..6cd3608c5020e7 100644 --- a/src/mono/mono/mini/interp/transform.h +++ b/src/mono/mono/mini/interp/transform.h @@ -47,7 +47,7 @@ typedef struct #define VAR_VALUE_COUNT 6 typedef struct { - guint32 bb_index; + guint32 bb_dfs_index; guint32 ins_index; } InterpLivenessPosition; From 0b9ad9ec254612ab315991d5e5f553a24ee6235c Mon Sep 17 00:00:00 2001 From: Vlad Brezae Date: Thu, 11 Jan 2024 17:27:37 +0200 Subject: [PATCH 43/45] 
[mono][interp] Don't run any optimizations if we have cprop disabled Before this commit, with all optimizations disabled, some optimizations (that were dependent on cprop) were still being run. --- src/mono/mono/mini/interp/transform-opt.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/src/mono/mono/mini/interp/transform-opt.c b/src/mono/mono/mini/interp/transform-opt.c index a70b377b84786a..82bbe021e5f9bf 100644 --- a/src/mono/mono/mini/interp/transform-opt.c +++ b/src/mono/mono/mini/interp/transform-opt.c @@ -3841,6 +3841,10 @@ interp_optimize_code (TransformData *td) if (mono_interp_opt & INTERP_OPT_BBLOCKS) MONO_TIME_TRACK (mono_interp_stats.optimize_bblocks_time, interp_optimize_bblocks (td)); + // Nothing to optimize if we don't have cprop enabled + if (!(mono_interp_opt & INTERP_OPT_CPROP)) + return; + if (!(mono_interp_opt & INTERP_OPT_SSA)) td->disable_ssa = TRUE; @@ -3864,15 +3868,13 @@ interp_optimize_code (TransformData *td) else MONO_TIME_TRACK (mono_interp_stats.ssa_compute_time, interp_compute_ssa (td)); - if (mono_interp_opt & INTERP_OPT_CPROP) - MONO_TIME_TRACK (mono_interp_stats.cprop_time, interp_cprop (td)); + MONO_TIME_TRACK (mono_interp_stats.cprop_time, interp_cprop (td)); interp_var_deadce (td); // We run this after var deadce to detect more single use vars. This pass will clear // unnecessary instruction on the fly so deadce is no longer needed to run. - if ((mono_interp_opt & INTERP_OPT_SUPER_INSTRUCTIONS) && - (mono_interp_opt & INTERP_OPT_CPROP)) + if (mono_interp_opt & INTERP_OPT_SUPER_INSTRUCTIONS) MONO_TIME_TRACK (mono_interp_stats.super_instructions_time, interp_super_instructions (td)); if (!td->disable_ssa) From 67b089ae6785ddaa421aa00557185d5616cf6b6e Mon Sep 17 00:00:00 2001 From: Vlad Brezae Date: Fri, 12 Jan 2024 11:13:53 +0200 Subject: [PATCH 44/45] [mono][interp] Reduce max memory usage during interp compilation Create a new mempool for each optimization iteration.
For complex methods we typically do multiple optimization iterations. For each iteration we were allocating memory from the global method mempool, which was freed at the end of method compilation. We now allocate most of the memory inside a separate mempool which is freed at the end of each iteration. Allocate some of the temporary data with malloc so we can free it earlier. As an exception we allocate also var_values with malloc. We don't allocate it to the global mempool since this table is very big and it is wasteful. We don't allocate it to the optimization mempool since it is used for better code gen outside of optimization passes, in the var offset allocator. --- src/mono/mono/mini/interp/transform-opt.c | 45 +++++++++++++++-------- src/mono/mono/mini/interp/transform.c | 6 +++ src/mono/mono/mini/interp/transform.h | 1 + 3 files changed, 37 insertions(+), 15 deletions(-) diff --git a/src/mono/mono/mini/interp/transform-opt.c b/src/mono/mono/mini/interp/transform-opt.c index 82bbe021e5f9bf..cbc76ff9571417 100644 --- a/src/mono/mono/mini/interp/transform-opt.c +++ b/src/mono/mono/mini/interp/transform-opt.c @@ -532,8 +532,8 @@ dfs_visit (TransformData *td) { int dfs_index = 0; int next_stack_index = 0; - td->bblocks = (InterpBasicBlock**)mono_mempool_alloc0 (td->mempool, sizeof (InterpBasicBlock*) * td->bb_count); - InterpBasicBlock **stack = (InterpBasicBlock**)mono_mempool_alloc0 (td->mempool, sizeof (InterpBasicBlock*) * td->bb_count); + td->bblocks = (InterpBasicBlock**)mono_mempool_alloc0 (td->opt_mempool, sizeof (InterpBasicBlock*) * td->bb_count); + InterpBasicBlock **stack = (InterpBasicBlock**)g_malloc0 (sizeof (InterpBasicBlock*) * td->bb_count); g_assert (!td->entry_bb->in_count); stack [next_stack_index++] = td->entry_bb; @@ -558,6 +558,7 @@ dfs_visit (TransformData *td) } } + g_free (stack); return dfs_index; } @@ -622,7 +623,7 @@ is_bblock_ssa_cfg (TransformData *td, InterpBasicBlock *bb) static void interp_compute_dominators (TransformData *td) { 
- InterpBasicBlock **idoms = (InterpBasicBlock**)mono_mempool_alloc0 (td->mempool, sizeof (InterpBasicBlock*) * td->bblocks_count); + InterpBasicBlock **idoms = (InterpBasicBlock**)mono_mempool_alloc0 (td->opt_mempool, sizeof (InterpBasicBlock*) * td->bblocks_count); idoms [0] = td->entry_bb; gboolean changed = TRUE; @@ -698,7 +699,7 @@ static void interp_compute_dominance_frontier (TransformData *td) { int bitsize = mono_bitset_alloc_size (td->bblocks_count, 0); - char *mem = (char *)mono_mempool_alloc0 (td->mempool, bitsize * td->bblocks_count); + char *mem = (char *)mono_mempool_alloc0 (td->opt_mempool, bitsize * td->bblocks_count); for (int i = 0; i < td->bblocks_count; i++) { td->bblocks [i]->dfrontier = mono_bitset_mem_new (mem, td->bblocks_count, 0); @@ -975,7 +976,7 @@ static void compute_gen_kill_sets (TransformData *td) { int bitsize = mono_bitset_alloc_size (td->renamable_vars_size, 0); - char *mem = (char *)mono_mempool_alloc0 (td->mempool, bitsize * td->bblocks_count * 4); + char *mem = (char *)mono_mempool_alloc0 (td->opt_mempool, bitsize * td->bblocks_count * 4); for (int i = 0; i < td->bblocks_count; i++) { InterpBasicBlock *bb = td->bblocks [i]; @@ -1094,7 +1095,7 @@ bb_insert_phi (TransformData *td, InterpBasicBlock *bb, int var) g_print ("BB%d NEW_PHI %d\n", bb->index, var); phi->dreg = var; - phi->info.args = (int*)mono_mempool_alloc (td->mempool, (bb->in_count + 1) * sizeof (int)); + phi->info.args = (int*)mono_mempool_alloc (td->opt_mempool, (bb->in_count + 1) * sizeof (int)); int i; for (i = 0; i < bb->in_count; i++) phi->info.args [i] = var; @@ -1109,7 +1110,7 @@ bb_insert_dead_phi (TransformData *td, InterpBasicBlock *bb, int var) bitset = bb->first_ins->info.dead_phi_vars; } else { InterpInst *phi = interp_insert_ins_bb (td, bb, NULL, MINT_DEAD_PHI); - gpointer mem = mono_mempool_alloc0 (td->mempool, mono_bitset_alloc_size (td->renamable_vars_size, 0)); + gpointer mem = mono_mempool_alloc0 (td->opt_mempool, mono_bitset_alloc_size 
(td->renamable_vars_size, 0)); phi->info.dead_phi_vars = bitset = mono_bitset_mem_new (mem, td->renamable_vars_size, 0); } int ext_index = td->vars [var].ext_index; @@ -1322,7 +1323,7 @@ rename_vars_in_bb_start (TransformData *td, InterpBasicBlock *bb) if (renamed_var != -1) { g_assert (td->vars [renamed_var].renamed_ssa_fixed); int renamed_var_ext = td->vars [renamed_var].ext_index; - InterpLivenessPosition *liveness_ptr = (InterpLivenessPosition*)mono_mempool_alloc (td->mempool, sizeof (InterpLivenessPosition)); + InterpLivenessPosition *liveness_ptr = (InterpLivenessPosition*)mono_mempool_alloc (td->opt_mempool, sizeof (InterpLivenessPosition)); *liveness_ptr = current_liveness; td->renamed_fixed_vars [renamed_var_ext].live_limit_bblocks = g_slist_prepend (td->renamed_fixed_vars [renamed_var_ext].live_limit_bblocks, liveness_ptr); } @@ -1347,7 +1348,7 @@ rename_vars_in_bb_end (TransformData *td, InterpBasicBlock *bb) g_assert (td->vars [renamed_var].renamed_ssa_fixed); int renamed_var_ext = td->vars [renamed_var].ext_index; if (!td->renamed_fixed_vars [renamed_var_ext].live_out_bblocks) { - gpointer mem = mono_mempool_alloc0 (td->mempool, mono_bitset_alloc_size (td->bblocks_count, 0)); + gpointer mem = mono_mempool_alloc0 (td->opt_mempool, mono_bitset_alloc_size (td->bblocks_count, 0)); td->renamed_fixed_vars [renamed_var_ext].live_out_bblocks = mono_bitset_mem_new (mem, td->bblocks_count, 0); } @@ -1386,8 +1387,8 @@ static void rename_vars (TransformData *td) { int next_stack_index = 0; - InterpBasicBlock **stack = (InterpBasicBlock**)mono_mempool_alloc0 (td->mempool, sizeof (InterpBasicBlock*) * td->bblocks_count); - gboolean *bb_status = (gboolean*)mono_mempool_alloc0 (td->mempool, sizeof (InterpBasicBlock*) * td->bblocks_count); + InterpBasicBlock **stack = (InterpBasicBlock**)g_malloc0 (sizeof (InterpBasicBlock*) * td->bblocks_count); + gboolean *bb_status = (gboolean*)g_malloc0 (sizeof (InterpBasicBlock*) * td->bblocks_count); stack [next_stack_index++] = 
td->entry_bb; @@ -1414,6 +1415,9 @@ rename_vars (TransformData *td) } } + g_free (stack); + g_free (bb_status); + if (td->verbose_level) { g_print ("\nFIXED SSA VARS LIVENESS LIMIT:\n"); for (unsigned int i = 0; i < td->renamed_fixed_vars_size; i++) { @@ -1718,7 +1722,7 @@ interp_link_bblocks (TransformData *td, InterpBasicBlock *from, InterpBasicBlock static void interp_mark_reachable_bblocks (TransformData *td) { - InterpBasicBlock **queue = mono_mempool_alloc0 (td->mempool, td->bb_count * sizeof (InterpBasicBlock*)); + InterpBasicBlock **queue = g_malloc0 (td->bb_count * sizeof (InterpBasicBlock*)); InterpBasicBlock *current; int cur_index = 0; int next_position = 0; @@ -1761,6 +1765,8 @@ interp_mark_reachable_bblocks (TransformData *td) if (needs_retry) goto retry; } + + g_free (queue); } /** @@ -2714,7 +2720,7 @@ interp_cprop (TransformData *td) // FIXME // There is no need to zero, if we pay attention to phi args vars. They // can be used before the definition. - td->var_values = (InterpVarValue*) mono_mempool_alloc0 (td->mempool, td->vars_size * sizeof (InterpVarValue)); + td->var_values = (InterpVarValue*) g_malloc0 (td->vars_size * sizeof (InterpVarValue)); // Traverse in dfs order. This guarantees that we always reach the definition first before the // use of the var. Exception is only for phi nodes, where we don't care about the definition @@ -3790,8 +3796,7 @@ interp_prepare_no_ssa_opt (TransformData *td) td->vars [i].has_indirects = (td->vars [i].indirects > 0) ? TRUE : FALSE; } - if (!td->bblocks) - td->bblocks = (InterpBasicBlock**)mono_mempool_alloc0 (td->mempool, sizeof (InterpBasicBlock*) * td->bb_count); + td->bblocks = (InterpBasicBlock**)mono_mempool_alloc0 (td->opt_mempool, sizeof (InterpBasicBlock*) * td->bb_count); int i = 0; for (InterpBasicBlock *bb = td->entry_bb; bb != NULL; bb = bb->next_bb) { @@ -3861,6 +3866,14 @@ interp_optimize_code (TransformData *td) g_print ("Huge method. 
SSA disabled for first iteration\n"); } optimization_retry: + if (td->opt_mempool != NULL) + mono_mempool_destroy (td->opt_mempool); + if (td->var_values != NULL) { + g_free (td->var_values); + td->var_values = NULL; + } + td->opt_mempool = mono_mempool_new (); + td->need_optimization_retry = FALSE; if (td->disable_ssa) @@ -3897,6 +3910,8 @@ interp_optimize_code (TransformData *td) goto optimization_retry; } + mono_mempool_destroy (td->opt_mempool); + if (td->verbose_level) { g_print ("\nOptimized IR:\n"); mono_interp_print_td_code (td); diff --git a/src/mono/mono/mini/interp/transform.c b/src/mono/mono/mini/interp/transform.c index 0cbba8a2ff680d..a49a56d3a11c0c 100644 --- a/src/mono/mono/mini/interp/transform.c +++ b/src/mono/mono/mini/interp/transform.c @@ -9010,6 +9010,12 @@ generate (MonoMethod *method, MonoMethodHeader *header, InterpMethod *rtm, MonoG if (td->optimized) { MONO_TIME_TRACK (mono_interp_stats.optimize_time, interp_optimize_code (td)); interp_alloc_offsets (td); + // Offset allocator uses computed ref counts from var values. We have to free this + // table later here. 
+ if (td->var_values != NULL) { + g_free (td->var_values); + td->var_values = NULL; + } #if HOST_BROWSER if (mono_interp_opt & INTERP_OPT_JITERPRETER) jiterp_insert_entry_points (rtm, td); diff --git a/src/mono/mono/mini/interp/transform.h b/src/mono/mono/mini/interp/transform.h index 6cd3608c5020e7..46c119ab3a34d7 100644 --- a/src/mono/mono/mini/interp/transform.h +++ b/src/mono/mono/mini/interp/transform.h @@ -328,6 +328,7 @@ typedef struct InterpBasicBlock **idoms; // immediate dominator for each bblock, index from reverse postorder dfs int bb_count; MonoMemPool *mempool; + MonoMemPool *opt_mempool; MonoMemoryManager *mem_manager; GList *basic_blocks; GPtrArray *relocs; From ad708848cea05700d8441ea044ccb3f355c80948 Mon Sep 17 00:00:00 2001 From: Vlad Brezae Date: Fri, 26 Jan 2024 12:51:18 +0200 Subject: [PATCH 45/45] PR review --- src/mono/mono/mini/interp/transform-opt.c | 52 +++++++++++------------ src/mono/mono/mini/interp/transform.c | 17 +++++--- src/mono/mono/mini/interp/transform.h | 4 +- 3 files changed, 38 insertions(+), 35 deletions(-) diff --git a/src/mono/mono/mini/interp/transform-opt.c b/src/mono/mono/mini/interp/transform-opt.c index cbc76ff9571417..9933dbcc09a7ad 100644 --- a/src/mono/mono/mini/interp/transform-opt.c +++ b/src/mono/mono/mini/interp/transform-opt.c @@ -569,7 +569,7 @@ interp_compute_dfs_indexes (TransformData *td) bb->dfs_index = -1; // Sort bblocks in reverse postorder int dfs_index = dfs_visit (td); - td->bblocks_count = dfs_index; + td->bblocks_count_no_eh = dfs_index; // Visit also bblocks reachable from eh handlers. These bblocks are not linked // to the main cfg (where we do dominator computation, ssa transformation etc) @@ -591,7 +591,7 @@ interp_compute_dfs_indexes (TransformData *td) g_print ("\nBASIC BLOCK GRAPH:\n"); for (bb = td->entry_bb; bb != NULL; bb = bb->next_bb) { GString* bb_info = interp_get_bb_links (bb); - g_print ("BB%d: DFS%s(%d), %s\n", bb->index, (bb->dfs_index >= td->bblocks_count) ? 
"_EH" : "" , bb->dfs_index, bb_info->str); + g_print ("BB%d: DFS%s(%d), %s\n", bb->index, (bb->dfs_index >= td->bblocks_count_no_eh) ? "_EH" : "" , bb->dfs_index, bb_info->str); g_string_free (bb_info, TRUE); } } @@ -615,7 +615,7 @@ is_bblock_ssa_cfg (TransformData *td, InterpBasicBlock *bb) // bblocks with uninitialized dfs_index are unreachable if (bb->dfs_index == -1) return FALSE; - if (bb->dfs_index < td->bblocks_count) + if (bb->dfs_index < td->bblocks_count_no_eh) return TRUE; return FALSE; } @@ -623,14 +623,14 @@ is_bblock_ssa_cfg (TransformData *td, InterpBasicBlock *bb) static void interp_compute_dominators (TransformData *td) { - InterpBasicBlock **idoms = (InterpBasicBlock**)mono_mempool_alloc0 (td->opt_mempool, sizeof (InterpBasicBlock*) * td->bblocks_count); + InterpBasicBlock **idoms = (InterpBasicBlock**)mono_mempool_alloc0 (td->opt_mempool, sizeof (InterpBasicBlock*) * td->bblocks_count_no_eh); idoms [0] = td->entry_bb; gboolean changed = TRUE; while (changed) { changed = FALSE; // all bblocks in reverse post order except entry - for (int i = 1; i < td->bblocks_count; i++) { + for (int i = 1; i < td->bblocks_count_no_eh; i++) { InterpBasicBlock *bb = td->bblocks [i]; InterpBasicBlock *new_idom = NULL; // pick candidate idom from first processed predecessor of it @@ -661,7 +661,7 @@ interp_compute_dominators (TransformData *td) td->idoms = idoms; // Build `dominated` bblock list for each bblock - for (int i = 1; i < td->bblocks_count; i++) { + for (int i = 1; i < td->bblocks_count_no_eh; i++) { InterpBasicBlock *bb = td->bblocks [i]; InterpBasicBlock *idom = td->idoms [i]; if (idom) @@ -698,15 +698,15 @@ interp_compute_dominators (TransformData *td) static void interp_compute_dominance_frontier (TransformData *td) { - int bitsize = mono_bitset_alloc_size (td->bblocks_count, 0); - char *mem = (char *)mono_mempool_alloc0 (td->opt_mempool, bitsize * td->bblocks_count); + int bitsize = mono_bitset_alloc_size (td->bblocks_count_no_eh, 0); + char *mem = 
(char *)mono_mempool_alloc0 (td->opt_mempool, bitsize * td->bblocks_count_no_eh); - for (int i = 0; i < td->bblocks_count; i++) { - td->bblocks [i]->dfrontier = mono_bitset_mem_new (mem, td->bblocks_count, 0); + for (int i = 0; i < td->bblocks_count_no_eh; i++) { + td->bblocks [i]->dfrontier = mono_bitset_mem_new (mem, td->bblocks_count_no_eh, 0); mem += bitsize; } - for (int i = 0; i < td->bblocks_count; i++) { + for (int i = 0; i < td->bblocks_count_no_eh; i++) { InterpBasicBlock *bb = td->bblocks [i]; if (bb->in_count > 1) { @@ -718,7 +718,7 @@ interp_compute_dominance_frontier (TransformData *td) g_assert (p->dfs_index || p == td->entry_bb); while (p != td->idoms [bb->dfs_index]) { - g_assert (bb->dfs_index < td->bblocks_count); + g_assert (bb->dfs_index < td->bblocks_count_no_eh); mono_bitset_set_fast (p->dfrontier, bb->dfs_index); p = td->idoms [p->dfs_index]; } @@ -779,7 +779,7 @@ interp_compute_eh_vars (TransformData *td) // EH bblocks are stored separately and are not reachable from the non-EF control flow // path. Any var reachable from EH bblocks will not be in SSA form. 
- for (int i = td->bblocks_count; i < td->bblocks_count_eh; i++) { + for (int i = td->bblocks_count_no_eh; i < td->bblocks_count_eh; i++) { InterpBasicBlock *bb = td->bblocks [i]; for (InterpInst *ins = bb->first_ins; ins != NULL; ins = ins->next) { if (ins->opcode == MINT_LDLOCA_S) @@ -916,7 +916,7 @@ interp_compute_global_vars (TransformData *td) if (!var_data->declare_bbs) { var_data->declare_bbs = g_slist_prepend (NULL, bb); } else { - int ext_index = interp_create_renamable_var (td, ins->dreg); + int ext_index = interp_make_var_renamable (td, ins->dreg); if (!g_slist_find (var_data->declare_bbs, bb)) { // Var defined in multiple bblocks, it is ssa global var_data->declare_bbs = g_slist_prepend (var_data->declare_bbs, bb); @@ -976,9 +976,9 @@ static void compute_gen_kill_sets (TransformData *td) { int bitsize = mono_bitset_alloc_size (td->renamable_vars_size, 0); - char *mem = (char *)mono_mempool_alloc0 (td->opt_mempool, bitsize * td->bblocks_count * 4); + char *mem = (char *)mono_mempool_alloc0 (td->opt_mempool, bitsize * td->bblocks_count_no_eh * 4); - for (int i = 0; i < td->bblocks_count; i++) { + for (int i = 0; i < td->bblocks_count_no_eh; i++) { InterpBasicBlock *bb = td->bblocks [i]; bb->gen_set = mono_bitset_mem_new (mem, td->renamable_vars_size, 0); @@ -1032,7 +1032,7 @@ interp_compute_pruned_ssa_liveness (TransformData *td) gboolean changed = TRUE; while (changed) { changed = FALSE; - for (int i = 0; i < td->bblocks_count; i++) { + for (int i = 0; i < td->bblocks_count_no_eh; i++) { InterpBasicBlock *bb = td->bblocks [i]; guint32 prev_count = mono_bitset_count (bb->live_out_set); recompute_live_out (td, bb); @@ -1165,7 +1165,7 @@ insert_phi_nodes (TransformData *td) static void insert_tiering_defs (TransformData *td) { - for (int i = 0; i < td->bblocks_count; i++) { + for (int i = 0; i < td->bblocks_count_no_eh; i++) { InterpBasicBlock *bb = td->bblocks [i]; if (!bb->patchpoint_bb) continue; @@ -1348,8 +1348,8 @@ rename_vars_in_bb_end (TransformData 
*td, InterpBasicBlock *bb) g_assert (td->vars [renamed_var].renamed_ssa_fixed); int renamed_var_ext = td->vars [renamed_var].ext_index; if (!td->renamed_fixed_vars [renamed_var_ext].live_out_bblocks) { - gpointer mem = mono_mempool_alloc0 (td->opt_mempool, mono_bitset_alloc_size (td->bblocks_count, 0)); - td->renamed_fixed_vars [renamed_var_ext].live_out_bblocks = mono_bitset_mem_new (mem, td->bblocks_count, 0); + gpointer mem = mono_mempool_alloc0 (td->opt_mempool, mono_bitset_alloc_size (td->bblocks_count_no_eh, 0)); + td->renamed_fixed_vars [renamed_var_ext].live_out_bblocks = mono_bitset_mem_new (mem, td->bblocks_count_no_eh, 0); } mono_bitset_set_fast (td->renamed_fixed_vars [renamed_var_ext].live_out_bblocks, bb->dfs_index); @@ -1387,8 +1387,8 @@ static void rename_vars (TransformData *td) { int next_stack_index = 0; - InterpBasicBlock **stack = (InterpBasicBlock**)g_malloc0 (sizeof (InterpBasicBlock*) * td->bblocks_count); - gboolean *bb_status = (gboolean*)g_malloc0 (sizeof (InterpBasicBlock*) * td->bblocks_count); + InterpBasicBlock **stack = (InterpBasicBlock**)g_malloc0 (sizeof (InterpBasicBlock*) * td->bblocks_count_no_eh); + gboolean *bb_status = (gboolean*)g_malloc0 (sizeof (InterpBasicBlock*) * td->bblocks_count_no_eh); stack [next_stack_index++] = td->entry_bb; @@ -1425,7 +1425,7 @@ rename_vars (TransformData *td) MonoBitSet *live_out_bblocks = td->renamed_fixed_vars [i].live_out_bblocks; if (live_out_bblocks) { int j; - mono_bitset_foreach_bit (live_out_bblocks, j, td->bblocks_count) { + mono_bitset_foreach_bit (live_out_bblocks, j, td->bblocks_count_no_eh) { g_print (" BB%d", td->bblocks [j]->index); } } @@ -2744,7 +2744,7 @@ interp_cprop (TransformData *td) gint32 *sregs; gint32 dreg; // LIVENESS_MARKER is set only for non-eh bblocks - if (bb->dfs_index >= td->bblocks_count || bb->dfs_index == -1 || (ins->flags & INTERP_INST_FLAG_LIVENESS_MARKER)) + if (bb->dfs_index >= td->bblocks_count_no_eh || bb->dfs_index == -1 || (ins->flags & 
INTERP_INST_FLAG_LIVENESS_MARKER)) current_liveness.ins_index++; if (interp_ins_is_nop (ins)) @@ -3337,7 +3337,7 @@ interp_super_instructions (TransformData *td) current_liveness.ins_index = 0; for (InterpInst *ins = bb->first_ins; ins != NULL; ins = ins->next) { int opcode = ins->opcode; - if (bb->dfs_index >= td->bblocks_count || bb->dfs_index == -1 || (ins->flags & INTERP_INST_FLAG_LIVENESS_MARKER)) + if (bb->dfs_index >= td->bblocks_count_no_eh || bb->dfs_index == -1 || (ins->flags & INTERP_INST_FLAG_LIVENESS_MARKER)) current_liveness.ins_index++; if (MINT_IS_NOP (opcode)) continue; @@ -3804,7 +3804,7 @@ interp_prepare_no_ssa_opt (TransformData *td) bb->dfs_index = i; i++; } - td->bblocks_count = 0; + td->bblocks_count_no_eh = 0; td->bblocks_count_eh = i; } diff --git a/src/mono/mono/mini/interp/transform.c b/src/mono/mono/mini/interp/transform.c index a49a56d3a11c0c..30488e00622fac 100644 --- a/src/mono/mono/mini/interp/transform.c +++ b/src/mono/mono/mini/interp/transform.c @@ -349,10 +349,11 @@ mono_mint_type (MonoType *type) return -1; } -// This doesn't allocate a new var, but marks the existing var as renamable, -// allocating space for additional var data. +// This marks the var as renamable, allocating space for additional data. +// The original var data (InterpVar) will have an index that points to this +// additional data. 
int -interp_create_renamable_var (TransformData *td, int var) +interp_make_var_renamable (TransformData *td, int var) { // Check if already allocated if (td->vars [var].ext_index != -1) @@ -4876,7 +4877,7 @@ generate_code (TransformData *td, MonoMethod *method, MonoMethodHeader *header, g_free (name); } - if (td->optimized) { + if (td->optimized && !td->disable_ssa) { // Add arg defining instructions for SSA machinery for (int i = 0; i < num_args; i++) { interp_add_ins (td, MINT_DEF_ARG); @@ -6225,8 +6226,10 @@ generate_code (TransformData *td, MonoMethod *method, MonoMethodHeader *header, // First arg is dummy var, it is null when passed to the ctor call_args [0] = interp_create_var (td, get_type_from_stack (stack_type [ret_mt], NULL)); - // Make sure this arg is defined for SSA optimizations - interp_add_ins (td, MINT_DEF); + if (!td->disable_ssa) { + // Make sure this arg is defined for SSA optimizations + interp_add_ins (td, MINT_DEF); + } td->last_ins->dreg = call_args [0]; for (int i = 0; i < csignature->param_count; i++) { call_args [i + 1] = td->sp [i].var; @@ -9016,11 +9019,11 @@ generate (MonoMethod *method, MonoMethodHeader *header, InterpMethod *rtm, MonoG g_free (td->var_values); td->var_values = NULL; } + interp_squash_initlocals (td); #if HOST_BROWSER if (mono_interp_opt & INTERP_OPT_JITERPRETER) jiterp_insert_entry_points (rtm, td); #endif - interp_squash_initlocals (td); } generate_compacted_code (rtm, td); diff --git a/src/mono/mono/mini/interp/transform.h b/src/mono/mono/mini/interp/transform.h index 46c119ab3a34d7..a10a517e99ebc5 100644 --- a/src/mono/mono/mini/interp/transform.h +++ b/src/mono/mono/mini/interp/transform.h @@ -323,7 +323,7 @@ typedef struct InterpBasicBlock **offset_to_bb; InterpBasicBlock *entry_bb, *cbb; InterpBasicBlock **bblocks; // ordering of bblocks in reverse postorder dfs - int bblocks_count; + int bblocks_count_no_eh; int bblocks_count_eh; InterpBasicBlock **idoms; // immediate dominator for each bblock, index from 
reverse postorder dfs int bb_count; @@ -530,7 +530,7 @@ int interp_create_var (TransformData *td, MonoType *type); int -interp_create_renamable_var (TransformData *td, int var); +interp_make_var_renamable (TransformData *td, int var); int interp_create_renamed_fixed_var (TransformData *td, int var_index, int renamable_var_index);