Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Adding support to mono for v128 constants #81902

Merged
merged 7 commits into from
Feb 15, 2023
Merged
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions src/mono/mono/mini/aot-compiler.c
Original file line number Diff line number Diff line change
Expand Up @@ -7005,6 +7005,13 @@ encode_patch (MonoAotCompile *acfg, MonoJumpInfo *patch_info, guint8 *buf, guint
encode_value (((guint32 *)patch_info->data.target) [MINI_LS_WORD_IDX], p, &p);
encode_value (((guint32 *)patch_info->data.target) [MINI_MS_WORD_IDX], p, &p);
break;
case MONO_PATCH_INFO_X128:
case MONO_PATCH_INFO_X128_GOT:
encode_value (((guint32 *)patch_info->data.target) [(MINI_LS_WORD_IDX * 2) + MINI_LS_WORD_IDX], p, &p);
encode_value (((guint32 *)patch_info->data.target) [(MINI_LS_WORD_IDX * 2) + MINI_MS_WORD_IDX], p, &p);
encode_value (((guint32 *)patch_info->data.target) [(MINI_MS_WORD_IDX * 2) + MINI_LS_WORD_IDX], p, &p);
vargaz marked this conversation as resolved.
Show resolved Hide resolved
encode_value (((guint32 *)patch_info->data.target) [(MINI_MS_WORD_IDX * 2) + MINI_MS_WORD_IDX], p, &p);
Comment on lines +7008 to +7013
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

AFAIK, there isn't any SIMD support for BE architectures today.

It's possibly this needs to track the underlying element type and splat each T in memory correctly instead.

break;
case MONO_PATCH_INFO_VTABLE:
case MONO_PATCH_INFO_CLASS:
case MONO_PATCH_INFO_IID:
Expand Down
20 changes: 20 additions & 0 deletions src/mono/mono/mini/aot-runtime.c
Original file line number Diff line number Diff line change
Expand Up @@ -3842,6 +3842,26 @@ decode_patch (MonoAotModule *aot_module, MonoMemPool *mp, MonoJumpInfo *ji, guin
*(double*)ji->data.target = *(double*)&v;
break;
}
case MONO_PATCH_INFO_X128:
case MONO_PATCH_INFO_X128_GOT: {
guint32 val [4];
guint64 v[2];

// FIXME: Align to 16 bytes ?
ji->data.target = mono_mem_manager_alloc0 (mem_manager, 16);

val [0] = decode_value (p, &p);
val [1] = decode_value (p, &p);
val [2] = decode_value (p, &p);
val [3] = decode_value (p, &p);

v[0] = ((guint64)val [1] << 32) | ((guint64)val [0]);
((guint64*)ji->data.target)[0] = v[0];

v[1] = ((guint64)val [3] << 32) | ((guint64)val [2]);
((guint64*)ji->data.target)[1] = v[1];
break;
}
case MONO_PATCH_INFO_LDSTR:
image = load_image (aot_module, decode_value (p, &p), error);
mono_error_cleanup (error); /* FIXME don't swallow the error */
Expand Down
9 changes: 7 additions & 2 deletions src/mono/mono/mini/cfold.c
Original file line number Diff line number Diff line change
Expand Up @@ -300,10 +300,15 @@ mono_constant_fold_ins (MonoCompile *cfg, MonoInst *ins, MonoInst *arg1, MonoIns
}
break;
case OP_XMOVE:
if (arg1->opcode == OP_XZERO) {
if ((arg1->opcode == OP_XZERO) || (arg1->opcode == OP_XONES)) {
ALLOC_DEST (cfg, dest, ins);
dest->opcode = OP_XZERO;
dest->opcode = arg1->opcode;
dest->sreg1 = -1;
} else if (arg1->opcode == OP_XCONST) {
ALLOC_DEST (cfg, dest, ins);
dest->opcode = arg1->opcode;
dest->sreg1 = -1;
dest->inst_p0 = arg1->inst_p0;
}
break;
case OP_COMPARE:
Expand Down
1 change: 1 addition & 0 deletions src/mono/mono/mini/cpu-amd64.mdesc
Original file line number Diff line number Diff line change
Expand Up @@ -769,6 +769,7 @@ cvttps2dq: dest:x src1:x len:5 clob:1
xmove: dest:x src1:x len:5
xzero: dest:x len:5
xones: dest:x len:5
xconst: dest:x len:12

iconv_to_x: dest:x src1:i len:5
extract_i4: dest:i src1:x len:5
Expand Down
1 change: 1 addition & 0 deletions src/mono/mono/mini/cpu-x86.mdesc
Original file line number Diff line number Diff line change
Expand Up @@ -630,6 +630,7 @@ cvttps2dq: dest:x src1:x len:4 clob:1
xmove: dest:x src1:x len:4
xzero: dest:x len:4
xones: dest:x len:4
xconst: dest:x len:24
tannergooding marked this conversation as resolved.
Show resolved Hide resolved

iconv_to_x: dest:x src1:i len:4
extract_i4: dest:i src1:x len:4
Expand Down
4 changes: 2 additions & 2 deletions src/mono/mono/mini/liveness.c
Original file line number Diff line number Diff line change
Expand Up @@ -678,7 +678,7 @@ optimize_initlocals (MonoCompile *cfg)
//printf ("DEAD: "); mono_print_ins (ins);
if (cfg->disable_initlocals_opt_refs && var->type == STACK_OBJ)
continue;
if ((ins->opcode == OP_ICONST) || (ins->opcode == OP_I8CONST) || (ins->opcode == OP_R8CONST) || (ins->opcode == OP_R4CONST)) {
if ((ins->opcode == OP_ICONST) || (ins->opcode == OP_I8CONST) || (ins->opcode == OP_R8CONST) || (ins->opcode == OP_R4CONST) || (ins->opcode == OP_XCONST)) {
NULLIFY_INS (ins);
MONO_VARINFO (cfg, var->inst_c0)->spill_costs -= 1;
/*
Expand Down Expand Up @@ -874,7 +874,7 @@ update_liveness2 (MonoCompile *cfg, MonoInst *ins, gboolean set_volatile, int in
}
else {
/* Try dead code elimination */
if (!cfg->disable_deadce_vars && (var != cfg->ret) && !(var->flags & (MONO_INST_VOLATILE|MONO_INST_INDIRECT)) && ((ins->opcode == OP_ICONST) || (ins->opcode == OP_I8CONST) || (ins->opcode == OP_R8CONST)) && !(var->flags & MONO_INST_VOLATILE)) {
if (!cfg->disable_deadce_vars && (var != cfg->ret) && !(var->flags & (MONO_INST_VOLATILE|MONO_INST_INDIRECT)) && ((ins->opcode == OP_ICONST) || (ins->opcode == OP_I8CONST) || (ins->opcode == OP_R8CONST) || (ins->opcode == OP_XCONST)) && !(var->flags & MONO_INST_VOLATILE)) {
LIVENESS_DEBUG (printf ("\tdead def of R%d, eliminated\n", ins->dreg));
NULLIFY_INS (ins);
return;
Expand Down
1 change: 1 addition & 0 deletions src/mono/mono/mini/method-to-ir.c
Original file line number Diff line number Diff line change
Expand Up @@ -12640,6 +12640,7 @@ mono_op_no_side_effects (int opcode)
case OP_VZERO:
case OP_XZERO:
case OP_XONES:
case OP_XCONST:
case OP_ICONST:
case OP_I8CONST:
case OP_ADD_IMM:
Expand Down
29 changes: 25 additions & 4 deletions src/mono/mono/mini/mini-amd64.c
Original file line number Diff line number Diff line change
Expand Up @@ -7296,6 +7296,17 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb)
case OP_XONES:
amd64_sse_pcmpeqb_reg_reg (code, ins->dreg, ins->dreg);
break;
case OP_XCONST: {
if (cfg->compile_aot && cfg->code_exec_only) {
mono_add_patch_info (cfg, offset, MONO_PATCH_INFO_X128_GOT, ins->inst_p0);
amd64_mov_reg_membase (code, AMD64_R11, AMD64_RIP, 0, sizeof(gpointer));
amd64_sse_movups_reg_membase (code, ins->dreg, AMD64_R11, 0);
} else {
mono_add_patch_info (cfg, offset, MONO_PATCH_INFO_X128, ins->inst_p0);
amd64_sse_movups_reg_membase (code, ins->dreg, AMD64_RIP, 0);
}
break;
}
case OP_ICONV_TO_R4_RAW:
amd64_movd_xreg_reg_size (code, ins->dreg, ins->sreg1, 4);
if (!cfg->r4fp)
Expand Down Expand Up @@ -8140,11 +8151,13 @@ mono_arch_emit_exceptions (MonoCompile *cfg)
for (patch_info = cfg->patch_info; patch_info; patch_info = patch_info->next) {
if (patch_info->type == MONO_PATCH_INFO_EXC)
code_size += 40;
if (patch_info->type == MONO_PATCH_INFO_R8)
else if (patch_info->type == MONO_PATCH_INFO_X128)
code_size += 16 + 15; /* sizeof (Vector128<T>) + alignment */
else if (patch_info->type == MONO_PATCH_INFO_R8)
code_size += 8 + 15; /* sizeof (double) + alignment */
if (patch_info->type == MONO_PATCH_INFO_R4)
else if (patch_info->type == MONO_PATCH_INFO_R4)
code_size += 4 + 15; /* sizeof (float) + alignment */
if (patch_info->type == MONO_PATCH_INFO_GC_CARD_TABLE_ADDR)
else if (patch_info->type == MONO_PATCH_INFO_GC_CARD_TABLE_ADDR)
Comment on lines +8154 to +8160
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

As per the FIXME comment below, the alignment here only needs to be 8 for double and 4 for float. But some other logic was utilizing R4/R8 for packed SIMD instructions.

That should ideally be fixed and those updated to properly use X128 constants to save on space and help ensure correctness.

code_size += 8 + 7; /*sizeof (void*) + alignment */
}

Expand Down Expand Up @@ -8213,11 +8226,16 @@ mono_arch_emit_exceptions (MonoCompile *cfg)
guint8 *orig_code = code;

switch (patch_info->type) {
case MONO_PATCH_INFO_X128:
case MONO_PATCH_INFO_R8:
case MONO_PATCH_INFO_R4: {
guint8 *pos, *patch_pos;
guint32 target_pos;

// FIXME: R8 and R4 only need 8 and 4 byte alignment, respectively
// They should be handled separately with anything needing 16 byte
// alignment using X128

/* The SSE opcodes require a 16 byte alignment */
code = (guint8*)ALIGN_TO (code, 16);

Expand All @@ -8231,7 +8249,10 @@ mono_arch_emit_exceptions (MonoCompile *cfg)
target_pos = GPTRDIFF_TO_UINT32 (code - pos - 8);
}

if (patch_info->type == MONO_PATCH_INFO_R8) {
if (patch_info->type == MONO_PATCH_INFO_X128) {
memcpy (code, patch_info->data.target, 16);
code += 16;
} else if (patch_info->type == MONO_PATCH_INFO_R8) {
*(double*)code = *(double*)patch_info->data.target;
code += sizeof (double);
} else {
Expand Down
80 changes: 80 additions & 0 deletions src/mono/mono/mini/mini-llvm.c
Original file line number Diff line number Diff line change
Expand Up @@ -7844,6 +7844,86 @@ MONO_RESTORE_WARNING
values [ins->dreg] = LLVMConstAllOnes (type_to_llvm_type (ctx, m_class_get_byval_arg (ins->klass)));
break;
}
case OP_XCONST: {
MonoTypeEnum etype;
int ecount;

const char *class_name = m_class_get_name (ins->klass);

if (!strcmp(class_name, "Vector64`1") || !strcmp (class_name, "Vector128`1") || !strcmp (class_name, "Vector256`1") || !strcmp (class_name, "Vector512`1")) {
MonoType *element_type = mono_class_get_context (ins->klass)->class_inst->type_argv [0];
etype = element_type->type;
ecount = mono_class_value_size (ins->klass, NULL) / mono_class_value_size (mono_class_from_mono_type_internal(element_type), NULL);
} else if (!strcmp(class_name, "Vector4") || !strcmp(class_name, "Plane") || !strcmp(class_name, "Quaternion")) {
etype = MONO_TYPE_R4;
ecount = 4;
} else {
g_assert_not_reached ();
}
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Just wanted to leave a more general comment that Mono currently has to do these class name and type checks to determine things.

It would be beneficial if there was a way to track that the simdType and simdBaseType. In RyuJIT, we have TYP_SIMD8/12/16/32/64 and we then track the primitive base type as a separate field.

This allows better codegen, better specialization, and all-over cheaper checks. Even if this was just some helper method that cached a klass to simd info lookup, I think it would improve/simplify things.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

There is a simd_class_to_llvm_type () function which can be used here.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks for the reference @vargaz!

I'm not sure that really addresses the backing issue of there still needing to be string comparisons to do the checks in the first place and it doesn't help things on the MonoJIT side.

My comment was more about the general cost of going from a klass to the respective SimdType and SimdBaseType and how if Mono had a different way to track this it could be done cheaply once at import.

Subsequent handling, including simd_class_to_llvm_type could then be made cheaper.


LLVMTypeRef llvm_type = primitive_type_to_llvm_type(etype);
LLVMValueRef vals [64];

switch (etype) {
case MONO_TYPE_I1:
case MONO_TYPE_U1: {
guint8* v = (guint8*)ins->inst_p0;

for (int i = 0; i < ecount; ++i) {
vals [i] = LLVMConstInt (llvm_type, v[i], FALSE);
}
break;
}
case MONO_TYPE_I2:
case MONO_TYPE_U2: {
guint16* v = (guint16*)ins->inst_p0;

for (int i = 0; i < ecount; ++i) {
vals [i] = LLVMConstInt (llvm_type, v[i], FALSE);
}
break;
}
case MONO_TYPE_I4:
case MONO_TYPE_U4: {
guint32* v = (guint32*)ins->inst_p0;

for (int i = 0; i < ecount; ++i) {
vals [i] = LLVMConstInt (llvm_type, v[i], FALSE);
}
break;
}
case MONO_TYPE_I8:
case MONO_TYPE_U8: {
guint64* v = (guint64*)ins->inst_p0;

for (int i = 0; i < ecount; ++i) {
vals [i] = LLVMConstInt (llvm_type, v[i], FALSE);
}
break;
}
case MONO_TYPE_R4: {
float* v = (float*)ins->inst_p0;

for (int i = 0; i < ecount; ++i) {
vals [i] = LLVMConstReal (llvm_type, v[i]);
}
break;
}
case MONO_TYPE_R8: {
double* v = (double*)ins->inst_p0;

for (int i = 0; i < ecount; ++i) {
vals [i] = LLVMConstReal (llvm_type, v[i]);
}
break;
}
default:
g_assert_not_reached ();
}

values [ins->dreg] = LLVMConstVector (vals, ecount);
break;
}
case OP_LOADX_MEMBASE: {
LLVMTypeRef t = type_to_llvm_type (ctx, m_class_get_byval_arg (ins->klass));
LLVMValueRef src;
Expand Down
1 change: 1 addition & 0 deletions src/mono/mono/mini/mini-ops.h
Original file line number Diff line number Diff line change
Expand Up @@ -1170,6 +1170,7 @@ MINI_OP(OP_CREATE_SCALAR, "create_scalar", XREG, XREG, NONE)
MINI_OP(OP_XMOVE, "xmove", XREG, XREG, NONE)
MINI_OP(OP_XZERO, "xzero", XREG, NONE, NONE)
MINI_OP(OP_XONES, "xones", XREG, NONE, NONE)
MINI_OP(OP_XCONST, "xconst", XREG, NONE, NONE)
MINI_OP(OP_XPHI, "xphi", XREG, NONE, NONE)

/*
Expand Down
4 changes: 4 additions & 0 deletions src/mono/mono/mini/mini-runtime.c
Original file line number Diff line number Diff line change
Expand Up @@ -1281,6 +1281,8 @@ mono_patch_info_hash (gconstpointer data)
}
case MONO_PATCH_INFO_GSHAREDVT_IN_WRAPPER:
return hash | mono_signature_hash (ji->data.sig);
case MONO_PATCH_INFO_X128_GOT:
return hash | (guint32)*(double*)ji->data.target;
case MONO_PATCH_INFO_R8_GOT:
return hash | (guint32)*(double*)ji->data.target;
Comment on lines +1284 to 1287
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is how the hashes are currently being built for R4/R8, but they don't necessarily make "sense" to me.

This isn't a "good" hash since it just or's bits together, it doesn't take into account the "upper bits", and it relies on undefined behavior in the case the double or float is NaN/Infinity/out of range.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It's not the best, but its just used during JITting/AOTing, so it's not a problem in practice.

case MONO_PATCH_INFO_R4_GOT:
Expand Down Expand Up @@ -1605,6 +1607,8 @@ mono_resolve_patch_target_ext (MonoMemoryManager *mem_manager, MonoMethod *metho
case MONO_PATCH_INFO_R4_GOT:
case MONO_PATCH_INFO_R8:
case MONO_PATCH_INFO_R8_GOT:
case MONO_PATCH_INFO_X128:
case MONO_PATCH_INFO_X128_GOT:
target = patch_info->data.target;
break;
case MONO_PATCH_INFO_EXC_NAME:
Expand Down
6 changes: 4 additions & 2 deletions src/mono/mono/mini/patch-info.h
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ PATCH_INFO(LDTOKEN, "ldtoken")
PATCH_INFO(TYPE_FROM_HANDLE, "type_from_handle")
PATCH_INFO(R4, "r4")
PATCH_INFO(R8, "r8")
PATCH_INFO(X128, "x128")
PATCH_INFO(IP, "ip")
PATCH_INFO(IID, "iid")
PATCH_INFO(ADJUSTED_IID, "adjusted_iid")
Expand Down Expand Up @@ -76,12 +77,13 @@ PATCH_INFO(SPECIFIC_TRAMPOLINES, "specific_trampolines")
PATCH_INFO(SPECIFIC_TRAMPOLINES_GOT_SLOTS_BASE, "specific_trampolines_got_slots_base")

/*
* PATCH_INFO_R4/R8 is handled by mono_arch_emit_exceptions () by emitting the data
* into the text segment after the method body. These patches emit the data
* PATCH_INFO_* is handled by mono_arch_emit_exceptions () by emitting the data
* into the text segment after the method body. PATCH_INFO_*_GOT emit the data
* elsewhere.
*/
PATCH_INFO(R8_GOT, "r8_got")
PATCH_INFO(R4_GOT, "r4_got")
PATCH_INFO(X128_GOT, "x128_got")

/* MonoMethod* -> the address of a memory slot which is used to cache the pinvoke address */
PATCH_INFO(METHOD_PINVOKE_ADDR_CACHE, "pinvoke_addr_cache")
Expand Down
Loading