Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Expose various Convert intrinsics for Avx512F, Avx512BW, and Avx512DQ #85281

Merged
merged 14 commits into from
Apr 25, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
41 changes: 35 additions & 6 deletions src/coreclr/jit/codegenxarch.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -5658,18 +5658,47 @@ void CodeGen::genCodeForStoreInd(GenTreeStoreInd* tree)
break;
}

case NI_AVX512F_ConvertToVector256Int32:
case NI_AVX512F_ConvertToVector256UInt32:
case NI_AVX512F_VL_ConvertToVector128UInt32:
case NI_AVX512F_VL_ConvertToVector128UInt32WithSaturation:
{
assert(!varTypeIsFloating(baseType));
FALLTHROUGH;
}

case NI_AVX512F_ConvertToVector128Byte:
case NI_AVX512F_ConvertToVector128ByteWithSaturation:
case NI_AVX512F_ConvertToVector128Int16:
case NI_AVX512F_ConvertToVector128Int32:
case NI_AVX512F_ConvertToVector128Int16WithSaturation:
case NI_AVX512F_ConvertToVector128SByte:
case NI_AVX512F_ConvertToVector128SByteWithSaturation:
case NI_AVX512F_ConvertToVector128UInt16:
case NI_AVX512F_ConvertToVector128UInt32:
case NI_AVX512F_ConvertToVector128UInt16WithSaturation:
case NI_AVX512F_ConvertToVector256Int16:
case NI_AVX512F_ConvertToVector256Int32:
case NI_AVX512F_ConvertToVector256Int16WithSaturation:
case NI_AVX512F_ConvertToVector256Int32WithSaturation:
case NI_AVX512F_ConvertToVector256UInt16:
case NI_AVX512F_ConvertToVector256UInt32:
case NI_AVX512BW_ConvertToVector128Byte:
case NI_AVX512BW_ConvertToVector128SByte:
case NI_AVX512F_ConvertToVector256UInt16WithSaturation:
case NI_AVX512F_ConvertToVector256UInt32WithSaturation:
case NI_AVX512F_VL_ConvertToVector128Byte:
case NI_AVX512F_VL_ConvertToVector128ByteWithSaturation:
case NI_AVX512F_VL_ConvertToVector128Int16:
case NI_AVX512F_VL_ConvertToVector128Int16WithSaturation:
case NI_AVX512F_VL_ConvertToVector128Int32:
case NI_AVX512F_VL_ConvertToVector128Int32WithSaturation:
case NI_AVX512F_VL_ConvertToVector128SByte:
case NI_AVX512F_VL_ConvertToVector128SByteWithSaturation:
case NI_AVX512F_VL_ConvertToVector128UInt16:
case NI_AVX512F_VL_ConvertToVector128UInt16WithSaturation:
case NI_AVX512BW_ConvertToVector256Byte:
case NI_AVX512BW_ConvertToVector256ByteWithSaturation:
case NI_AVX512BW_ConvertToVector256SByte:
case NI_AVX512BW_ConvertToVector256SByteWithSaturation:
case NI_AVX512BW_VL_ConvertToVector128Byte:
case NI_AVX512BW_VL_ConvertToVector128ByteWithSaturation:
case NI_AVX512BW_VL_ConvertToVector128SByte:
case NI_AVX512BW_VL_ConvertToVector128SByteWithSaturation:
{
// These intrinsics are "ins reg/mem, xmm"
ins = HWIntrinsicInfo::lookupIns(intrinsicId, baseType);
Expand Down
69 changes: 61 additions & 8 deletions src/coreclr/jit/emit.h
Original file line number Diff line number Diff line change
Expand Up @@ -1905,7 +1905,7 @@ class emitter
ssize_t emitGetInsCIdisp(instrDesc* id);
unsigned emitGetInsCIargs(instrDesc* id);

inline static emitAttr emitGetMemOpSize(instrDesc* id);
inline emitAttr emitGetMemOpSize(instrDesc* id) const;

// Return the argument count for a direct call "id".
int emitGetInsCDinfo(instrDesc* id);
Expand Down Expand Up @@ -3456,11 +3456,12 @@ inline unsigned emitter::emitGetInsCIargs(instrDesc* id)
// Arguments:
// id - Instruction descriptor
//
/* static */ emitAttr emitter::emitGetMemOpSize(instrDesc* id)
emitAttr emitter::emitGetMemOpSize(instrDesc* id) const
{
emitAttr defaultSize = id->idOpSize();
emitAttr defaultSize = id->idOpSize();
instruction ins = id->idIns();

switch (id->idIns())
switch (ins)
{
case INS_pextrb:
case INS_pinsrb:
Expand Down Expand Up @@ -3570,9 +3571,6 @@ inline unsigned emitter::emitGetInsCIargs(instrDesc* id)

case INS_cvtdq2pd:
case INS_cvtps2pd:
case INS_vpmovdw:
case INS_vpmovqd:
case INS_vpmovwb:
{
if (defaultSize == 64)
{
Expand All @@ -3589,6 +3587,57 @@ inline unsigned emitter::emitGetInsCIargs(instrDesc* id)
}
}

case INS_vpmovdb:
case INS_vpmovdw:
case INS_vpmovqb:
case INS_vpmovqd:
case INS_vpmovqw:
case INS_vpmovwb:
case INS_vpmovsdb:
case INS_vpmovsdw:
case INS_vpmovsqb:
case INS_vpmovsqd:
case INS_vpmovsqw:
case INS_vpmovswb:
case INS_vpmovusdb:
case INS_vpmovusdw:
case INS_vpmovusqb:
case INS_vpmovusqd:
case INS_vpmovusqw:
case INS_vpmovuswb:
{
insTupleType tupleType = insTupleTypeInfo(ins);
unsigned memSize = 0;

switch (tupleType)
{
case INS_TT_HALF_MEM:
{
memSize = defaultSize / 2;
break;
}

case INS_TT_QUARTER_MEM:
{
memSize = defaultSize / 4;
break;
}

case INS_TT_EIGHTH_MEM:
{
memSize = defaultSize / 8;
break;
}

default:
{
unreached();
}
}

return EA_ATTR(memSize);
}

case INS_vbroadcastf128:
case INS_vbroadcasti128:
case INS_vextractf128:
Expand All @@ -3613,7 +3662,11 @@ inline unsigned emitter::emitGetInsCIargs(instrDesc* id)

case INS_movddup:
{
if (defaultSize == 32)
if (defaultSize == 64)
{
return EA_64BYTE;
}
else if (defaultSize == 32)
{
return EA_32BYTE;
}
Expand Down
100 changes: 88 additions & 12 deletions src/coreclr/jit/emitxarch.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1362,6 +1362,10 @@ bool emitter::TakesRexWPrefix(const instrDesc* id) const
case INS_shlx:
case INS_shrx:
#endif // TARGET_AMD64
case INS_vcvtsd2usi:
case INS_vcvtss2usi:
case INS_vcvttsd2usi:
case INS_vcvttss2usi:
{
if (attr == EA_8BYTE)
{
Expand Down Expand Up @@ -2582,6 +2586,10 @@ bool emitter::emitInsCanOnlyWriteSSE2OrAVXReg(instrDesc* id)
case INS_sarx:
case INS_shrx:
#endif
case INS_vcvtsd2usi:
case INS_vcvtss2usi:
case INS_vcvttsd2usi:
case INS_vcvttss2usi:
{
// These SSE instructions write to a general purpose integer register.
return false;
Expand Down Expand Up @@ -3010,7 +3018,7 @@ inline bool hasTupleTypeInfo(instruction ins)
// Return Value:
// the tuple type info for a given CPU instruction.
//
inline insTupleType insTupleTypeInfo(instruction ins)
insTupleType emitter::insTupleTypeInfo(instruction ins) const
{
assert((unsigned)ins < ArrLen(insTupleTypeInfos));
assert(insTupleTypeInfos[ins] != INS_TT_NONE);
Expand All @@ -3020,9 +3028,9 @@ inline insTupleType insTupleTypeInfo(instruction ins)
// Return true if the instruction uses the SSE38 or SSE3A macro in instrsXArch.h.
bool emitter::EncodedBySSE38orSSE3A(instruction ins) const
{
const size_t SSE38 = 0x0F660038;
const size_t SSE3A = 0x0F66003A;
const size_t MASK = 0xFFFF00FF;
const size_t SSE38 = 0x0F000038;
const size_t SSE3A = 0x0F00003A;
const size_t MASK = 0xFF0000FF;

size_t insCode = 0;

Expand All @@ -3044,8 +3052,19 @@ bool emitter::EncodedBySSE38orSSE3A(instruction ins) const
insCode = insCodeMR(ins);
}

insCode &= MASK;
return insCode == SSE38 || insCode == SSE3A;
size_t mskCode = insCode & MASK;

if ((mskCode != SSE38) && (mskCode != SSE3A))
{
return false;
}

#if defined(DEBUG)
insCode = (insCode >> 16) & 0xFF;
assert((insCode == 0x66) || (insCode == 0xF2) || (insCode == 0xF3));
#endif // DEBUG

return true;
}

/*****************************************************************************
Expand Down Expand Up @@ -11214,6 +11233,10 @@ void emitter::emitDispIns(
case INS_cvtss2si:
case INS_cvtsd2si:
case INS_cvttss2si:
case INS_vcvtsd2usi:
case INS_vcvtss2usi:
case INS_vcvttsd2usi:
case INS_vcvttss2usi:
{
printf(" %s, %s", emitRegName(id->idReg1(), attr), emitRegName(id->idReg2(), EA_16BYTE));
break;
Expand Down Expand Up @@ -15528,9 +15551,9 @@ ssize_t emitter::TryEvexCompressDisp8Byte(instrDesc* id, ssize_t dsp, bool* dspI
disp8Compression = inputSize * 4;
break;
case INS_TT_TUPLE8:
// N = input size in bytes * 4, 32bit for 512 only
// N = input size in bytes * 8, 32bit for 512 only
assert((inputSize == 4 && vectorLength >= 64));
disp8Compression = inputSize * 4;
disp8Compression = inputSize * 8;
break;
case INS_TT_HALF_MEM:
// N = vector length in bytes / 2
Expand Down Expand Up @@ -17825,11 +17848,39 @@ emitter::insExecutionCharacteristics emitter::getInsExecutionCharacteristics(ins
case INS_cvttps2dq:
case INS_cvtps2dq:
case INS_cvtdq2ps:
case INS_vcvtpd2qq:
case INS_vcvtpd2uqq:
case INS_vcvtps2udq:
case INS_vcvtqq2pd:
case INS_vcvttps2udq:
case INS_vcvtudq2ps:
case INS_vcvttpd2qq:
case INS_vcvttpd2uqq:
case INS_vcvtuqq2pd:
result.insThroughput = PERFSCORE_THROUGHPUT_2X;
result.insLatency += PERFSCORE_LATENCY_4C;
break;

case INS_vpmovdb:
case INS_vpmovdw:
case INS_vpmovqb:
case INS_vpmovqd:
case INS_vpmovqw:
case INS_vpmovsdb:
case INS_vpmovsdw:
case INS_vpmovsqb:
case INS_vpmovsqd:
case INS_vpmovsqw:
case INS_vpmovswb:
case INS_vpmovusdb:
case INS_vpmovusdw:
case INS_vpmovusqb:
case INS_vpmovusqd:
case INS_vpmovusqw:
case INS_vpmovuswb:
case INS_vpmovwb:
result.insThroughput = PERFSCORE_THROUGHPUT_2X;
result.insLatency += PERFSCORE_LATENCY_4C;
result.insThroughput = PERFSCORE_THROUGHPUT_2C;
result.insLatency += (opSize == EA_16BYTE) ? PERFSCORE_LATENCY_2C : PERFSCORE_LATENCY_4C;
break;

case INS_haddps:
Expand Down Expand Up @@ -17892,12 +17943,20 @@ emitter::insExecutionCharacteristics emitter::getInsExecutionCharacteristics(ins
case INS_cvtsi2ss32:
case INS_cvtsi2sd64:
case INS_cvtsi2ss64:
case INS_vcvtsd2usi:
case INS_vcvttsd2usi:
case INS_vcvtusi2sd32:
case INS_vcvtusi2sd64:
case INS_vcvtusi2ss32:
case INS_vcvtusi2ss64:
result.insThroughput = PERFSCORE_THROUGHPUT_1C;
result.insLatency += PERFSCORE_LATENCY_7C;
break;

case INS_cvttss2si:
case INS_cvtss2si:
case INS_vcvtss2usi:
case INS_vcvttss2usi:
result.insThroughput = PERFSCORE_THROUGHPUT_1C;
result.insLatency += opSize == EA_8BYTE ? PERFSCORE_LATENCY_8C : PERFSCORE_LATENCY_7C;
break;
Expand Down Expand Up @@ -18241,6 +18300,15 @@ emitter::insExecutionCharacteristics emitter::getInsExecutionCharacteristics(ins
case INS_cvtdq2pd:
case INS_cvtpd2ps:
case INS_cvttpd2dq:
case INS_vcvtpd2udq:
case INS_vcvtps2qq:
case INS_vcvtps2uqq:
case INS_vcvtqq2ps:
case INS_vcvttpd2udq:
case INS_vcvttps2qq:
case INS_vcvttps2uqq:
case INS_vcvtudq2pd:
case INS_vcvtuqq2ps:
result.insThroughput = PERFSCORE_THROUGHPUT_1C;
result.insLatency += opSize == EA_32BYTE ? PERFSCORE_LATENCY_7C : PERFSCORE_LATENCY_5C;
break;
Expand Down Expand Up @@ -18282,17 +18350,25 @@ emitter::insExecutionCharacteristics emitter::getInsExecutionCharacteristics(ins
case INS_vpbroadcastq_gpr:
case INS_vbroadcasti128:
case INS_vbroadcastf128:
case INS_vbroadcastf64x2:
case INS_vbroadcasti64x2:
case INS_vbroadcastf64x4:
case INS_vbroadcasti64x4:
case INS_vbroadcastf32x2:
case INS_vbroadcasti32x2:
case INS_vbroadcastf32x8:
case INS_vbroadcasti32x8:
case INS_vbroadcastss:
case INS_vbroadcastsd:
if (memAccessKind == PERFSCORE_MEMORY_NONE)
{
result.insThroughput = PERFSCORE_THROUGHPUT_1C;
result.insLatency = opSize == EA_32BYTE ? PERFSCORE_LATENCY_3C : PERFSCORE_LATENCY_1C;
result.insLatency = opSize == EA_16BYTE ? PERFSCORE_LATENCY_1C : PERFSCORE_LATENCY_3C;
}
else
{
result.insThroughput = PERFSCORE_THROUGHPUT_2X;
result.insLatency += opSize == EA_32BYTE ? PERFSCORE_LATENCY_3C : PERFSCORE_LATENCY_2C;
result.insLatency += opSize == EA_16BYTE ? PERFSCORE_LATENCY_2C : PERFSCORE_LATENCY_3C;
if (ins == INS_vpbroadcastb || ins == INS_vpbroadcastw)
{
result.insLatency += PERFSCORE_LATENCY_1C;
Expand Down
2 changes: 2 additions & 0 deletions src/coreclr/jit/emitxarch.h
Original file line number Diff line number Diff line change
Expand Up @@ -183,6 +183,8 @@ code_t AddVexPrefixIfNeededAndNotPresent(instruction ins, code_t code, emitAttr
return code;
}

insTupleType insTupleTypeInfo(instruction ins) const;

//------------------------------------------------------------------------
// HasKMaskRegisterDest: Temporary check to identify instructions that can
// be Evex encoded but require Opmask(KMask) register support.
Expand Down
Loading