Skip to content

Commit

Permalink
Fix may_alias::yes in partition tunings, offset::size selection and p…
Browse files Browse the repository at this point in the history
…ass template parameter to Nominal4BItemsToItems call
  • Loading branch information
gonidelis authored and bernhardmgruber committed Feb 4, 2025
1 parent 571fc23 commit 8fbe0f9
Show file tree
Hide file tree
Showing 2 changed files with 53 additions and 34 deletions.
2 changes: 1 addition & 1 deletion cub/cub/device/dispatch/dispatch_select_if.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -422,7 +422,7 @@ template <typename InputIteratorT,
typename PolicyHub =
detail::select::policy_hub<detail::value_t<InputIteratorT>,
detail::value_t<FlagsInputIteratorT>,
detail::select::per_partition_offset_t,
OffsetT,
detail::select::is_partition_distinct_output_t<SelectedOutputIteratorT>::value,
(SelectionOpt == SelectImpl::SelectPotentiallyInPlace),
(SelectionOpt == SelectImpl::Partition)>>
Expand Down
85 changes: 52 additions & 33 deletions cub/cub/device/dispatch/tuning/tuning_select_if.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -869,7 +869,7 @@ struct sm100_tuning<Input,
offset_size::_4,
primitive::yes,
input_size::_1,
may_alias::yes,
may_alias::no,
distinct_partitions::yes>
{
// trp_0.ld_0.ipt_15.tpb_608.ns_676.dcid_7.l2w_500 1.171303 1.042818 1.175890 1.456731
Expand All @@ -887,7 +887,7 @@ struct sm100_tuning<Input,
offset_size::_4,
primitive::yes,
input_size::_2,
may_alias::yes,
may_alias::no,
distinct_partitions::yes>
{
// trp_0.ld_0.ipt_22.tpb_320.ns_1756.dcid_6.l2w_615 1.206387 1.079118 1.202408 1.307692
Expand All @@ -905,7 +905,7 @@ struct sm100_tuning<Input,
offset_size::_4,
primitive::yes,
input_size::_4,
may_alias::yes,
may_alias::no,
distinct_partitions::yes>
{
// trp_1.ld_0.ipt_19.tpb_320.ns_716.dcid_5.l2w_570 1.177521 1.123348 1.177703 1.307692
Expand All @@ -923,7 +923,7 @@ struct sm100_tuning<Input,
offset_size::_4,
primitive::yes,
input_size::_8,
may_alias::yes,
may_alias::no,
distinct_partitions::yes>
{
// trp_1.ld_0.ipt_20.tpb_416.ns_1672.dcid_7.l2w_1050 1.086221 0.977775 1.090731 1.257618
Expand All @@ -941,7 +941,7 @@ struct sm100_tuning<Input,
offset_size::_8,
primitive::yes,
input_size::_1,
may_alias::yes,
may_alias::no,
distinct_partitions::yes>
{
// trp_0.ld_0.ipt_22.tpb_576.ns_368.dcid_7.l2w_680 1.191750 0.990521 1.175654 1.433174
Expand All @@ -959,7 +959,7 @@ struct sm100_tuning<Input,
offset_size::_8,
primitive::yes,
input_size::_2,
may_alias::yes,
may_alias::no,
distinct_partitions::yes>
{
// trp_1.ld_0.ipt_20.tpb_608.ns_516.dcid_7.l2w_635 1.244961 0.848558 1.212567 1.461538
Expand All @@ -977,7 +977,7 @@ struct sm100_tuning<Input,
offset_size::_8,
primitive::yes,
input_size::_4,
may_alias::yes,
may_alias::no,
distinct_partitions::yes>
{
// trp_1.ld_0.ipt_18.tpb_608.ns_1712.dcid_5.l2w_825 1.255078 0.990588 1.231055 1.421176
Expand All @@ -995,7 +995,7 @@ struct sm100_tuning<Input,
offset_size::_8,
primitive::yes,
input_size::_8,
may_alias::yes,
may_alias::no,
distinct_partitions::yes>
{
// trp_1.ld_0.ipt_14.tpb_512.ns_1468.dcid_7.l2w_820 1.111830 1.011070 1.119481 1.245868
Expand All @@ -1013,7 +1013,7 @@ struct sm100_tuning<Input,
offset_size::_4,
primitive::yes,
input_size::_1,
may_alias::yes,
may_alias::no,
distinct_partitions::no>
{
// trp_0.ld_0.ipt_22.tpb_224.ns_68.dcid_2.l2w_990 1.151989 1.064433 1.146707 1.305288
Expand All @@ -1031,7 +1031,7 @@ struct sm100_tuning<Input,
offset_size::_4,
primitive::yes,
input_size::_2,
may_alias::yes,
may_alias::no,
distinct_partitions::no>
{
// trp_0.ld_0.ipt_22.tpb_320.ns_560.dcid_5.l2w_640 1.205538 1.080520 1.201709 1.307692
Expand All @@ -1049,7 +1049,7 @@ struct sm100_tuning<Input,
offset_size::_4,
primitive::yes,
input_size::_4,
may_alias::yes,
may_alias::no,
distinct_partitions::no>
{
// trp_1.ld_0.ipt_19.tpb_608.ns_724.dcid_5.l2w_970 1.196592 0.982227 1.177984 1.310843
Expand All @@ -1067,7 +1067,7 @@ struct sm100_tuning<Input,
offset_size::_4,
primitive::yes,
input_size::_8,
may_alias::yes,
may_alias::no,
distinct_partitions::no>
{
// trp_1.ld_0.ipt_23.tpb_416.ns_1608.dcid_2.l2w_560 1.099752 0.977393 1.106477 1.259336
Expand All @@ -1085,7 +1085,7 @@ struct sm100_tuning<Input,
offset_size::_8,
primitive::yes,
input_size::_1,
may_alias::yes,
may_alias::no,
distinct_partitions::no>
{
// trp_0.ld_0.ipt_20.tpb_608.ns_1016.dcid_6.l2w_545 1.239144 1.002404 1.225460 1.444711
Expand All @@ -1103,7 +1103,7 @@ struct sm100_tuning<Input,
offset_size::_8,
primitive::yes,
input_size::_2,
may_alias::yes,
may_alias::no,
distinct_partitions::no>
{
// trp_1.ld_0.ipt_22.tpb_288.ns_124.dcid_2.l2w_690 1.202783 1.000000 1.183737 1.311755
Expand All @@ -1121,7 +1121,7 @@ struct sm100_tuning<Input,
offset_size::_8,
primitive::yes,
input_size::_4,
may_alias::yes,
may_alias::no,
distinct_partitions::no>
{
// trp_1.ld_0.ipt_19.tpb_608.ns_1884.dcid_6.l2w_950 1.250302 0.988124 1.225191 1.392931
Expand All @@ -1139,7 +1139,7 @@ struct sm100_tuning<Input,
offset_size::_8,
primitive::yes,
input_size::_8,
may_alias::yes,
may_alias::no,
distinct_partitions::no>
{
// trp_1.ld_0.ipt_23.tpb_416.ns_0.dcid_2.l2w_1200 1.156864 1.011990 1.152368 1.266667
Expand All @@ -1150,6 +1150,21 @@ struct sm100_tuning<Input,
using delay_constructor = exponential_backoff_constructor_t<0, 1200>;
};

// Explicit sm100 tuning for __int128_t inputs (primitive::no, input_size::_16) in the
// partition case without flags (flagged::no, keep_rejects::yes) when the offset
// classifies as offset_size::_8 and may_alias::no. The struct body is empty: every
// setting is inherited from the sm90 tuning named in the base clause, and the
// specialization applies to either value of DistinctPartitions.
//
// Rationale carried over from the original comment: cases were introduced for 64-bit
// (I64) offsets, and this leads to regressions unless this combination is defaulted
// explicitly.
template <distinct_partitions DistinctPartitions>
struct sm100_tuning<__int128_t,
flagged::no,
keep_rejects::yes,
offset_size::_8,
primitive::no,
input_size::_16,
may_alias::no,
DistinctPartitions>
: sm90_tuning<__int128_t, flagged::no, keep_rejects::yes, offset_size::_4, primitive::no, input_size::_16>
// NOTE(review): the original annotation on the base clause above reads "this base is
// wrong and leads to regressions". The base names offset_size::_4 while this
// specialization matches offset_size::_8 -- confirm against benchmarks which sm90
// tuning is intended before changing it.
{};

// partition::flagged
template <class Input>
struct sm100_tuning<Input,
Expand All @@ -1158,7 +1173,7 @@ struct sm100_tuning<Input,
offset_size::_4,
primitive::yes,
input_size::_1,
may_alias::yes,
may_alias::no,
distinct_partitions::yes>
{
// trp_0.ld_0.ipt_20.tpb_448.ns_964.dcid_7.l2w_385 1.111204 1.036205 1.111986 1.275210
Expand All @@ -1176,7 +1191,7 @@ struct sm100_tuning<Input,
offset_size::_4,
primitive::yes,
input_size::_2,
may_alias::yes,
may_alias::no,
distinct_partitions::yes>
{
// trp_0.ld_0.ipt_18.tpb_256.ns_300.dcid_6.l2w_820 1.107466 0.923750 1.126995 1.346591
Expand All @@ -1194,7 +1209,7 @@ struct sm100_tuning<Input,
offset_size::_4,
primitive::yes,
input_size::_4,
may_alias::yes,
may_alias::no,
distinct_partitions::yes>
{
// trp_0.ld_0.ipt_19.tpb_256.ns_1608.dcid_7.l2w_675 1.097548 0.964114 1.109189 1.283333
Expand All @@ -1212,7 +1227,7 @@ struct sm100_tuning<Input,
offset_size::_4,
primitive::yes,
input_size::_8,
may_alias::yes,
may_alias::no,
distinct_partitions::yes>
{
// trp_0.ld_0.ipt_21.tpb_384.ns_300.dcid_7.l2w_580 1.239128 1.019324 1.238373 1.347458
Expand All @@ -1230,7 +1245,7 @@ struct sm100_tuning<Input,
offset_size::_8,
primitive::yes,
input_size::_1,
may_alias::yes,
may_alias::no,
distinct_partitions::yes>
{
// trp_0.ld_1.ipt_20.tpb_448.ns_240.dcid_6.l2w_845 1.097180 0.990453 1.091667 1.452153
Expand All @@ -1248,7 +1263,7 @@ struct sm100_tuning<Input,
offset_size::_8,
primitive::yes,
input_size::_2,
may_alias::yes,
may_alias::no,
distinct_partitions::yes>
{
// trp_0.ld_0.ipt_14.tpb_320.ns_1428.dcid_7.l2w_830 1.380164 1.133333 1.367514 1.628793
Expand All @@ -1266,7 +1281,7 @@ struct sm100_tuning<Input,
offset_size::_8,
primitive::yes,
input_size::_4,
may_alias::yes,
may_alias::no,
distinct_partitions::yes>
{
// trp_0.ld_0.ipt_14.tpb_640.ns_1204.dcid_5.l2w_635 1.155209 1.000000 1.143742 1.380659
Expand All @@ -1284,7 +1299,7 @@ struct sm100_tuning<Input,
offset_size::_8,
primitive::yes,
input_size::_8,
may_alias::yes,
may_alias::no,
distinct_partitions::yes>
{
// trp_0.ld_0.ipt_19.tpb_384.ns_1016.dcid_7.l2w_875 1.227540 1.181818 1.223936 1.261954
Expand All @@ -1302,7 +1317,7 @@ struct sm100_tuning<Input,
offset_size::_4,
primitive::yes,
input_size::_1,
may_alias::yes,
may_alias::no,
distinct_partitions::no>
{
// trp_0.ld_0.ipt_24.tpb_256.ns_2024.dcid_5.l2w_835 1.146782 1.001841 1.149438 1.439904
Expand All @@ -1320,7 +1335,7 @@ struct sm100_tuning<Input,
offset_size::_4,
primitive::yes,
input_size::_2,
may_alias::yes,
may_alias::no,
distinct_partitions::no>
{
// trp_0.ld_0.ipt_18.tpb_256.ns_1832.dcid_5.l2w_590 1.128674 0.984403 1.150806 1.355932
Expand All @@ -1338,7 +1353,7 @@ struct sm100_tuning<Input,
offset_size::_4,
primitive::yes,
input_size::_4,
may_alias::yes,
may_alias::no,
distinct_partitions::no>
{
// trp_0.ld_0.ipt_11.tpb_448.ns_476.dcid_7.l2w_665 1.173664 1.035556 1.186114 1.393153
Expand All @@ -1356,7 +1371,7 @@ struct sm100_tuning<Input,
offset_size::_4,
primitive::yes,
input_size::_8,
may_alias::yes,
may_alias::no,
distinct_partitions::no>
{
// trp_0.ld_0.ipt_20.tpb_384.ns_1420.dcid_5.l2w_525 (39_new/2.db) 1.157326 1.110920 1.162458 1.259336
Expand All @@ -1374,7 +1389,7 @@ struct sm100_tuning<Input,
offset_size::_8,
primitive::yes,
input_size::_1,
may_alias::yes,
may_alias::no,
distinct_partitions::no>
{
// trp_0.ld_0.ipt_12.tpb_256.ns_0.dcid_5.l2w_850 1.150864 1.005760 1.157687 1.395833
Expand All @@ -1392,7 +1407,7 @@ struct sm100_tuning<Input,
offset_size::_8,
primitive::yes,
input_size::_2,
may_alias::yes,
may_alias::no,
distinct_partitions::no>
{
// trp_0.ld_0.ipt_12.tpb_256.ns_1552.dcid_7.l2w_730 1.374892 1.171831 1.360076 1.513390
Expand All @@ -1410,7 +1425,7 @@ struct sm100_tuning<Input,
offset_size::_8,
primitive::yes,
input_size::_4,
may_alias::yes,
may_alias::no,
distinct_partitions::no>
{
// trp_0.ld_0.ipt_14.tpb_352.ns_1444.dcid_5.l2w_655 1.183452 1.000000 1.177224 1.402083
Expand All @@ -1428,7 +1443,7 @@ struct sm100_tuning<Input,
offset_size::_8,
primitive::yes,
input_size::_8,
may_alias::yes,
may_alias::no,
distinct_partitions::no>
{
// trp_0.ld_0.ipt_11.tpb_512.ns_536.dcid_2.l2w_845 1.248969 1.184659 1.251631 1.360795
Expand Down Expand Up @@ -1564,11 +1579,15 @@ struct policy_hub
template <typename Tuning>
static auto select_agent_policy100(long) -> typename Policy900::SelectIfPolicyT;

// We use KeepRejects to differentiate between partition and select in the tunings.
// If KeepRejects is true we tune for partition otherwise we tune for select.
static constexpr offset_size offset_t = KeepRejects ? classify_offset_size<OffsetT>() : offset_size::_4;

using SelectIfPolicyT =
decltype(select_agent_policy100<sm100_tuning<InputT,
is_flagged<FlagT>(),
are_rejects_kept<KeepRejects>(),
classify_offset_size<OffsetT>(),
offset_t,
is_primitive<InputT>(),
classify_input_size<InputT>(),
should_alias<MayAlias>(),
Expand Down

0 comments on commit 8fbe0f9

Please sign in to comment.