@@ -916,23 +916,24 @@ struct sm100_tuning<Input,
916
916
using delay_constructor = exponential_backon_jitter_window_constructor_t <716 , 570 >;
917
917
};
918
918
919
- template <class Input >
920
- struct sm100_tuning <Input,
921
- flagged::no,
922
- keep_rejects::yes,
923
- offset_size::_4,
924
- primitive::yes,
925
- input_size::_8,
926
- may_alias::no,
927
- distinct_partitions::yes>
928
- {
929
- // trp_1.ld_0.ipt_20.tpb_416.ns_1672.dcid_7.l2w_1050 1.086221 0.977775 1.090731 1.257618
930
- static constexpr int nominal_4b_items = 20 ;
931
- static constexpr int threads = 416 ;
932
- static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_WARP_TRANSPOSE;
933
- static constexpr CacheLoadModifier load_modifier = LOAD_DEFAULT;
934
- using delay_constructor = exponential_backon_constructor_t <1672 , 1050 >;
935
- };
919
+ // TODO(gonidelis): This tuning regresses for large input sizes, so it is disabled below. Find a better tuning.
920
+ // template <class Input>
921
+ // struct sm100_tuning<Input,
922
+ // flagged::no,
923
+ // keep_rejects::yes,
924
+ // offset_size::_4,
925
+ // primitive::yes,
926
+ // input_size::_8,
927
+ // may_alias::no,
928
+ // distinct_partitions::yes>
929
+ // {
930
+ // // trp_1.ld_0.ipt_20.tpb_416.ns_1672.dcid_7.l2w_1050 1.086221 0.977775 1.090731 1.257618
931
+ // static constexpr int nominal_4b_items = 20;
932
+ // static constexpr int threads = 416;
933
+ // static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_WARP_TRANSPOSE;
934
+ // static constexpr CacheLoadModifier load_modifier = LOAD_DEFAULT;
935
+ // using delay_constructor = exponential_backon_constructor_t<1672, 1050>;
936
+ // };
936
937
937
938
template <class Input >
938
939
struct sm100_tuning <Input,
0 commit comments