Skip to content
Merged
Show file tree
Hide file tree
Changes from 44 commits
Commits
Show all changes
50 commits
Select commit Hold shift + click to select a range
0f1cb92
move scan helpers, reorganize, add comments
danhoeflinger May 27, 2025
670d85d
Add endline EOF
danhoeflinger May 27, 2025
61afae8
typo
danhoeflinger May 27, 2025
e6eca97
move to existing header
danhoeflinger May 28, 2025
a3e871c
abstract bounds on searching
danhoeflinger May 29, 2025
28ab243
dealing with end bounds
danhoeflinger May 29, 2025
0e884b3
update and improve sycl traits and tests
danhoeflinger May 29, 2025
a14ff74
compiling
danhoeflinger Jun 2, 2025
d22a0a6
bugfixes (passing)
danhoeflinger Jun 3, 2025
7cd2986
avoid recalculation of base diags
danhoeflinger Jun 3, 2025
cf0d209
revert hardcoding reduce then scan
danhoeflinger Jun 3, 2025
d0d0b55
shrink partitioning by factor of 2
danhoeflinger Jun 3, 2025
a22d5a1
clang format
danhoeflinger Jun 20, 2025
0be9a86
Revert "shrink partitioning by factor of 2"
danhoeflinger Jun 20, 2025
4702187
remove extra space
danhoeflinger Jun 20, 2025
ce74847
removing unused alias
danhoeflinger Jun 20, 2025
688be46
removing unused variables
danhoeflinger Jun 20, 2025
90d9927
formatting
danhoeflinger Jun 20, 2025
678c9e1
minor improvements
danhoeflinger Jun 23, 2025
0b66512
fix balanced path unit test
danhoeflinger Jun 23, 2025
e9c0a74
offer both partitioning and no partitioning with 1 kernel
danhoeflinger Jun 27, 2025
0bb1843
address some feedback
danhoeflinger Jul 3, 2025
0df2c61
remove stale TODO
danhoeflinger Jul 3, 2025
5ac22c9
move trailing type to explicit type
danhoeflinger Jul 3, 2025
d1b5513
weighting size of element
danhoeflinger Jul 3, 2025
4328208
Revert "weighting size of element"
danhoeflinger Jul 3, 2025
9191560
improving workgroup size helper
danhoeflinger Jul 14, 2025
193902a
shrink type for subgroup size
danhoeflinger Jul 14, 2025
ad27258
adding clarifying comment
danhoeflinger Jul 14, 2025
b804d9b
formatting fixes
danhoeflinger Jul 14, 2025
a86075f
extra space
danhoeflinger Jul 14, 2025
e1fdb87
adding clarifying comment
danhoeflinger Aug 15, 2025
d9f8319
revert workgroup size change
danhoeflinger Aug 15, 2025
ab3dbe7
better revert of workgroup size changes
danhoeflinger Aug 15, 2025
2cc4052
rename range for clarity
danhoeflinger Aug 15, 2025
77101bb
adding sycl device copyable submitter specialization
danhoeflinger Aug 21, 2025
1fb82a0
auto -> explicit types, branchless sign bit set
danhoeflinger Aug 29, 2025
9f4cf0f
adding numeric include
danhoeflinger Aug 29, 2025
ab17a5b
adding some comments
danhoeflinger Aug 29, 2025
3b4eed1
returning mistakenly removed comma
danhoeflinger Aug 29, 2025
031d995
fixing branchless negation
danhoeflinger Aug 29, 2025
b86cf8a
add in wrappers for broken submitter device copyable
danhoeflinger Sep 2, 2025
fa3b933
fix possible underflow
danhoeflinger Sep 2, 2025
3e3e9b8
better handling of signed types
danhoeflinger Sep 2, 2025
e8b977e
remove todo
danhoeflinger Sep 3, 2025
e2e785f
using fully qualified name
danhoeflinger Sep 3, 2025
0378d17
remove braces
danhoeflinger Sep 3, 2025
ed89965
Merge branch 'main' into dev/dhoeflin/partition_set_algs
SergeyKopienko Sep 3, 2025
8d3964e
adding const ref for const ranges
danhoeflinger Sep 3, 2025
1faa714
formatting
danhoeflinger Sep 3, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
46 changes: 33 additions & 13 deletions include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h
Original file line number Diff line number Diff line change
Expand Up @@ -658,7 +658,6 @@ __group_scan_fits_in_slm(const sycl::queue& __q, std::size_t __n, std::size_t __
return (__n <= __single_group_upper_limit && __max_slm_size >= __req_slm_size);
}


template <typename _ExecutionPolicy, typename _Range1, typename _Range2, typename _UnaryOperation, typename _InitType,
typename _BinaryOperation, typename _Inclusive>
__future<sycl::event, __result_and_scratch_storage<typename _InitType::__value_type>>
Expand Down Expand Up @@ -1062,34 +1061,55 @@ __parallel_set_reduce_then_scan(sycl::queue& __q, _Range1&& __rng1, _Range2&& __
using _TempData = __temp_data_array<__diagonal_spacing, _OutValueT>;
using _Size = oneapi::dpl::__internal::__difference_t<_Range3>;
using _ReduceOp = std::plus<_Size>;
using _BoundsProvider = oneapi::dpl::__par_backend_hetero::__get_bounds_partitioned;

using _GenReduceInput = oneapi::dpl::__par_backend_hetero::__gen_set_balanced_path<_SetOperation, _Compare>;
using _GenReduceInput =
oneapi::dpl::__par_backend_hetero::__gen_set_balanced_path<_SetOperation, _BoundsProvider, _Compare>;
using _GenScanInput =
oneapi::dpl::__par_backend_hetero::__gen_set_op_from_known_balanced_path<_SetOperation, _TempData, _Compare>;
using _ScanInputTransform = oneapi::dpl::__par_backend_hetero::__get_zeroth_element;
using _WriteOp = oneapi::dpl::__par_backend_hetero::__write_multiple_to_id<oneapi::dpl::__internal::__pstl_assign>;

const std::int32_t __num_diagonals =
oneapi::dpl::__internal::__dpl_ceiling_div(__rng1.size() + __rng2.size(), __diagonal_spacing);

const std::size_t __partition_threshold = 2 * 1024 * 1024;
const std::size_t __total_size = __rng1.size() + __rng2.size();
// Should be safe to use the type of the range size as the temporary type. Diagonal index will fit in the positive
// portion of the range so star flag can use sign bit.
using _TemporaryType = decltype(__rng1.size());
using _TemporaryType = std::make_signed_t<decltype(__rng1.size())>;
//TODO: limit to diagonals per block, and only write to a block based index of temporary data
oneapi::dpl::__par_backend_hetero::__buffer<_TemporaryType> __temp_diags(__num_diagonals);

constexpr std::uint32_t __average_input_ele_size = (sizeof(_In1ValueT) + sizeof(_In2ValueT)) / 2;

// Partition into blocks based on SLM size. We want this to fit within L1 cache, and SLM is a related concept and
// can be queried based upon the device. Performance is not sensitive to exact size in practice.
const std::size_t __partition_size =
__q.get_device().template get_info<sycl::info::device::local_mem_size>() / (__average_input_ele_size * 2);

_GenReduceInput __gen_reduce_input{_SetOperation{}, __diagonal_spacing,
_BoundsProvider{__diagonal_spacing, __partition_size, __partition_threshold},
__comp};

constexpr std::uint32_t __bytes_per_work_item_iter =
((sizeof(_In1ValueT) + sizeof(_In2ValueT)) / 2) * (__diagonal_spacing + 1) + sizeof(_TemporaryType);
__average_input_ele_size * (__diagonal_spacing + 1) + sizeof(_TemporaryType);

auto __in_in_tmp_rng = oneapi::dpl::__ranges::make_zip_view(
std::forward<_Range1>(__rng1), std::forward<_Range2>(__rng2),
oneapi::dpl::__ranges::all_view<_TemporaryType, __par_backend_hetero::access_mode::read_write>(
__temp_diags.get_buffer()));
sycl::event __partition_event{};

if (__total_size >= __partition_threshold)
{
__partition_event = __parallel_set_balanced_path_partition<_CustomName>(__q, __in_in_tmp_rng, __num_diagonals,
__gen_reduce_input);
}
return __parallel_transform_reduce_then_scan<__bytes_per_work_item_iter, _CustomName>(
__q, __num_diagonals,
oneapi::dpl::__ranges::make_zip_view(
std::forward<_Range1>(__rng1), std::forward<_Range2>(__rng2),
oneapi::dpl::__ranges::all_view<_TemporaryType, __par_backend_hetero::access_mode::read_write>(
__temp_diags.get_buffer())),
std::forward<_Range3>(__result), _GenReduceInput{_SetOperation{}, __diagonal_spacing, __comp}, _ReduceOp{},
_GenScanInput{_SetOperation{}, __diagonal_spacing, __comp}, _ScanInputTransform{}, _WriteOp{},
__q, __num_diagonals, std::move(__in_in_tmp_rng), std::forward<_Range3>(__result), __gen_reduce_input,
_ReduceOp{}, _GenScanInput{_SetOperation{}, __diagonal_spacing, __comp}, _ScanInputTransform{}, _WriteOp{},
oneapi::dpl::unseq_backend::__no_init_value<_Size>{}, /*_Inclusive=*/std::true_type{},
/*__is_unique_pattern=*/std::false_type{});
/*__is_unique_pattern=*/std::false_type{}, __partition_event);
}

template <typename _CustomName, typename _Range1, typename _Range2, typename _Range3, typename _Compare,
Expand Down
Loading
Loading