Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
33 commits
Select commit Hold shift + click to select a range
9fccdf1
move scan helpers, reorganize, add comments
danhoeflinger May 27, 2025
b810440
Add endline EOF
danhoeflinger May 27, 2025
d0afa4d
typo
danhoeflinger May 27, 2025
8b7b590
move to existing header
danhoeflinger May 28, 2025
4a1bec3
abstract bounds on searching
danhoeflinger May 29, 2025
59a8bae
dealing with end bounds
danhoeflinger May 29, 2025
a7044fa
update and improve sycl traits and tests
danhoeflinger May 29, 2025
a7a2f6c
compiling
danhoeflinger Jun 2, 2025
04060ee
bugfixes (passing)
danhoeflinger Jun 3, 2025
003b304
avoid recalculation of base diags
danhoeflinger Jun 3, 2025
25b8a29
revert hardcoding reduce then scan
danhoeflinger Jun 3, 2025
e7b7fc7
shrink partitioning by factor of 2
danhoeflinger Jun 3, 2025
3cfab88
clang format
danhoeflinger Jun 20, 2025
f7b59e0
Revert "shrink partitioning by factor of 2"
danhoeflinger Jun 20, 2025
d37d552
remove extra space
danhoeflinger Jun 20, 2025
0809ec2
removing unused alias
danhoeflinger Jun 20, 2025
1348e77
removing unused variables
danhoeflinger Jun 20, 2025
50d28a3
formatting
danhoeflinger Jun 20, 2025
6cedb26
minor improvements
danhoeflinger Jun 23, 2025
feb5eb8
fix balanced path unit test
danhoeflinger Jun 23, 2025
a5d7930
offer both partitioning and no partitioning with 1 kernel
danhoeflinger Jun 27, 2025
5e995ff
address some feedback
danhoeflinger Jul 3, 2025
fa99560
remove stale TODO
danhoeflinger Jul 3, 2025
c95ef42
move trailing type to explicit type
danhoeflinger Jul 3, 2025
99dc03e
weighting size of element
danhoeflinger Jul 3, 2025
6f495c8
Revert "weighting size of element"
danhoeflinger Jul 3, 2025
a9c697f
improving workgroup size helper
danhoeflinger Jul 14, 2025
b3c9236
shrink type for subgroup size
danhoeflinger Jul 14, 2025
b79c8b7
adding clarifying comment
danhoeflinger Jul 14, 2025
a9dc022
formatting fixes
danhoeflinger Jul 14, 2025
4b61133
extra space
danhoeflinger Jul 14, 2025
fe32a7d
include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_the…
SergeyKopienko Jul 15, 2025
188c86a
include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_the…
SergeyKopienko Jul 15, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
41 changes: 30 additions & 11 deletions include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h
Original file line number Diff line number Diff line change
Expand Up @@ -655,7 +655,6 @@ __group_scan_fits_in_slm(const sycl::queue& __q, std::size_t __n, std::size_t __
return (__n <= __single_group_upper_limit && __max_slm_size >= __req_slm_size);
}


template <typename _ExecutionPolicy, typename _Range1, typename _Range2, typename _UnaryOperation, typename _InitType,
typename _BinaryOperation, typename _Inclusive>
__future<sycl::event, __result_and_scratch_storage<typename _InitType::__value_type>>
Expand Down Expand Up @@ -1059,34 +1058,54 @@ __parallel_set_reduce_then_scan(sycl::queue& __q, _Range1&& __rng1, _Range2&& __
using _TempData = __temp_data_array<__diagonal_spacing, _OutValueT>;
using _Size = oneapi::dpl::__internal::__difference_t<_Range3>;
using _ReduceOp = std::plus<_Size>;
using _BoundsProvider = oneapi::dpl::__par_backend_hetero::__get_bounds_partitioned;

using _GenReduceInput = oneapi::dpl::__par_backend_hetero::__gen_set_balanced_path<_SetOperation, _Compare>;
using _GenReduceInput =
oneapi::dpl::__par_backend_hetero::__gen_set_balanced_path<_SetOperation, _BoundsProvider, _Compare>;
using _GenScanInput =
oneapi::dpl::__par_backend_hetero::__gen_set_op_from_known_balanced_path<_SetOperation, _TempData, _Compare>;
using _ScanInputTransform = oneapi::dpl::__par_backend_hetero::__get_zeroth_element;
using _WriteOp = oneapi::dpl::__par_backend_hetero::__write_multiple_to_id<oneapi::dpl::__internal::__pstl_assign>;

const std::int32_t __num_diagonals =
oneapi::dpl::__internal::__dpl_ceiling_div(__rng1.size() + __rng2.size(), __diagonal_spacing);

const std::uint32_t __work_group_size = __get_reduce_then_scan_workgroup_size(__q);
const std::size_t __partition_threshold = 2 * 1024 * 1024;
const std::size_t __total_size = __rng1.size() + __rng2.size();
// Should be safe to use the type of the range size as the temporary type. Diagonal index will fit in the positive
// portion of the range so star flag can use sign bit.
using _TemporaryType = decltype(__rng1.size());
//TODO: limit to diagonals per block, and only write to a block based index of temporary data
oneapi::dpl::__par_backend_hetero::__buffer<_TemporaryType> __temp_diags(__num_diagonals);

constexpr std::uint32_t __average_input_ele_size = (sizeof(_In1ValueT) + sizeof(_In2ValueT)) / 2;
// Partition into blocks of half SLM size
const std::size_t __partition_size =
__q.get_device().template get_info<sycl::info::device::local_mem_size>() / (__average_input_ele_size * 2);

_GenReduceInput __gen_reduce_input{_SetOperation{}, __diagonal_spacing,
_BoundsProvider{__diagonal_spacing, __partition_size, __partition_threshold},
__comp};

constexpr std::uint32_t __bytes_per_work_item_iter =
((sizeof(_In1ValueT) + sizeof(_In2ValueT)) / 2) * (__diagonal_spacing + 1) + sizeof(_TemporaryType);
__average_input_ele_size * (__diagonal_spacing + 1) + sizeof(_TemporaryType);

auto __in_rng = oneapi::dpl::__ranges::make_zip_view(
std::forward<_Range1>(__rng1), std::forward<_Range2>(__rng2),
oneapi::dpl::__ranges::all_view<_TemporaryType, __par_backend_hetero::access_mode::read_write>(
__temp_diags.get_buffer()));
sycl::event __partition_event{};

if (__total_size >= __partition_threshold)
{
__partition_event =
__parallel_set_balanced_path_partition<_CustomName>(__q, __in_rng, __num_diagonals, __gen_reduce_input);
}
return __parallel_transform_reduce_then_scan<__bytes_per_work_item_iter, _CustomName>(
__q, __num_diagonals,
oneapi::dpl::__ranges::make_zip_view(
std::forward<_Range1>(__rng1), std::forward<_Range2>(__rng2),
oneapi::dpl::__ranges::all_view<_TemporaryType, __par_backend_hetero::access_mode::read_write>(
__temp_diags.get_buffer())),
std::forward<_Range3>(__result), _GenReduceInput{_SetOperation{}, __diagonal_spacing, __comp}, _ReduceOp{},
__q, __num_diagonals, __in_rng, std::forward<_Range3>(__result), __gen_reduce_input, _ReduceOp{},
_GenScanInput{_SetOperation{}, __diagonal_spacing, __comp}, _ScanInputTransform{}, _WriteOp{},
oneapi::dpl::unseq_backend::__no_init_value<_Size>{}, /*_Inclusive=*/std::true_type{},
/*__is_unique_pattern=*/std::false_type{});
/*__is_unique_pattern=*/std::false_type{}, __work_group_size, __partition_event);
}

template <typename _CustomName, typename _Range1, typename _Range2, typename _Range3, typename _Compare,
Expand Down
Loading