Skip to content

Commit 03da50f

Browse files
branch-4.0: [fix](iceberg) only enable dynamic partition pruning for identity partitions in Iceberg #58033 (#58182)
Cherry-picked from #58033 Co-authored-by: Socrates <[email protected]>
1 parent 436a73b commit 03da50f

File tree

6 files changed

+350
-5
lines changed

6 files changed

+350
-5
lines changed

docker/thirdparties/docker-compose/iceberg/scripts/create_preinstalled_scripts/iceberg/run19.sql

Lines changed: 87 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -364,6 +364,93 @@ VALUES (1, 'z', 0.00),
364364
(9, 'big', 9999999.99),
365365
(10, 'null', NULL);
366366

367+
-- =============================================
368+
-- Time transform partition coverage (day/year/month/hour)
369+
-- =============================================
370+
371+
-- Day transform by TIMESTAMP
372+
CREATE TABLE day_partitioned (
373+
id BIGINT,
374+
name STRING,
375+
ts TIMESTAMP
376+
) USING ICEBERG PARTITIONED BY (day(ts));
377+
378+
INSERT INTO day_partitioned VALUES
379+
(1, 'd1', TIMESTAMP '2024-01-15 08:00:00'),
380+
(2, 'd2', TIMESTAMP '2024-01-15 12:30:00'),
381+
(3, 'd3', TIMESTAMP '2024-01-15 23:59:59'),
382+
(4, 'd4', TIMESTAMP '2024-01-16 00:00:00'),
383+
(5, 'd5', TIMESTAMP '2024-01-16 10:00:00'),
384+
(6, 'd6', TIMESTAMP '2024-02-29 12:00:00'),
385+
(7, 'd7', TIMESTAMP '2024-12-31 23:59:59'),
386+
(8, 'null', NULL);
387+
388+
-- Year transform by TIMESTAMP
389+
CREATE TABLE year_partitioned (
390+
id BIGINT,
391+
name STRING,
392+
ts TIMESTAMP
393+
) USING ICEBERG PARTITIONED BY (year(ts));
394+
395+
INSERT INTO year_partitioned VALUES
396+
(1, 'y1', TIMESTAMP '2023-01-01 00:00:00'),
397+
(2, 'y2', TIMESTAMP '2023-06-15 12:00:00'),
398+
(3, 'y3', TIMESTAMP '2023-12-31 23:59:59'),
399+
(4, 'y4', TIMESTAMP '2024-01-01 00:00:00'),
400+
(5, 'y5', TIMESTAMP '2024-06-15 12:00:00'),
401+
(6, 'y6', TIMESTAMP '2024-12-31 23:59:59'),
402+
(7, 'y7', TIMESTAMP '2025-01-01 00:00:00'),
403+
(8, 'null', NULL);
404+
405+
-- Month transform by TIMESTAMP
406+
CREATE TABLE month_partitioned (
407+
id BIGINT,
408+
name STRING,
409+
ts TIMESTAMP
410+
) USING ICEBERG PARTITIONED BY (month(ts));
411+
412+
INSERT INTO month_partitioned VALUES
413+
(1, 'm1', TIMESTAMP '2024-01-01 00:00:00'),
414+
(2, 'm2', TIMESTAMP '2024-01-15 12:00:00'),
415+
(3, 'm3', TIMESTAMP '2024-01-31 23:59:59'),
416+
(4, 'm4', TIMESTAMP '2024-02-01 00:00:00'),
417+
(5, 'm5', TIMESTAMP '2024-02-15 12:00:00'),
418+
(6, 'm6', TIMESTAMP '2024-02-29 23:59:59'),
419+
(7, 'm7', TIMESTAMP '2024-12-01 00:00:00'),
420+
(8, 'm8', TIMESTAMP '2024-12-31 23:59:59'),
421+
(9, 'null', NULL);
422+
423+
-- Hour transform by TIMESTAMP
424+
CREATE TABLE hour_partitioned (
425+
id BIGINT,
426+
name STRING,
427+
ts TIMESTAMP
428+
) USING ICEBERG PARTITIONED BY (hour(ts));
429+
430+
INSERT INTO hour_partitioned VALUES
431+
(1, 'h1', TIMESTAMP '2024-01-15 10:00:00'),
432+
(2, 'h2', TIMESTAMP '2024-01-15 10:30:00'),
433+
(3, 'h3', TIMESTAMP '2024-01-15 10:59:59'),
434+
(4, 'h4', TIMESTAMP '2024-01-15 11:00:00'),
435+
(5, 'h5', TIMESTAMP '2024-01-15 11:30:00'),
436+
(6, 'h6', TIMESTAMP '2024-01-15 23:00:00'),
437+
(7, 'h7', TIMESTAMP '2024-01-15 23:59:59'),
438+
(8, 'h8', TIMESTAMP '2024-01-16 00:00:00'),
439+
(9, 'null', NULL);
440+
441+
-- Create _copy tables for write testing
442+
CREATE TABLE day_partitioned_copy AS
443+
SELECT * FROM day_partitioned;
444+
445+
CREATE TABLE year_partitioned_copy AS
446+
SELECT * FROM year_partitioned;
447+
448+
CREATE TABLE month_partitioned_copy AS
449+
SELECT * FROM month_partitioned;
450+
451+
CREATE TABLE hour_partitioned_copy AS
452+
SELECT * FROM hour_partitioned;
453+
367454
-- create table for testing query partitions with snapshot has been expired
368455

369456
CREATE TABLE test_partitions_with_expired_snapshot (

fe/fe-core/src/main/java/org/apache/doris/datasource/iceberg/IcebergUtils.java

Lines changed: 37 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -609,11 +609,47 @@ public static Type icebergTypeToDorisType(org.apache.iceberg.types.Type type) {
609609
}
610610
}
611611

612-
public static Map<String, String> getPartitionInfoMap(PartitionData partitionData, String timeZone) {
612+
/**
613+
* Get partition info map for identity partitions only, considering partition
614+
* evolution.
615+
* For non-identity partitions (e.g., day, bucket, truncate), returns null to
616+
* skip
617+
* dynamic partition pruning.
618+
*
619+
* @param partitionData The partition data from the file
620+
* @param partitionSpec The partition spec corresponding to the file's specId
621+
* (required)
622+
* @param timeZone The time zone for timestamp serialization
623+
* @return Map of partition field name to partition value string, or null if
624+
* there are non-identity partitions
625+
*/
626+
public static Map<String, String> getPartitionInfoMap(PartitionData partitionData, PartitionSpec partitionSpec,
627+
String timeZone) {
613628
Map<String, String> partitionInfoMap = new HashMap<>();
614629
List<NestedField> fields = partitionData.getPartitionType().asNestedType().fields();
630+
631+
// Check if all partition fields are identity transform
632+
// If any field is not identity, return null to skip dynamic partition pruning
633+
List<PartitionField> partitionFields = partitionSpec.fields();
634+
Preconditions.checkArgument(fields.size() == partitionFields.size(),
635+
"PartitionData fields size does not match PartitionSpec fields size");
636+
615637
for (int i = 0; i < fields.size(); i++) {
616638
NestedField field = fields.get(i);
639+
PartitionField partitionField = partitionFields.get(i);
640+
641+
// Only process identity transform partitions
642+
// For other transforms (day, bucket, truncate, etc.), skip dynamic partition
643+
// pruning
644+
if (!partitionField.transform().isIdentity()) {
645+
if (LOG.isDebugEnabled()) {
646+
LOG.debug(
647+
"Skip dynamic partition pruning for non-identity partition field: {} with transform: {}",
648+
field.name(), partitionField.transform().toString());
649+
}
650+
return null;
651+
}
652+
617653
Object value = partitionData.get(i);
618654
try {
619655
String partitionString = serializePartitionValue(field.type(), value, timeZone);

fe/fe-core/src/main/java/org/apache/doris/datasource/iceberg/source/IcebergScanNode.java

Lines changed: 14 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -64,6 +64,7 @@
6464
import org.apache.iceberg.ManifestFile;
6565
import org.apache.iceberg.MetadataColumns;
6666
import org.apache.iceberg.PartitionData;
67+
import org.apache.iceberg.PartitionSpec;
6768
import org.apache.iceberg.Snapshot;
6869
import org.apache.iceberg.Table;
6970
import org.apache.iceberg.TableScan;
@@ -363,10 +364,19 @@ private Split createIcebergSplit(FileScanTask fileScanTask) {
363364
if (isPartitionedTable) {
364365
PartitionData partitionData = (PartitionData) fileScanTask.file().partition();
365366
if (sessionVariable.isEnableRuntimeFilterPartitionPrune()) {
366-
// If the partition data is not in the map, we need to calculate the partition
367-
Map<String, String> partitionInfoMap = partitionMapInfos.computeIfAbsent(partitionData, k -> {
368-
return IcebergUtils.getPartitionInfoMap(partitionData, sessionVariable.getTimeZone());
369-
});
367+
// Get specId and corresponding PartitionSpec to handle partition evolution
368+
int specId = fileScanTask.file().specId();
369+
PartitionSpec partitionSpec = icebergTable.specs().get(specId);
370+
371+
Preconditions.checkNotNull(partitionSpec, "Partition spec with specId %s not found for table %s",
372+
specId, icebergTable.name());
373+
Map<String, String> partitionInfoMap = partitionMapInfos.computeIfAbsent(
374+
partitionData, k -> {
375+
return IcebergUtils.getPartitionInfoMap(partitionData, partitionSpec,
376+
sessionVariable.getTimeZone());
377+
});
378+
// Only set partition values if all partitions are identity transform
379+
// For non-identity partitions, getPartitionInfoMap returns null to skip dynamic partition pruning
370380
if (partitionInfoMap != null) {
371381
split.setIcebergPartitionValues(partitionInfoMap);
372382
}

regression-test/data/external_table_p0/iceberg/test_iceberg_runtime_filter_partition_pruning_transform.out

Lines changed: 108 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -71,6 +71,60 @@
7171
-- !trunc_decimal_in --
7272
2
7373

74+
-- !day_eq --
75+
1
76+
77+
-- !day_in --
78+
2
79+
80+
-- !day_range_ge --
81+
7
82+
83+
-- !day_range_lt --
84+
3
85+
86+
-- !day_range_between --
87+
3
88+
89+
-- !day_range_multi --
90+
5
91+
92+
-- !year_eq --
93+
1
94+
95+
-- !year_in --
96+
2
97+
98+
-- !year_range --
99+
3
100+
101+
-- !year_range_cross --
102+
3
103+
104+
-- !month_eq --
105+
1
106+
107+
-- !month_in --
108+
2
109+
110+
-- !month_range --
111+
3
112+
113+
-- !month_range_multi --
114+
6
115+
116+
-- !hour_eq --
117+
1
118+
119+
-- !hour_in --
120+
2
121+
122+
-- !hour_range --
123+
3
124+
125+
-- !hour_range_multi --
126+
5
127+
74128
-- !bucket_int_eq --
75129
1
76130

@@ -143,3 +197,57 @@
143197
-- !trunc_decimal_in --
144198
2
145199

200+
-- !day_eq --
201+
1
202+
203+
-- !day_in --
204+
2
205+
206+
-- !day_range_ge --
207+
7
208+
209+
-- !day_range_lt --
210+
3
211+
212+
-- !day_range_between --
213+
3
214+
215+
-- !day_range_multi --
216+
5
217+
218+
-- !year_eq --
219+
1
220+
221+
-- !year_in --
222+
2
223+
224+
-- !year_range --
225+
3
226+
227+
-- !year_range_cross --
228+
3
229+
230+
-- !month_eq --
231+
1
232+
233+
-- !month_in --
234+
2
235+
236+
-- !month_range --
237+
3
238+
239+
-- !month_range_multi --
240+
6
241+
242+
-- !hour_eq --
243+
1
244+
245+
-- !hour_in --
246+
2
247+
248+
-- !hour_range --
249+
3
250+
251+
-- !hour_range_multi --
252+
5
253+

regression-test/suites/external_table_p0/iceberg/test_iceberg_runtime_filter_partition_pruning_transform.groovy

Lines changed: 98 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -201,6 +201,104 @@ suite("test_iceberg_runtime_filter_partition_pruning_transform", "p0,external,do
201201
group by partition_key having count(*) > 0
202202
order by partition_key desc limit 2);
203203
"""
204+
205+
// Time transform partitions (day/year/month/hour)
206+
// Note: Bucket and Truncate transforms are not supported for runtime filter partition pruning,
207+
// but Day/Year/Month/Hour transforms are supported.
208+
209+
// Day transform tests
210+
qt_day_eq """
211+
select count(*) from day_partitioned where ts =
212+
(select ts from day_partitioned
213+
group by ts having count(*) > 0
214+
order by ts desc limit 1);
215+
"""
216+
qt_day_in """
217+
select count(*) from day_partitioned where ts in
218+
(select ts from day_partitioned
219+
group by ts having count(*) > 0
220+
order by ts desc limit 2);
221+
"""
222+
qt_day_range_ge """
223+
select count(*) from day_partitioned where ts >= '2024-01-15 00:00:00';
224+
"""
225+
qt_day_range_lt """
226+
select count(*) from day_partitioned where ts < '2024-01-16 00:00:00';
227+
"""
228+
qt_day_range_between """
229+
select count(*) from day_partitioned where ts >= '2024-01-15 00:00:00'
230+
and ts < '2024-01-16 00:00:00';
231+
"""
232+
qt_day_range_multi """
233+
select count(*) from day_partitioned where ts >= '2024-01-15 00:00:00'
234+
and ts < '2024-01-17 00:00:00';
235+
"""
236+
237+
// Year transform tests
238+
qt_year_eq """
239+
select count(*) from year_partitioned where ts =
240+
(select ts from year_partitioned
241+
group by ts having count(*) > 0
242+
order by ts desc limit 1);
243+
"""
244+
qt_year_in """
245+
select count(*) from year_partitioned where ts in
246+
(select ts from year_partitioned
247+
group by ts having count(*) > 0
248+
order by ts desc limit 2);
249+
"""
250+
qt_year_range """
251+
select count(*) from year_partitioned where ts >= '2024-01-01 00:00:00'
252+
and ts < '2025-01-01 00:00:00';
253+
"""
254+
qt_year_range_cross """
255+
select count(*) from year_partitioned where ts >= '2023-06-01 00:00:00'
256+
and ts < '2024-06-01 00:00:00';
257+
"""
258+
259+
// Month transform tests
260+
qt_month_eq """
261+
select count(*) from month_partitioned where ts =
262+
(select ts from month_partitioned
263+
group by ts having count(*) > 0
264+
order by ts desc limit 1);
265+
"""
266+
qt_month_in """
267+
select count(*) from month_partitioned where ts in
268+
(select ts from month_partitioned
269+
group by ts having count(*) > 0
270+
order by ts desc limit 2);
271+
"""
272+
qt_month_range """
273+
select count(*) from month_partitioned where ts >= '2024-01-01 00:00:00'
274+
and ts < '2024-02-01 00:00:00';
275+
"""
276+
qt_month_range_multi """
277+
select count(*) from month_partitioned where ts >= '2024-01-01 00:00:00'
278+
and ts < '2024-03-01 00:00:00';
279+
"""
280+
281+
// Hour transform tests
282+
qt_hour_eq """
283+
select count(*) from hour_partitioned where ts =
284+
(select ts from hour_partitioned
285+
group by ts having count(*) > 0
286+
order by ts desc limit 1);
287+
"""
288+
qt_hour_in """
289+
select count(*) from hour_partitioned where ts in
290+
(select ts from hour_partitioned
291+
group by ts having count(*) > 0
292+
order by ts desc limit 2);
293+
"""
294+
qt_hour_range """
295+
select count(*) from hour_partitioned where ts >= '2024-01-15 10:00:00'
296+
and ts < '2024-01-15 11:00:00';
297+
"""
298+
qt_hour_range_multi """
299+
select count(*) from hour_partitioned where ts >= '2024-01-15 10:00:00'
300+
and ts < '2024-01-15 12:00:00';
301+
"""
204302
}
205303
try {
206304
sql """ set time_zone = 'Asia/Shanghai'; """

0 commit comments

Comments
 (0)