Skip to content

Commit 24937de

Browse files
committed
[fix](datatype)Fix unaligned memory access in read_column_from_arrow
1 parent d990c5e commit 24937de

File tree

6 files changed: 46 additions and 12 deletions

6 files changed: 46 additions and 12 deletions

be/src/vec/data_types/serde/data_type_array_serde.cpp

Lines changed: 9 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -326,11 +326,17 @@ Status DataTypeArraySerDe::read_column_from_arrow(IColumn& column, const arrow::
326326
auto arrow_offsets_array = concrete_array->offsets();
327327
auto* arrow_offsets = dynamic_cast<arrow::Int32Array*>(arrow_offsets_array.get());
328328
auto prev_size = offsets_data.back();
329-
auto arrow_nested_start_offset = arrow_offsets->Value(start);
330-
auto arrow_nested_end_offset = arrow_offsets->Value(end);
329+
int32_t arrow_nested_start_offset;
330+
int32_t arrow_nested_end_offset;
331+
const auto* offsets_raw_data = arrow_offsets->raw_values();
332+
memcpy(&arrow_nested_start_offset, offsets_raw_data + start, sizeof(int32_t));
333+
memcpy(&arrow_nested_end_offset, offsets_raw_data + end, sizeof(int32_t));
334+
331335
for (auto i = start + 1; i < end + 1; ++i) {
336+
int32_t current_offset;
337+
memcpy(&current_offset, offsets_raw_data + i, sizeof(int32_t));
332338
// convert to doris offset, start from offsets.back()
333-
offsets_data.emplace_back(prev_size + arrow_offsets->Value(i) - arrow_nested_start_offset);
339+
offsets_data.emplace_back(prev_size + current_offset - arrow_nested_start_offset);
334340
}
335341
return nested_serde->read_column_from_arrow(
336342
column_array.get_data(), concrete_array->values().get(), arrow_nested_start_offset,

be/src/vec/data_types/serde/data_type_datetimev2_serde.cpp

Lines changed: 5 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -398,7 +398,11 @@ Status DataTypeDateTimeV2SerDe::read_column_from_arrow(IColumn& column,
398398
}
399399
}
400400
for (auto value_i = start; value_i < end; ++value_i) {
401-
auto utc_epoch = static_cast<UInt64>(concrete_array->Value(value_i));
401+
int64_t utc_epoch_raw;
402+
const auto* raw_data_ptr = concrete_array->raw_values() + value_i;
403+
memcpy(&utc_epoch_raw, raw_data_ptr, sizeof(int64_t));
404+
405+
auto utc_epoch = static_cast<UInt64>(utc_epoch_raw);
402406

403407
DateV2Value<DateTimeV2ValueType> v;
404408
// convert second

be/src/vec/data_types/serde/data_type_datev2_serde.cpp

Lines changed: 5 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -115,8 +115,12 @@ Status DataTypeDateV2SerDe::read_column_from_arrow(IColumn& column, const arrow:
115115
auto& col_data = static_cast<ColumnDateV2&>(column).get_data();
116116
const auto* concrete_array = dynamic_cast<const arrow::Date32Array*>(arrow_array);
117117
for (auto value_i = start; value_i < end; ++value_i) {
118+
int32_t date_value;
119+
const auto* raw_data_ptr = concrete_array->raw_values() + value_i;
120+
memcpy(&date_value, raw_data_ptr, sizeof(int32_t));
121+
118122
DateV2Value<DateV2ValueType> v;
119-
v.get_date_from_daynr(concrete_array->Value(value_i) + date_threshold);
123+
v.get_date_from_daynr(date_value + date_threshold);
120124
col_data.emplace_back(binary_cast<DateV2Value<DateV2ValueType>, UInt32>(v));
121125
}
122126
return Status::OK();

be/src/vec/data_types/serde/data_type_decimal_serde.cpp

Lines changed: 4 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -337,8 +337,10 @@ Status DataTypeDecimalSerDe<T>::read_column_from_arrow(IColumn& column,
337337
} else if constexpr (T == TYPE_DECIMAL32 || T == TYPE_DECIMAL64 || T == TYPE_DECIMAL128I) {
338338
const auto* concrete_array = dynamic_cast<const arrow::DecimalArray*>(arrow_array);
339339
for (auto value_i = start; value_i < end; ++value_i) {
340-
column_data.emplace_back(
341-
*reinterpret_cast<const FieldType*>(concrete_array->Value(value_i)));
340+
const auto* value = concrete_array->Value(value_i);
341+
FieldType decimal_value;
342+
memcpy(&decimal_value, value, sizeof(FieldType));
343+
column_data.emplace_back(decimal_value);
342344
}
343345
} else if constexpr (T == TYPE_DECIMAL256) {
344346
const auto* concrete_array = dynamic_cast<const arrow::Decimal256Array*>(arrow_array);

be/src/vec/data_types/serde/data_type_number_serde.cpp

Lines changed: 9 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -221,8 +221,15 @@ Status DataTypeNumberSerDe<T>::read_column_from_arrow(IColumn& column,
221221

222222
for (size_t offset_i = start; offset_i < end; ++offset_i) {
223223
if (!concrete_array->IsNull(offset_i)) {
224-
const auto* raw_data = buffer->data() + concrete_array->value_offset(offset_i);
225-
const auto raw_data_len = concrete_array->value_length(offset_i);
224+
int32_t start_offset;
225+
int32_t end_offset;
226+
const auto* offsets_data = concrete_array->value_offsets()->data();
227+
memcpy(&start_offset, offsets_data + offset_i * sizeof(int32_t), sizeof(int32_t));
228+
memcpy(&end_offset, offsets_data + (offset_i + 1) * sizeof(int32_t),
229+
sizeof(int32_t));
230+
231+
const auto* raw_data = buffer->data() + start_offset;
232+
const auto raw_data_len = end_offset - start_offset;
226233

227234
if (raw_data_len == 0) {
228235
col_data.emplace_back(Int128()); // Int128() is NULL

be/src/vec/data_types/serde/data_type_string_serde.cpp

Lines changed: 14 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -262,12 +262,23 @@ Status DataTypeStringSerDeBase<ColumnType>::read_column_from_arrow(
262262
arrow_array->type_id() == arrow::Type::BINARY) {
263263
const auto* concrete_array = dynamic_cast<const arrow::BinaryArray*>(arrow_array);
264264
std::shared_ptr<arrow::Buffer> buffer = concrete_array->value_data();
265+
const auto offsets_buffer = concrete_array->value_offsets();
266+
const uint8_t* offsets_data = offsets_buffer->data();
265267

266268
for (auto offset_i = start; offset_i < end; ++offset_i) {
267269
if (!concrete_array->IsNull(offset_i)) {
268-
const auto* raw_data = buffer->data() + concrete_array->value_offset(offset_i);
270+
int32_t start_offset;
271+
int32_t end_offset;
272+
273+
memcpy(&start_offset, offsets_data + offset_i * sizeof(int32_t), sizeof(int32_t));
274+
memcpy(&end_offset, offsets_data + (offset_i + 1) * sizeof(int32_t),
275+
sizeof(int32_t));
276+
277+
int32_t length = end_offset - start_offset;
278+
const auto* raw_data = buffer->data() + start_offset;
279+
269280
assert_cast<ColumnType&>(column).insert_data(
270-
(char*)raw_data, concrete_array->value_length(offset_i));
281+
reinterpret_cast<const char*>(raw_data), length);
271282
} else {
272283
assert_cast<ColumnType&>(column).insert_default();
273284
}
@@ -451,4 +462,4 @@ template class DataTypeStringSerDeBase<ColumnString>;
451462
template class DataTypeStringSerDeBase<ColumnString64>;
452463
template class DataTypeStringSerDeBase<ColumnFixedLengthObject>;
453464

454-
} // namespace doris::vectorized
465+
} // namespace doris::vectorized

0 commit comments

Comments
 (0)