Skip to content

Commit 94c865d

Browse files
committed
update uuid
1 parent e949559 commit 94c865d

File tree

7 files changed

+396
-3
lines changed

7 files changed

+396
-3
lines changed

be/src/vec/exec/format/parquet/parquet_column_convert.cpp

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -260,10 +260,18 @@ std::unique_ptr<PhysicalToLogicalConverter> PhysicalToLogicalConverter::get_conv
260260
}
261261
} else if (src_logical_primitive == TYPE_VARBINARY) {
262262
// src_physical_type is varbinary and dst_logical_type is string
263-
if (is_string_type(remove_nullable(dst_logical_type)->get_primitive_type())) {
263+
if (is_string_type(dst_logical_type->get_primitive_type())) {
264+
DCHECK(src_physical_type == tparquet::Type::BYTE_ARRAY) << src_physical_type;
264265
physical_converter = std::make_unique<VarBinaryConverter>();
265266
} else {
266-
physical_converter = std::make_unique<ConsistentPhysicalConverter>();
267+
if (src_physical_type == tparquet::Type::FIXED_LEN_BYTE_ARRAY) {
268+
DCHECK(parquet_schema.logicalType.__isset.UUID) << parquet_schema.name;
269+
physical_converter =
270+
std::make_unique<UUIDVarBinaryConverter>(parquet_schema.type_length);
271+
} else {
272+
DCHECK(src_physical_type == tparquet::Type::BYTE_ARRAY) << src_physical_type;
273+
physical_converter = std::make_unique<ConsistentPhysicalConverter>();
274+
}
267275
}
268276
} else {
269277
physical_converter =

be/src/vec/exec/format/parquet/parquet_column_convert.h

Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@
2020
#include <gen_cpp/parquet_types.h>
2121

2222
#include "common/cast_set.h"
23+
#include "vec/columns/column_varbinary.h"
2324
#include "vec/core/extended_types.h"
2425
#include "vec/core/field.h"
2526
#include "vec/core/types.h"
@@ -390,6 +391,48 @@ class VarBinaryConverter : public PhysicalToLogicalConverter {
390391
}
391392
};
392393

394+
class UUIDVarBinaryConverter : public PhysicalToLogicalConverter {
395+
public:
396+
UUIDVarBinaryConverter(int type_length) : _type_length(type_length) {}
397+
398+
Status physical_convert(ColumnPtr& src_physical_col, ColumnPtr& src_logical_column) override {
399+
DCHECK(!is_column_const(*src_physical_col)) << src_physical_col->dump_structure();
400+
DCHECK(!is_column_const(*src_logical_column)) << src_logical_column->dump_structure();
401+
const ColumnUInt8* uint8_col = nullptr;
402+
if (is_column_nullable(*src_physical_col)) {
403+
const auto& nullable =
404+
assert_cast<const vectorized::ColumnNullable*>(src_physical_col.get());
405+
uint8_col = &assert_cast<const ColumnUInt8&>(nullable->get_nested_column());
406+
} else {
407+
uint8_col = &assert_cast<const ColumnUInt8&>(*src_physical_col);
408+
}
409+
410+
MutableColumnPtr to_col = nullptr;
411+
// nullmap flag seems have been handled in upper level
412+
if (src_logical_column->is_nullable()) {
413+
const auto* nullable =
414+
assert_cast<const vectorized::ColumnNullable*>(src_logical_column.get());
415+
to_col = nullable->get_nested_column_ptr()->assume_mutable();
416+
} else {
417+
to_col = src_logical_column->assume_mutable();
418+
}
419+
auto* to_varbinary_column = assert_cast<ColumnVarbinary*>(to_col.get());
420+
size_t length = uint8_col->size();
421+
size_t num_values = length / _type_length;
422+
const auto* ptr = uint8_col->get_data().data();
423+
424+
for (int i = 0; i < num_values; ++i) {
425+
auto offset = i * _type_length;
426+
const char* data_ptr = reinterpret_cast<const char*>(ptr + offset);
427+
to_varbinary_column->insert_data(data_ptr, _type_length);
428+
}
429+
return Status::OK();
430+
}
431+
432+
private:
433+
int _type_length;
434+
};
435+
393436
template <PrimitiveType DecimalPType>
394437
class FixedSizeToDecimal : public PhysicalToLogicalConverter {
395438
public:

be/src/vec/exec/format/parquet/schema_desc.cpp

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -301,6 +301,9 @@ std::pair<DataTypePtr, bool> FieldDescriptor::convert_to_doris_type(
301301
TYPE_DATETIMEV2, nullable, 0, logicalType.TIMESTAMP.unit.__isset.MILLIS ? 3 : 6);
302302
} else if (logicalType.__isset.JSON) {
303303
ans.first = DataTypeFactory::instance().create_data_type(TYPE_STRING, nullable);
304+
} else if (logicalType.__isset.UUID) {
305+
ans.first =
306+
DataTypeFactory::instance().create_data_type(TYPE_VARBINARY, nullable, -1, -1, 16);
304307
} else {
305308
throw Exception(Status::InternalError("Not supported parquet logicalType"));
306309
}

be/test/vec/columns/column_varbinary_test.cpp

Lines changed: 253 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -627,4 +627,257 @@ TEST_F(ColumnVarbinaryTest, GetPermutationAscDescIgnoreLimit) {
627627
}
628628
}
629629

630+
// Appends successive batches to one column through insert_many_strings() and reads every
// appended row back with get_data_at(), checking both its length and its bytes.
TEST_F(ColumnVarbinaryTest, InsertManyStrings) {
    auto col = ColumnVarbinary::create();

    // Verifies that row `row` holds exactly `len` bytes equal to the first `len` bytes of `bytes`.
    const auto expect_row = [&col](size_t row, const char* bytes, size_t len) {
        const auto cell = col->get_data_at(row);
        EXPECT_EQ(cell.size, len);
        EXPECT_EQ(memcmp(cell.data, bytes, len), 0);
    };

    // Case 1: a batch with no elements leaves the column empty.
    {
        std::vector<StringRef> no_refs;
        col->insert_many_strings(no_refs.data(), no_refs.size());
        EXPECT_EQ(col->size(), 0U);
    }

    // Case 2: a batch holding a single string.
    {
        const std::string word = "hello";
        StringRef word_ref(word.data(), word.size());
        col->insert_many_strings(&word_ref, 1);
        EXPECT_EQ(col->size(), 1U);
        expect_row(0, "hello", 5U);
    }

    // Case 3: several values whose size is at most kInlineSize.
    {
        const std::string first = "abc";
        const std::string second = "def";
        const std::string inline_max = make_bytes(doris::StringView::kInlineSize, 0xAA);

        std::vector<StringRef> batch;
        batch.emplace_back(first.data(), first.size());
        batch.emplace_back(second.data(), second.size());
        batch.emplace_back(inline_max.data(), inline_max.size());
        col->insert_many_strings(batch.data(), batch.size());
        EXPECT_EQ(col->size(), 4U); // one row from case 2 plus the three added here

        expect_row(1, "abc", 3U);
        expect_row(2, "def", 3U);

        const auto widest = col->get_data_at(3);
        EXPECT_EQ(widest.size, doris::StringView::kInlineSize);
        EXPECT_EQ(memcmp(widest.data, inline_max.data(), inline_max.size()), 0);
    }

    // Case 4: several values whose size is above kInlineSize.
    {
        const std::string big[] = {make_bytes(doris::StringView::kInlineSize + 10, 0x11),
                                   make_bytes(doris::StringView::kInlineSize + 20, 0x22),
                                   make_bytes(doris::StringView::kInlineSize + 30, 0x33)};

        std::vector<StringRef> batch;
        for (const auto& payload : big) {
            batch.emplace_back(payload.data(), payload.size());
        }
        const size_t base = col->size();
        col->insert_many_strings(batch.data(), batch.size());
        EXPECT_EQ(col->size(), base + 3);

        for (size_t k = 0; k < 3; ++k) {
            expect_row(base + k, big[k].data(), big[k].size());
        }
    }

    // Case 5: values that contain embedded zero bytes.
    {
        const std::string embedded = std::string("abc\0def", 7);
        const std::string zeros = std::string("\0\0\0", 3);

        std::vector<StringRef> batch;
        batch.emplace_back(embedded.data(), embedded.size());
        batch.emplace_back(zeros.data(), zeros.size());
        const size_t base = col->size();
        col->insert_many_strings(batch.data(), batch.size());
        EXPECT_EQ(col->size(), base + 2);

        expect_row(base, embedded.data(), 7U);
        expect_row(base + 1, zeros.data(), 3U);
    }

    // Case 6: one batch mixing sizes below, at and above kInlineSize.
    {
        const std::string tiny = "xy";
        const std::string at_limit = make_bytes(doris::StringView::kInlineSize, 0xBB);
        const std::string beyond = make_bytes(doris::StringView::kInlineSize + 50, 0xCC);

        std::vector<StringRef> batch;
        batch.emplace_back(tiny.data(), tiny.size());
        batch.emplace_back(at_limit.data(), at_limit.size());
        batch.emplace_back(beyond.data(), beyond.size());
        const size_t base = col->size();
        col->insert_many_strings(batch.data(), batch.size());
        EXPECT_EQ(col->size(), base + 3);

        expect_row(base, "xy", 2U);

        // Only the length of the at-limit value is checked here.
        const auto at_limit_cell = col->get_data_at(base + 1);
        EXPECT_EQ(at_limit_cell.size, doris::StringView::kInlineSize);

        expect_row(base + 2, beyond.data(), beyond.size());
    }

    // Case 7: 16-byte binary values, the size of a UUID.
    {
        const std::string uuid_a = make_bytes(16, 0x55);
        const std::string uuid_b = make_bytes(16, 0x12);

        std::vector<StringRef> batch;
        batch.emplace_back(uuid_a.data(), uuid_a.size());
        batch.emplace_back(uuid_b.data(), uuid_b.size());
        const size_t base = col->size();
        col->insert_many_strings(batch.data(), batch.size());
        EXPECT_EQ(col->size(), base + 2);

        expect_row(base, uuid_a.data(), 16U);
        expect_row(base + 1, uuid_b.data(), 16U);
    }
}
763+
764+
// Appends successive batches to one column through insert_many_strings_overflow() with various
// max_length arguments and reads every appended row back with get_data_at().
TEST_F(ColumnVarbinaryTest, InsertManyStringsOverflow) {
    auto col = ColumnVarbinary::create();

    // Verifies that row `row` holds exactly `len` bytes equal to the first `len` bytes of `bytes`.
    const auto expect_row = [&col](size_t row, const char* bytes, size_t len) {
        const auto cell = col->get_data_at(row);
        EXPECT_EQ(cell.size, len);
        EXPECT_EQ(memcmp(cell.data, bytes, len), 0);
    };

    // Case 1: max_length is larger than every inserted string.
    {
        const std::string left = "hello";
        const std::string right = "world";

        std::vector<StringRef> batch;
        batch.emplace_back(left.data(), left.size());
        batch.emplace_back(right.data(), right.size());
        col->insert_many_strings_overflow(batch.data(), batch.size(), 100);
        EXPECT_EQ(col->size(), 2U);

        expect_row(0, "hello", 5U);
        expect_row(1, "world", 5U);
    }

    // Case 2: max_length equals the string length.
    {
        const std::string exact = "test123";
        StringRef exact_ref(exact.data(), exact.size());
        col->insert_many_strings_overflow(&exact_ref, 1, 7);
        EXPECT_EQ(col->size(), 3U);

        expect_row(2, "test123", 7U);
    }

    // Case 3: the value is longer than max_length.
    // Per the original author's note, the current implementation does not truncate, so this
    // pins the present behavior: the whole value is expected to be stored.
    {
        const std::string oversized = make_bytes(doris::StringView::kInlineSize + 100, 0xAA);
        StringRef oversized_ref(oversized.data(), oversized.size());
        const size_t base = col->size();
        col->insert_many_strings_overflow(&oversized_ref, 1, 50);
        EXPECT_EQ(col->size(), base + 1);

        expect_row(base, oversized.data(), oversized.size());
    }

    // Case 4: several values in one batch, all shorter than max_length.
    {
        const std::string payloads[] = {make_bytes(20, 0x11), make_bytes(30, 0x22),
                                        make_bytes(40, 0x33)};
        const size_t expected_len[] = {20U, 30U, 40U};

        std::vector<StringRef> batch;
        for (const auto& payload : payloads) {
            batch.emplace_back(payload.data(), payload.size());
        }
        const size_t base = col->size();
        col->insert_many_strings_overflow(batch.data(), batch.size(), 100);
        EXPECT_EQ(col->size(), base + 3);

        // Every value must come back with its full length and content.
        for (size_t k = 0; k < 3; ++k) {
            expect_row(base + k, payloads[k].data(), expected_len[k]);
        }
    }

    // Case 5: a 16-byte binary value, the size of a UUID.
    {
        const std::string uuid_bytes = make_bytes(16, 0x55);
        StringRef uuid_ref(uuid_bytes.data(), uuid_bytes.size());
        const size_t base = col->size();
        col->insert_many_strings_overflow(&uuid_ref, 1, 32);
        EXPECT_EQ(col->size(), base + 1);

        expect_row(base, uuid_bytes.data(), 16U);
    }

    // Case 6: empty strings still produce one row each, of length zero.
    {
        const std::string blank_a;
        const std::string blank_b;

        std::vector<StringRef> batch;
        batch.emplace_back(blank_a.data(), blank_a.size());
        batch.emplace_back(blank_b.data(), blank_b.size());
        const size_t base = col->size();
        col->insert_many_strings_overflow(batch.data(), batch.size(), 10);
        EXPECT_EQ(col->size(), base + 2);

        const auto blank_cell_a = col->get_data_at(base);
        EXPECT_EQ(blank_cell_a.size, 0U);

        const auto blank_cell_b = col->get_data_at(base + 1);
        EXPECT_EQ(blank_cell_b.size, 0U);
    }

    // Case 7: a value that contains embedded zero bytes.
    {
        const std::string embedded = std::string("abc\0\0\0def", 9);
        StringRef embedded_ref(embedded.data(), embedded.size());
        const size_t base = col->size();
        col->insert_many_strings_overflow(&embedded_ref, 1, 20);
        EXPECT_EQ(col->size(), base + 1);

        expect_row(base, embedded.data(), 9U);
    }
}
882+
630883
} // namespace doris::vectorized

fe/fe-core/src/main/java/org/apache/doris/datasource/iceberg/IcebergUtils.java

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -566,8 +566,9 @@ private static Type icebergPrimitiveTypeToDorisType(org.apache.iceberg.types.Typ
566566
case DOUBLE:
567567
return Type.DOUBLE;
568568
case STRING:
569-
case UUID:
570569
return Type.STRING;
570+
case UUID:
571+
return ScalarType.createVarbinaryType(16);
571572
case BINARY:
572573
return enableMappingVarbinary ? ScalarType.createVarbinaryType(VarBinaryType.MAX_VARBINARY_LENGTH)
573574
: Type.STRING;
Binary file not shown.

0 commit comments

Comments
 (0)