diff --git a/be/src/olap/inverted_index_parser.cpp b/be/src/olap/inverted_index_parser.cpp
index 3a7775d8130876..7d44ee82e2c0da 100644
--- a/be/src/olap/inverted_index_parser.cpp
+++ b/be/src/olap/inverted_index_parser.cpp
@@ -167,13 +167,22 @@ std::string get_parser_dict_compression_from_properties(
     }
 }
 
-std::string get_custom_analyzer_string_from_properties(
+// Looks up the analyzer name in `properties`: returns the first non-empty value
+// found under INVERTED_INDEX_ANALYZER_NAME_KEY, then under
+// INVERTED_INDEX_NORMALIZER_NAME_KEY, or "" when neither key holds one.
+std::string get_analyzer_name_from_properties(
         const std::map& properties) {
-    if (properties.find(INVERTED_INDEX_CUSTOM_ANALYZER_KEY) != properties.end()) {
-        return properties.at(INVERTED_INDEX_CUSTOM_ANALYZER_KEY);
-    } else {
-        return "";
+    auto it = properties.find(INVERTED_INDEX_ANALYZER_NAME_KEY);
+    if (it != properties.end() && !it->second.empty()) {
+        return it->second;
     }
+
+    it = properties.find(INVERTED_INDEX_NORMALIZER_NAME_KEY);
+    if (it != properties.end() && !it->second.empty()) {
+        return it->second;
+    }
+
+    return "";
 }
 
 } // namespace doris
diff --git a/be/src/olap/inverted_index_parser.h b/be/src/olap/inverted_index_parser.h
index 075a4f8e543089..b7f547882ca7ad 100644
--- a/be/src/olap/inverted_index_parser.h
+++ b/be/src/olap/inverted_index_parser.h
@@ -46,7 +46,7 @@ enum class InvertedIndexParserType {
 using CharFilterMap = std::map;
 
 struct InvertedIndexCtx {
-    std::string custom_analyzer;
+    std::string analyzer_name;
     InvertedIndexParserType parser_type;
     std::string parser_mode;
     std::string support_phrase;
@@ -97,7 +97,8 @@ const std::string INVERTED_INDEX_PARSER_STOPWORDS_KEY = "stopwords";
 
 const std::string INVERTED_INDEX_PARSER_DICT_COMPRESSION_KEY = "dict_compression";
 
-const std::string INVERTED_INDEX_CUSTOM_ANALYZER_KEY = "analyzer";
+const std::string INVERTED_INDEX_ANALYZER_NAME_KEY = "analyzer";
+const std::string INVERTED_INDEX_NORMALIZER_NAME_KEY = "normalizer";
 
 std::string inverted_index_parser_type_to_string(InvertedIndexParserType parser_type);
 
@@ -138,7 +139,6 @@ std::string get_parser_stopwords_from_properties(
 std::string get_parser_dict_compression_from_properties(
         const std::map& properties);
 
-std::string get_custom_analyzer_string_from_properties(
-        const std::map& properties);
+std::string get_analyzer_name_from_properties(const std::map& properties);
 
 } // namespace doris
diff --git a/be/src/olap/rowset/segment_v2/inverted_index/analysis_factory_mgr.cpp b/be/src/olap/rowset/segment_v2/inverted_index/analysis_factory_mgr.cpp
index 321cdaf1aa3c85..0dfbd9f134e049 100644
--- a/be/src/olap/rowset/segment_v2/inverted_index/analysis_factory_mgr.cpp
+++ b/be/src/olap/rowset/segment_v2/inverted_index/analysis_factory_mgr.cpp
@@ -19,8 +19,10 @@
 
 #include "olap/rowset/segment_v2/inverted_index/char_filter/char_replace_char_filter_factory.h"
 #include "olap/rowset/segment_v2/inverted_index/char_filter/empty_char_filter_factory.h"
+#include "olap/rowset/segment_v2/inverted_index/char_filter/icu_normalizer_char_filter_factory.h"
 #include "olap/rowset/segment_v2/inverted_index/token_filter/ascii_folding_filter_factory.h"
 #include "olap/rowset/segment_v2/inverted_index/token_filter/empty_token_filter_factory.h"
+#include "olap/rowset/segment_v2/inverted_index/token_filter/icu_normalizer_filter_factory.h"
 #include "olap/rowset/segment_v2/inverted_index/token_filter/lower_case_filter_factory.h"
 #include "olap/rowset/segment_v2/inverted_index/token_filter/pinyin_filter_factory.h"
 #include "olap/rowset/segment_v2/inverted_index/token_filter/word_delimiter_filter_factory.h"
@@ -43,6 +45,9 @@ void AnalysisFactoryMgr::initialise() {
                 "empty", []() { return std::make_shared(); });
         registerFactory(
                 "char_replace", []() { return std::make_shared(); });
+        registerFactory("icu_normalizer", []() {
+            return std::make_shared();
+        });
 
         // tokenizer
         registerFactory(
@@ -75,6 +80,8 @@ void AnalysisFactoryMgr::initialise() {
                 "word_delimiter", []() { return std::make_shared(); });
         registerFactory(
                 "pinyin", []() { return std::make_shared(); });
+        registerFactory(
+                "icu_normalizer", []() { return std::make_shared(); });
     });
 }
 
diff --git a/be/src/olap/rowset/segment_v2/inverted_index/analyzer/analyzer.cpp b/be/src/olap/rowset/segment_v2/inverted_index/analyzer/analyzer.cpp
index 71929d64273366..640362d4bcc54f 100644
--- a/be/src/olap/rowset/segment_v2/inverted_index/analyzer/analyzer.cpp
+++ b/be/src/olap/rowset/segment_v2/inverted_index/analyzer/analyzer.cpp
@@ -125,7 +125,7 @@ AnalyzerPtr InvertedIndexAnalyzer::create_builtin_analyzer(InvertedIndexParserTy
 
 std::shared_ptr InvertedIndexAnalyzer::create_analyzer(
         const InvertedIndexCtx* inverted_index_ctx) {
-    const std::string& analyzer_name = inverted_index_ctx->custom_analyzer;
+    const std::string& analyzer_name = inverted_index_ctx->analyzer_name;
     if (analyzer_name.empty()) {
         return create_builtin_analyzer(
                 inverted_index_ctx->parser_type, inverted_index_ctx->parser_mode,
@@ -177,7 +177,7 @@ std::vector InvertedIndexAnalyzer::get_analyse_result(
 std::vector InvertedIndexAnalyzer::get_analyse_result(
         const std::string& search_str, const std::map& properties) {
     InvertedIndexCtxSPtr inverted_index_ctx = std::make_shared(
-            get_custom_analyzer_string_from_properties(properties),
+            get_analyzer_name_from_properties(properties),
             get_inverted_index_parser_type_from_string(
                     get_parser_string_from_properties(properties)),
             get_parser_mode_string_from_properties(properties),
@@ -195,7 +195,7 @@ std::vector InvertedIndexAnalyzer::get_analyse_result(
 bool InvertedIndexAnalyzer::should_analyzer(const std::map& properties) {
     auto parser_type = get_inverted_index_parser_type_from_string(
             get_parser_string_from_properties(properties));
-    auto analyzer_name = get_custom_analyzer_string_from_properties(properties);
+    auto analyzer_name = get_analyzer_name_from_properties(properties);
     if (!analyzer_name.empty()) {
         return true;
     }
diff --git a/be/src/olap/rowset/segment_v2/inverted_index/analyzer/custom_analyzer_config.h b/be/src/olap/rowset/segment_v2/inverted_index/analyzer/custom_analyzer_config.h
index 134d4ee0d45d78..2497d93450b36f 100644
---
a/be/src/olap/rowset/segment_v2/inverted_index/analyzer/custom_analyzer_config.h
+++ b/be/src/olap/rowset/segment_v2/inverted_index/analyzer/custom_analyzer_config.h
@@ -19,7 +19,6 @@
 
 #include
 #include
-#include
 
 #include "olap/rowset/segment_v2/inverted_index/setting.h"
 
diff --git a/be/src/olap/rowset/segment_v2/inverted_index/char_filter/icu_normalizer_char_filter.cpp b/be/src/olap/rowset/segment_v2/inverted_index/char_filter/icu_normalizer_char_filter.cpp
new file mode 100644
index 00000000000000..254f55bc3c6ac7
--- /dev/null
+++ b/be/src/olap/rowset/segment_v2/inverted_index/char_filter/icu_normalizer_char_filter.cpp
@@ -0,0 +1,102 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "icu_normalizer_char_filter.h"
+
+#include
+#include
+
+#include "common/exception.h"
+#include "common/logging.h"
+
+namespace doris::segment_v2::inverted_index {
+
+// Wraps `reader`; throws INVALID_ARGUMENT when `normalizer` is null.
+ICUNormalizerCharFilter::ICUNormalizerCharFilter(ReaderPtr reader,
+                                                 std::shared_ptr normalizer)
+        : DorisCharFilter(std::move(reader)), _normalizer(std::move(normalizer)) {
+    if (_normalizer == nullptr) {
+        throw Exception(ErrorCode::INVALID_ARGUMENT,
+                        "ICUNormalizerCharFilter: normalizer cannot be null");
+    }
+}
+
+// Normalizes the wrapped reader's content unless a non-empty result is already buffered.
+void ICUNormalizerCharFilter::initialize() {
+    if (_transformed_input.size() != 0) {
+        return;
+    }
+    fill();
+}
+
+// Re-initializes the wrapped reader with the given data, then normalizes it.
+void ICUNormalizerCharFilter::init(const void* _value, int32_t _length, bool copyData) {
+    _reader->init(_value, _length, copyData);
+    fill();
+}
+
+int32_t ICUNormalizerCharFilter::read(const void** start, int32_t min, int32_t max) {
+    return _transformed_input.read(start, min, max);
+}
+
+int32_t ICUNormalizerCharFilter::readCopy(void* start, int32_t off, int32_t len) {
+    return _transformed_input.readCopy(start, off, len);
+}
+
+// Copies the wrapped reader's content, normalizes it into `_buf`, and re-initializes
+// `_transformed_input` over `_buf`.
+void ICUNormalizerCharFilter::fill() {
+    std::string input;
+    input.resize(_reader->size());
+    _reader->readCopy(input.data(), 0, static_cast(input.size()));
+    normalize_text(input, _buf);
+    _transformed_input.init(_buf.data(), static_cast(_buf.size()), false);
+}
+
+// Normalizes UTF-8 `input` with `_normalizer` and stores the result in `output`,
+// replacing any previous content. Input that quickCheck() reports as already
+// normalized, or that fails to normalize, is copied through unchanged.
+void ICUNormalizerCharFilter::normalize_text(const std::string& input, std::string& output) {
+    if (input.empty()) {
+        output.clear();
+        return;
+    }
+
+    UErrorCode status = U_ZERO_ERROR;
+    icu::UnicodeString src16 = icu::UnicodeString::fromUTF8(input);
+    UNormalizationCheckResult quick_result = _normalizer->quickCheck(src16, status);
+    if (U_SUCCESS(status) && quick_result == UNORM_YES) {
+        output = input;
+        return;
+    }
+
+    icu::UnicodeString result16;
+    status = U_ZERO_ERROR;
+    _normalizer->normalize(src16, result16, status);
+    if (U_FAILURE(status)) {
+        LOG(WARNING) << "ICU normalize failed: " << u_errorName(status) << ", using original text";
+        output = input;
+        return;
+    }
+
+    // toUTF8String() appends to its argument. `output` is the long-lived `_buf`, so
+    // drop what an earlier fill() left there or the old text would prefix the new.
+    output.clear();
+    result16.toUTF8String(output);
+}
+
+} // namespace doris::segment_v2::inverted_index
diff --git a/be/src/olap/rowset/segment_v2/inverted_index/char_filter/icu_normalizer_char_filter.h b/be/src/olap/rowset/segment_v2/inverted_index/char_filter/icu_normalizer_char_filter.h
new file mode 100644
index 00000000000000..503a08e2e1f204
--- /dev/null
+++ b/be/src/olap/rowset/segment_v2/inverted_index/char_filter/icu_normalizer_char_filter.h
@@ -0,0 +1,51 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include
+
+#include
+
+#include "char_filter.h"
+
+namespace doris::segment_v2::inverted_index {
+
+class ICUNormalizerCharFilter : public DorisCharFilter {
+public:
+    ICUNormalizerCharFilter(ReaderPtr reader, std::shared_ptr normalizer);
+    ~ICUNormalizerCharFilter() override = default;
+
+    void initialize() override;
+
+    void init(const void* _value, int32_t _length, bool copyData) override;
+    int32_t read(const void** start, int32_t min, int32_t max) override;
+    int32_t readCopy(void* start, int32_t off, int32_t len) override;
+
+    size_t size() override { return _buf.size(); }
+
+private:
+    void fill();
+    void normalize_text(const std::string& input, std::string& output);
+
+    std::shared_ptr _normalizer;
+    std::string _buf;
+    lucene::util::SStringReader _transformed_input;
+};
+using ICUNormalizerCharFilterPtr = std::shared_ptr;
+
+} // namespace doris::segment_v2::inverted_index
diff --git a/be/src/olap/rowset/segment_v2/inverted_index/char_filter/icu_normalizer_char_filter_factory.h b/be/src/olap/rowset/segment_v2/inverted_index/char_filter/icu_normalizer_char_filter_factory.h
new file mode 100644
index 00000000000000..25a49bff6b3ffc
--- /dev/null
+++ b/be/src/olap/rowset/segment_v2/inverted_index/char_filter/icu_normalizer_char_filter_factory.h
@@ -0,0 +1,124 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include
+#include
+#include
+#include
+
+#include
+#include
+
+#include "char_filter_factory.h"
+#include "common/exception.h"
+#include "icu_normalizer_char_filter.h"
+
+namespace doris::segment_v2::inverted_index {
+
+// Creates ICUNormalizerCharFilter instances that all share one ICU normalizer,
+// selected by the "name", "mode" and "unicode_set_filter" settings.
+class ICUNormalizerCharFilterFactory : public CharFilterFactory {
+public:
+    ICUNormalizerCharFilterFactory() = default;
+    ~ICUNormalizerCharFilterFactory() override = default;
+
+    // Resolves the normalizer from `settings`. Throws INVALID_ARGUMENT for an invalid
+    // mode, an unknown name, or a unicode_set_filter pattern ICU cannot parse.
+    void initialize(const Settings& settings) override {
+        std::string name = settings.get_string("name", "nfkc_cf");
+        std::string mode = settings.get_string("mode", "compose");
+        std::string unicode_set_filter = settings.get_string("unicode_set_filter", "");
+        if (mode != "compose" && mode != "decompose") {
+            throw Exception(ErrorCode::INVALID_ARGUMENT,
+                            "ICUNormalizerCharFilterFactory: mode must be 'compose' or "
+                            "'decompose', got: " +
+                                    mode);
+        }
+
+        UErrorCode status = U_ZERO_ERROR;
+        const icu::Normalizer2* base = get_normalizer(name, mode, status);
+        if (U_FAILURE(status) || base == nullptr) {
+            throw Exception(ErrorCode::INVALID_ARGUMENT,
+                            "Failed to get normalizer instance for '" + name + "' with mode '" +
+                                    mode + "': " + std::string(u_errorName(status)));
+        }
+
+        if (unicode_set_filter.empty()) {
+            // `base` is an ICU-owned instance and must not be deleted, hence the no-op deleter.
+            _normalizer =
+                    std::shared_ptr(base, [](const icu::Normalizer2*) {});
+            return;
+        }
+
+        // FilteredNormalizer2 only aliases the UnicodeSet it is given, so the set must
+        // outlive the normalizer. Keep it on the heap and let the normalizer's deleter
+        // own it instead of pointing at a local that dies when initialize() returns.
+        auto filter_set = std::make_shared<icu::UnicodeSet>(
+                icu::UnicodeString::fromUTF8(unicode_set_filter), status);
+        if (U_FAILURE(status)) {
+            throw Exception(ErrorCode::INVALID_ARGUMENT, "Failed to parse unicode_set_filter: " +
+                                                                 std::string(u_errorName(status)));
+        }
+        if (filter_set->isEmpty()) {
+            _normalizer =
+                    std::shared_ptr(base, [](const icu::Normalizer2*) {});
+            return;
+        }
+        filter_set->freeze();
+
+        _normalizer = std::shared_ptr<const icu::Normalizer2>(
+                new icu::FilteredNormalizer2(*base, *filter_set),
+                [filter_set](const icu::Normalizer2* filtered) { delete filtered; });
+    }
+
+    // Wraps `in` with the configured normalizer; throws if initialize() has not succeeded.
+    ReaderPtr create(const ReaderPtr& in) override {
+        if (!_normalizer) {
+            throw Exception(ErrorCode::INVALID_ARGUMENT,
+                            "ICUNormalizerCharFilterFactory not initialized. Call initialize() "
+                            "first.");
+        }
+        return std::make_shared(in, _normalizer);
+    }
+
+private:
+    // Maps a form name to an ICU normalizer. `mode` is only consulted for nfc, nfkc and
+    // nfkc_cf; an unknown name sets U_ILLEGAL_ARGUMENT_ERROR and returns nullptr.
+    static const icu::Normalizer2* get_normalizer(const std::string& name, const std::string& mode,
+                                                  UErrorCode& status) {
+        UNormalization2Mode icu_mode = (mode == "compose" ? UNORM2_COMPOSE : UNORM2_DECOMPOSE);
+        if (name == "nfc" || name == "nfkc" || name == "nfkc_cf") {
+            return icu::Normalizer2::getInstance(nullptr, name.c_str(), icu_mode, status);
+        }
+
+        if (name == "nfd") {
+            return icu::Normalizer2::getNFDInstance(status);
+        } else if (name == "nfkd") {
+            return icu::Normalizer2::getNFKDInstance(status);
+        }
+
+        status = U_ILLEGAL_ARGUMENT_ERROR;
+        return nullptr;
+    }
+
+    std::shared_ptr _normalizer;
+};
+using ICUNormalizerCharFilterFactoryPtr = std::shared_ptr;
+
+} // namespace doris::segment_v2::inverted_index
diff --git a/be/src/olap/rowset/segment_v2/inverted_index/normalizer/custom_normalizer.cpp b/be/src/olap/rowset/segment_v2/inverted_index/normalizer/custom_normalizer.cpp
new file mode 100644
index 00000000000000..d8e777b6b855e4
--- /dev/null
+++ b/be/src/olap/rowset/segment_v2/inverted_index/normalizer/custom_normalizer.cpp
@@ -0,0 +1,120 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "custom_normalizer.h"
+
+#include "olap/rowset/segment_v2/inverted_index/analysis_factory_mgr.h"
+#include "olap/rowset/segment_v2/inverted_index/analyzer/custom_analyzer_config.h"
+#include "olap/rowset/segment_v2/inverted_index/token_stream.h"
+
+namespace doris::segment_v2::inverted_index {
+
+// Moves the builder's filter factories into the normalizer; the tokenizer is always "keyword".
+CustomNormalizer::CustomNormalizer(Builder* builder) {
+    _keyword_tokenizer = AnalysisFactoryMgr::instance().create("keyword", {});
+
+    _char_filters = std::move(builder->_char_filters);
+    _token_filters = std::move(builder->_token_filters);
+}
+
+// Wraps `reader` with each configured char filter, in the order they were added.
+ReaderPtr CustomNormalizer::init_reader(ReaderPtr reader) {
+    for (const auto& filter : _char_filters) {
+        reader = filter->create(reader);
+    }
+    return reader;
+}
+
+// Creates a keyword tokenizer and stacks the configured token filters on top of it.
+TokenStreamComponentsPtr CustomNormalizer::create_components() {
+    auto tk = _keyword_tokenizer->create();
+    TokenStreamPtr ts = tk;
+    for (const auto& filter : _token_filters) {
+        ts = filter->create(ts);
+    }
+    return std::make_shared(tk, ts);
+}
+
+// Plain lucene::util::Reader input is not supported by either overload below.
+TokenStream* CustomNormalizer::tokenStream(const TCHAR* fieldName, lucene::util::Reader* reader) {
+    throw Exception(ErrorCode::NOT_IMPLEMENTED_ERROR,
+                    "CustomNormalizer does not support lucene::util::Reader");
+}
+
+TokenStream* CustomNormalizer::reusableTokenStream(const TCHAR* fieldName,
+                                                   lucene::util::Reader* reader) {
+    throw Exception(ErrorCode::NOT_IMPLEMENTED_ERROR,
+                    "CustomNormalizer does not support lucene::util::Reader");
+}
+
+// Builds a fresh filter chain over `reader`, resets it, and returns it in a newly
+// allocated wrapper. NOTE(review): presumably the caller must delete it - confirm.
+TokenStream* CustomNormalizer::tokenStream(const TCHAR* fieldName, const ReaderPtr& reader) {
+    auto r = init_reader(reader);
+    auto token_stream = create_components();
+    token_stream->set_reader(r);
+    token_stream->get_token_stream()->reset();
+    return new TokenStreamWrapper(token_stream->get_token_stream());
+}
+
+// Re-points the cached chain (created on first use) at `reader`. The returned pointer
+// stays owned by the cached components; unlike tokenStream(), reset() is not called here.
+TokenStream* CustomNormalizer::reusableTokenStream(const TCHAR* fieldName,
+                                                   const ReaderPtr& reader) {
+    auto r = init_reader(reader);
+    if (_reuse_token_stream == nullptr) {
+        _reuse_token_stream = create_components();
+    }
+    _reuse_token_stream->set_reader(r);
+    return _reuse_token_stream->get_token_stream().get();
+}
+
+// Builds a normalizer from `config`; throws ILLEGAL_STATE when `config` is null.
+CustomNormalizerPtr CustomNormalizer::build_custom_normalizer(
+        const CustomNormalizerConfigPtr& config) {
+    if (config == nullptr) {
+        throw Exception(ErrorCode::ILLEGAL_STATE, "Null configuration detected.");
+    }
+    CustomNormalizer::Builder builder;
+    for (const auto& filter_config : config->get_char_filter_configs()) {
+        builder.add_char_filter(filter_config->get_name(), filter_config->get_params());
+    }
+    for (const auto& filter_config : config->get_token_filter_configs()) {
+        builder.add_token_filter(filter_config->get_name(), filter_config->get_params());
+    }
+    return builder.build();
+}
+
+void CustomNormalizer::Builder::add_char_filter(const std::string& name, const Settings& params) {
+    _char_filters.push_back(AnalysisFactoryMgr::instance().create(name, params));
+}
+
+void CustomNormalizer::Builder::add_token_filter(const std::string& name, const Settings& params) {
+    _token_filters.push_back(
+            AnalysisFactoryMgr::instance().create(name, params));
+}
+
+// Throws ILLEGAL_STATE unless at least one char filter or token filter was added.
+CustomNormalizerPtr CustomNormalizer::Builder::build() {
+    if (_char_filters.empty() && _token_filters.empty()) {
+        throw Exception(ErrorCode::ILLEGAL_STATE,
+                        "Normalizer must have at least one char_filter or token_filter.");
+    }
+    return std::make_shared(this);
+}
+
+} // namespace doris::segment_v2::inverted_index
\ No newline at end of file
diff --git a/be/src/olap/rowset/segment_v2/inverted_index/normalizer/custom_normalizer.h b/be/src/olap/rowset/segment_v2/inverted_index/normalizer/custom_normalizer.h
new file
mode 100644
index 00000000000000..5e5d4949e07e72
--- /dev/null
+++ b/be/src/olap/rowset/segment_v2/inverted_index/normalizer/custom_normalizer.h
@@ -0,0 +1,81 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include "custom_normalizer_config.h"
+#include "olap/rowset/segment_v2/inverted_index/char_filter/char_filter_factory.h"
+#include "olap/rowset/segment_v2/inverted_index/token_filter/token_filter_factory.h"
+#include "olap/rowset/segment_v2/inverted_index/tokenizer/tokenizer_factory.h"
+
+namespace doris::segment_v2::inverted_index {
+
+class CustomNormalizer;
+using CustomNormalizerPtr = std::shared_ptr;
+
+// Analyzer built only from char filters and token filters. The tokenizer is not
+// configurable: the constructor always uses the "keyword" tokenizer factory.
+class CustomNormalizer : public Analyzer {
+public:
+    // Collects char/token filter factories by name. build() throws ILLEGAL_STATE
+    // if no filter of either kind was added.
+    class Builder {
+    public:
+        Builder() = default;
+        ~Builder() = default;
+
+        void add_char_filter(const std::string& name, const Settings& params);
+        void add_token_filter(const std::string& name, const Settings& params);
+
+        CustomNormalizerPtr build();
+
+    private:
+        std::vector _char_filters;
+        std::vector _token_filters;
+
+        friend class CustomNormalizer;
+    };
+
+    CustomNormalizer(Builder* builder);
+    ~CustomNormalizer() override = default;
+
+    bool isSDocOpt() override { return true; }
+
+    // lucene::util::Reader input is unsupported; both overloads throw
+    // NOT_IMPLEMENTED_ERROR.
+    TokenStream* tokenStream(const TCHAR* fieldName, lucene::util::Reader* reader) override;
+    TokenStream* reusableTokenStream(const TCHAR* fieldName, lucene::util::Reader* reader) override;
+
+    TokenStream* tokenStream(const TCHAR* fieldName, const ReaderPtr& reader) override;
+    TokenStream* reusableTokenStream(const TCHAR* fieldName, const ReaderPtr& reader) override;
+
+    // Creates a normalizer from `config`; throws ILLEGAL_STATE if `config` is null.
+    static CustomNormalizerPtr build_custom_normalizer(const CustomNormalizerConfigPtr& config);
+
+private:
+    ReaderPtr init_reader(ReaderPtr reader);
+    TokenStreamComponentsPtr create_components();
+
+    TokenizerFactoryPtr _keyword_tokenizer;
+    std::vector _char_filters;
+    std::vector _token_filters;
+
+    // Created on the first reusableTokenStream() call and reused afterwards.
+    TokenStreamComponentsPtr _reuse_token_stream;
+};
+
+} // namespace doris::segment_v2::inverted_index
\ No newline at end of file
diff --git a/be/src/olap/rowset/segment_v2/inverted_index/normalizer/custom_normalizer_config.cpp b/be/src/olap/rowset/segment_v2/inverted_index/normalizer/custom_normalizer_config.cpp
new file mode 100644
index 00000000000000..574a44764ead7f
--- /dev/null
+++ b/be/src/olap/rowset/segment_v2/inverted_index/normalizer/custom_normalizer_config.cpp
@@ -0,0 +1,49 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "custom_normalizer_config.h"
+
+namespace doris::segment_v2::inverted_index {
+
+CustomNormalizerConfig::CustomNormalizerConfig(Builder* builder) {
+    _char_filters = builder->_char_filters;
+    _token_filters = builder->_token_filters;
+}
+
+std::vector CustomNormalizerConfig::get_char_filter_configs() {
+    return _char_filters;
+}
+
+std::vector CustomNormalizerConfig::get_token_filter_configs() {
+    return _token_filters;
+}
+
+void CustomNormalizerConfig::Builder::add_char_filter_config(const std::string& name,
+                                                             const Settings& params) {
+    _char_filters.emplace_back(std::make_shared(name, params));
+}
+
+void CustomNormalizerConfig::Builder::add_token_filter_config(const std::string& name,
+                                                              const Settings& params) {
+    _token_filters.emplace_back(std::make_shared(name, params));
+}
+
+CustomNormalizerConfigPtr CustomNormalizerConfig::Builder::build() {
+    return std::make_shared(this);
+}
+
+} // namespace doris::segment_v2::inverted_index
\ No newline at end of file
diff --git a/be/src/olap/rowset/segment_v2/inverted_index/normalizer/custom_normalizer_config.h b/be/src/olap/rowset/segment_v2/inverted_index/normalizer/custom_normalizer_config.h
new file mode 100644
index 00000000000000..0b4fcd8c663b73
--- /dev/null
+++ b/be/src/olap/rowset/segment_v2/inverted_index/normalizer/custom_normalizer_config.h
@@ -0,0 +1,59 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include "olap/rowset/segment_v2/inverted_index/analyzer/custom_analyzer_config.h"
+
+namespace doris::segment_v2::inverted_index {
+
+class CustomNormalizerConfig;
+using CustomNormalizerConfigPtr = std::shared_ptr;
+
+// Describes a custom normalizer as two lists of component configs: char filters
+// and token filters, each kept in the order they were added to the Builder.
+class CustomNormalizerConfig {
+public:
+    class Builder {
+    public:
+        Builder() = default;
+        ~Builder() = default;
+
+        void add_char_filter_config(const std::string& name, const Settings& params);
+        void add_token_filter_config(const std::string& name, const Settings& params);
+        CustomNormalizerConfigPtr build();
+
+    private:
+        std::vector _char_filters;
+        std::vector _token_filters;
+
+        friend class CustomNormalizerConfig;
+    };
+
+    CustomNormalizerConfig(Builder* builder);
+    ~CustomNormalizerConfig() = default;
+
+    // Both getters return a copy of the stored list.
+    std::vector get_char_filter_configs();
+    std::vector get_token_filter_configs();
+
+private:
+    std::vector _char_filters;
+    std::vector _token_filters;
+};
+
+} // namespace doris::segment_v2::inverted_index
\ No newline at end of file
diff --git a/be/src/olap/rowset/segment_v2/inverted_index/token_filter/icu_normalizer_filter.cpp b/be/src/olap/rowset/segment_v2/inverted_index/token_filter/icu_normalizer_filter.cpp
new file mode 100644
index 00000000000000..2f27af114944ce
--- /dev/null
+++ b/be/src/olap/rowset/segment_v2/inverted_index/token_filter/icu_normalizer_filter.cpp
@@ -0,0 +1,77 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "icu_normalizer_filter.h"
+
+#include
+#include
+
+#include "common/exception.h"
+#include "common/logging.h"
+
+namespace doris::segment_v2::inverted_index {
+
+// Throws INVALID_ARGUMENT when `normalizer` is null.
+ICUNormalizerFilter::ICUNormalizerFilter(TokenStreamPtr in,
+                                         std::shared_ptr normalizer)
+        : DorisTokenFilter(std::move(in)), _normalizer(std::move(normalizer)) {
+    if (_normalizer == nullptr) {
+        throw Exception(ErrorCode::INVALID_ARGUMENT,
+                        "ICUNormalizerFilter: normalizer cannot be null");
+    }
+}
+
+// Fetches the next token from the wrapped stream and rewrites its text with the
+// normalized form. The token is returned untouched when quickCheck() reports it
+// is already normalized or when normalization fails (a warning is logged).
+Token* ICUNormalizerFilter::next(Token* t) {
+    if (!_in->next(t)) {
+        return nullptr;
+    }
+
+    const char* buffer = t->termBuffer();
+    auto length = static_cast(t->termLength());
+
+    UErrorCode status = U_ZERO_ERROR;
+    icu::UnicodeString src16 = icu::UnicodeString::fromUTF8(icu::StringPiece(buffer, length));
+    UNormalizationCheckResult quick_result = _normalizer->quickCheck(src16, status);
+    if (U_SUCCESS(status) && quick_result == UNORM_YES) {
+        return t;
+    }
+
+    icu::UnicodeString result16;
+    status = U_ZERO_ERROR;
+    _normalizer->normalize(src16, result16, status);
+    if (U_FAILURE(status)) {
+        LOG(WARNING) << "Normalize failed: " << u_errorName(status);
+        return t;
+    }
+
+    // toUTF8String() appends, so start from an empty buffer.
+    _output_buffer.clear();
+    result16.toUTF8String(_output_buffer);
+
+    set_text(t, std::string_view(_output_buffer.data(), _output_buffer.size()));
+
+    return t;
+}
+
+void ICUNormalizerFilter::reset() {
+    DorisTokenFilter::reset();
+}
+
+} // namespace doris::segment_v2::inverted_index
\ No newline at end of file
diff --git a/be/src/olap/rowset/segment_v2/inverted_index/token_filter/icu_normalizer_filter.h b/be/src/olap/rowset/segment_v2/inverted_index/token_filter/icu_normalizer_filter.h
new file mode 100644
index 00000000000000..f0cdd955140d92
--- /dev/null
+++ b/be/src/olap/rowset/segment_v2/inverted_index/token_filter/icu_normalizer_filter.h
@@ -0,0 +1,43 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+ +#pragma once + +#include + +#include +#include + +#include "token_filter.h" + +namespace doris::segment_v2::inverted_index { + +class ICUNormalizerFilter : public DorisTokenFilter { +public: + ICUNormalizerFilter(TokenStreamPtr in, std::shared_ptr normalizer); + ~ICUNormalizerFilter() override = default; + + Token* next(Token* t) override; + void reset() override; + +private: + std::shared_ptr _normalizer; + std::string _output_buffer; +}; +using ICUNormalizerFilterPtr = std::shared_ptr; + +} // namespace doris::segment_v2::inverted_index \ No newline at end of file diff --git a/be/src/olap/rowset/segment_v2/inverted_index/token_filter/icu_normalizer_filter_factory.h b/be/src/olap/rowset/segment_v2/inverted_index/token_filter/icu_normalizer_filter_factory.h new file mode 100644 index 00000000000000..33ebfde8b9b000 --- /dev/null +++ b/be/src/olap/rowset/segment_v2/inverted_index/token_filter/icu_normalizer_filter_factory.h @@ -0,0 +1,97 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#pragma once + +#include +#include +#include +#include + +#include +#include + +#include "common/exception.h" +#include "icu_normalizer_filter.h" +#include "token_filter_factory.h" + +namespace doris::segment_v2::inverted_index { +
+// Token-filter factory that resolves an ICU Normalizer2 from its settings and
+// hands the same shared normalizer to every filter it creates.
+//
+// Settings read by initialize():
+//   "name"               - nfc | nfkc | nfkc_cf | nfd | nfkd (default "nfkc_cf")
+//   "unicode_set_filter" - optional UnicodeSet pattern; when it parses to a
+//                          non-empty set the normalizer is wrapped in a
+//                          FilteredNormalizer2 restricted to that set.
+class ICUNormalizerFilterFactory : public TokenFilterFactory {
+public:
+    ICUNormalizerFilterFactory() = default;
+    ~ICUNormalizerFilterFactory() override = default;
+
+    // Resolves and stores the normalizer.
+    // Throws Exception(INVALID_ARGUMENT) when the name is not supported, when
+    // ICU cannot supply the instance, or when the set pattern fails to parse.
+    void initialize(const Settings& settings) override {
+        std::string name = settings.get_string("name", "nfkc_cf");
+        std::string unicode_set_filter = settings.get_string("unicode_set_filter", "");
+
+        UErrorCode status = U_ZERO_ERROR;
+        const icu::Normalizer2* base = get_normalizer(name, status);
+        if (U_FAILURE(status) || base == nullptr) {
+            throw Exception(ErrorCode::INVALID_ARGUMENT,
+                            "Failed to get normalizer instance for '" + name +
+                                    "': " + std::string(u_errorName(status)));
+        }
+
+        if (unicode_set_filter.empty()) {
+            // `base` is only borrowed here, hence the no-op deleter.
+            _normalizer =
+                    std::shared_ptr<const icu::Normalizer2>(base, [](const icu::Normalizer2*) {});
+            return;
+        }
+
+        // The set lives on the heap because FilteredNormalizer2 keeps a
+        // reference to it instead of copying it (see below).
+        auto filter_set = std::make_shared<icu::UnicodeSet>(
+                icu::UnicodeString::fromUTF8(unicode_set_filter), status);
+        if (U_FAILURE(status)) {
+            throw Exception(ErrorCode::INVALID_ARGUMENT, "Failed to parse unicode_set_filter: " +
+                                                                 std::string(u_errorName(status)));
+        }
+        if (filter_set->isEmpty()) {
+            // An empty set would filter nothing in; fall back to the plain normalizer.
+            _normalizer =
+                    std::shared_ptr<const icu::Normalizer2>(base, [](const icu::Normalizer2*) {});
+            return;
+        }
+        filter_set->freeze();
+
+        // FilteredNormalizer2 aliases both the wrapped normalizer and the filter
+        // set. Passing a stack-local set left the normalizer with a dangling
+        // reference as soon as initialize() returned. The deleter captures
+        // `filter_set`, so the set stays alive until the last holder of
+        // _normalizer (this factory or any filter it created) releases it.
+        _normalizer = std::shared_ptr<const icu::Normalizer2>(
+                new icu::FilteredNormalizer2(*base, *filter_set),
+                [filter_set](const icu::Normalizer2* filtered) { delete filtered; });
+    }
+
+    // Wraps `in` with a filter sharing this factory's normalizer.
+    // Throws Exception(INVALID_ARGUMENT) if initialize() has not succeeded yet.
+    TokenFilterPtr create(const TokenStreamPtr& in) override {
+        if (!_normalizer) {
+            throw Exception(ErrorCode::INVALID_ARGUMENT,
+                            "ICUNormalizerFilterFactory not initialized. Call initialize() first.");
+        }
+        return std::make_shared<ICUNormalizerFilter>(in, _normalizer);
+    }
+
+private:
+    // Maps a lower-case normalizer name to the ICU instance. Unknown names set
+    // `status` to U_ILLEGAL_ARGUMENT_ERROR and return nullptr.
+    static const icu::Normalizer2* get_normalizer(const std::string& name, UErrorCode& status) {
+        if (name == "nfc" || name == "nfkc" || name == "nfkc_cf") {
+            return icu::Normalizer2::getInstance(nullptr, name.c_str(), UNORM2_COMPOSE, status);
+        } else if (name == "nfd") {
+            return icu::Normalizer2::getNFDInstance(status);
+        } else if (name == "nfkd") {
+            return icu::Normalizer2::getNFKDInstance(status);
+        }
+        status = U_ILLEGAL_ARGUMENT_ERROR;
+        return nullptr;
+    }
+
+    std::shared_ptr<const icu::Normalizer2> _normalizer;
+};
+using ICUNormalizerFilterFactoryPtr = std::shared_ptr; + +} // namespace doris::segment_v2::inverted_index \ No newline at end of file diff --git a/be/src/olap/rowset/segment_v2/inverted_index_writer.cpp b/be/src/olap/rowset/segment_v2/inverted_index_writer.cpp index 1f5d8ab561d254..816c0819e01fc1 100644 --- a/be/src/olap/rowset/segment_v2/inverted_index_writer.cpp +++ b/be/src/olap/rowset/segment_v2/inverted_index_writer.cpp @@ -206,7 +206,7 @@ InvertedIndexColumnWriter::create_analyzer( template Status InvertedIndexColumnWriter::init_fulltext_index() { _inverted_index_ctx = std::make_shared( - get_custom_analyzer_string_from_properties(_index_meta->properties()), + get_analyzer_name_from_properties(_index_meta->properties()), get_inverted_index_parser_type_from_string( get_parser_string_from_properties(_index_meta->properties())), get_parser_mode_string_from_properties(_index_meta->properties()), diff --git a/be/src/olap/tablet_schema.h b/be/src/olap/tablet_schema.h index 429e102c9a2849..79147a65249a11 100644 --- a/be/src/olap/tablet_schema.h +++ b/be/src/olap/tablet_schema.h @@ -346,7 +346,8 @@ class TabletIndex : public MetadataAdder { void remove_parser_and_analyzer() { _properties.erase(INVERTED_INDEX_PARSER_KEY); _properties.erase(INVERTED_INDEX_PARSER_KEY_ALIAS); - _properties.erase(INVERTED_INDEX_CUSTOM_ANALYZER_KEY); +
_properties.erase(INVERTED_INDEX_ANALYZER_NAME_KEY); + _properties.erase(INVERTED_INDEX_NORMALIZER_NAME_KEY); } std::string field_pattern() const { diff --git a/be/src/runtime/index_policy/index_policy_mgr.cpp b/be/src/runtime/index_policy/index_policy_mgr.cpp index 035dd19f35e802..eeb5576330b184 100644 --- a/be/src/runtime/index_policy/index_policy_mgr.cpp +++ b/be/src/runtime/index_policy/index_policy_mgr.cpp @@ -20,10 +20,13 @@ #include #include #include +#include #include namespace doris { +const std::unordered_set IndexPolicyMgr::BUILTIN_NORMALIZERS = {"lowercase"}; + void IndexPolicyMgr::apply_policy_changes(const std::vector& policys_to_update, const std::vector& policys_to_delete) { LOG(INFO) << "Starting policy changes - " @@ -85,38 +88,49 @@ const Policys& IndexPolicyMgr::get_index_policys() { } // TODO: Potential high-concurrency bottleneck -segment_v2::inverted_index::CustomAnalyzerPtr IndexPolicyMgr::get_policy_by_name( - const std::string& name) { +AnalyzerPtr IndexPolicyMgr::get_policy_by_name(const std::string& name) { std::shared_lock lock(_mutex); - // Check if policy exists auto name_it = _name_to_id.find(name); if (name_it == _name_to_id.end()) { + if (is_builtin_normalizer(name)) { + return build_builtin_normalizer(name); + } throw Exception(ErrorCode::INVALID_ARGUMENT, "Policy not found with name: " + name); } - // Get policy by ID auto policy_it = _policys.find(name_it->second); if (policy_it == _policys.end()) { - throw Exception(ErrorCode::INVALID_ARGUMENT, "Policy not found with name: " + name); + throw Exception(ErrorCode::INVALID_ARGUMENT, "Policy not found with id: " + name); } - const auto& index_policy_analyzer = policy_it->second; + const auto& index_policy = policy_it->second; + if (index_policy.type == TIndexPolicyType::ANALYZER) { + return build_analyzer_from_policy(index_policy); + } else if (index_policy.type == TIndexPolicyType::NORMALIZER) { + return build_normalizer_from_policy(index_policy); + } + + throw 
Exception(ErrorCode::INVALID_ARGUMENT, "Policy not found with type: " + name); +} + +AnalyzerPtr IndexPolicyMgr::build_analyzer_from_policy(const TIndexPolicy& index_policy_analyzer) { segment_v2::inverted_index::CustomAnalyzerConfig::Builder builder; - // Process tokenizer auto tokenizer_it = index_policy_analyzer.properties.find(PROP_TOKENIZER); if (tokenizer_it == index_policy_analyzer.properties.end() || tokenizer_it->second.empty()) { - throw Exception(ErrorCode::INVALID_ARGUMENT, - "Invalid tokenizer configuration in policy: " + name); + throw Exception( + ErrorCode::INVALID_ARGUMENT, + "Invalid tokenizer configuration in policy: analyzer must have a tokenizer"); } - const auto& tokenzier_name = tokenizer_it->second; - if (_name_to_id.contains(tokenzier_name)) { - const auto& tokenizer_policy = _policys[_name_to_id[tokenzier_name]]; + + const auto& tokenizer_name = tokenizer_it->second; + if (_name_to_id.contains(tokenizer_name)) { + const auto& tokenizer_policy = _policys[_name_to_id[tokenizer_name]]; auto type_it = tokenizer_policy.properties.find(PROP_TYPE); if (type_it == tokenizer_policy.properties.end()) { throw Exception(ErrorCode::INVALID_ARGUMENT, - "Invalid tokenizer configuration in policy: " + tokenzier_name); + "Invalid tokenizer configuration in policy: " + tokenizer_name); } segment_v2::inverted_index::Settings settings; @@ -127,17 +141,15 @@ segment_v2::inverted_index::CustomAnalyzerPtr IndexPolicyMgr::get_policy_by_name } builder.with_tokenizer_config(type_it->second, settings); } else { - builder.with_tokenizer_config(tokenzier_name, {}); + builder.with_tokenizer_config(tokenizer_name, {}); } - // Process char filters process_filter_configs(index_policy_analyzer, PROP_CHAR_FILTER, "char filter", [&builder](const std::string& name, const segment_v2::inverted_index::Settings& settings) { builder.add_char_filter_config(name, settings); }); - // Process token filters process_filter_configs(index_policy_analyzer, PROP_TOKEN_FILTER, "token 
filter", [&builder](const std::string& name, const segment_v2::inverted_index::Settings& settings) { @@ -149,6 +161,27 @@ segment_v2::inverted_index::CustomAnalyzerPtr IndexPolicyMgr::get_policy_by_name custom_analyzer_config); } +AnalyzerPtr IndexPolicyMgr::build_normalizer_from_policy( + const TIndexPolicy& index_policy_normalizer) { + segment_v2::inverted_index::CustomNormalizerConfig::Builder builder; + + process_filter_configs(index_policy_normalizer, PROP_CHAR_FILTER, "char filter", + [&builder](const std::string& name, + const segment_v2::inverted_index::Settings& settings) { + builder.add_char_filter_config(name, settings); + }); + + process_filter_configs(index_policy_normalizer, PROP_TOKEN_FILTER, "token filter", + [&builder](const std::string& name, + const segment_v2::inverted_index::Settings& settings) { + builder.add_token_filter_config(name, settings); + }); + + auto custom_normalizer_config = builder.build(); + return segment_v2::inverted_index::CustomNormalizer::build_custom_normalizer( + custom_normalizer_config); +} + void IndexPolicyMgr::process_filter_configs( const TIndexPolicy& index_policy_analyzer, const std::string& prop_name, const std::string& error_prefix, @@ -192,4 +225,21 @@ void IndexPolicyMgr::process_filter_configs( } } +bool IndexPolicyMgr::is_builtin_normalizer(const std::string& name) { + return BUILTIN_NORMALIZERS.contains(name); +} + +AnalyzerPtr IndexPolicyMgr::build_builtin_normalizer(const std::string& name) { + using namespace segment_v2::inverted_index; + + if (name == "lowercase") { + CustomNormalizerConfig::Builder builder; + builder.add_token_filter_config("lowercase", Settings {}); + auto config = builder.build(); + return CustomNormalizer::build_custom_normalizer(config); + } + + throw Exception(ErrorCode::INVALID_ARGUMENT, "Unknown builtin normalizer: " + name); +} + } // namespace doris \ No newline at end of file diff --git a/be/src/runtime/index_policy/index_policy_mgr.h 
b/be/src/runtime/index_policy/index_policy_mgr.h index 707270930fe4cb..a30cec7bfdb9d3 100644 --- a/be/src/runtime/index_policy/index_policy_mgr.h +++ b/be/src/runtime/index_policy/index_policy_mgr.h @@ -20,12 +20,15 @@ #include #include +#include #include "olap/rowset/segment_v2/inverted_index/analyzer/custom_analyzer.h" +#include "olap/rowset/segment_v2/inverted_index/normalizer/custom_normalizer.h" namespace doris { using Policys = std::unordered_map; +using AnalyzerPtr = std::shared_ptr; class IndexPolicyMgr { public: @@ -36,13 +39,11 @@ class IndexPolicyMgr { const std::vector& policies_to_delete); const Policys& get_index_policys(); - segment_v2::inverted_index::CustomAnalyzerPtr get_policy_by_name(const std::string& name); + AnalyzerPtr get_policy_by_name(const std::string& name); private: - constexpr static auto PROP_TOKENIZER = "tokenizer"; - constexpr static auto PROP_CHAR_FILTER = "char_filter"; - constexpr static auto PROP_TOKEN_FILTER = "token_filter"; - constexpr static auto PROP_TYPE = "type"; + AnalyzerPtr build_analyzer_from_policy(const TIndexPolicy& index_policy_analyzer); + AnalyzerPtr build_normalizer_from_policy(const TIndexPolicy& index_policy_normalizer); void process_filter_configs( const TIndexPolicy& index_policy_analyzer, const std::string& prop_name, @@ -50,6 +51,16 @@ class IndexPolicyMgr { std::function add_config_func); + bool is_builtin_normalizer(const std::string& name); + AnalyzerPtr build_builtin_normalizer(const std::string& name); + + constexpr static auto PROP_TOKENIZER = "tokenizer"; + constexpr static auto PROP_CHAR_FILTER = "char_filter"; + constexpr static auto PROP_TOKEN_FILTER = "token_filter"; + constexpr static auto PROP_TYPE = "type"; + + static const std::unordered_set BUILTIN_NORMALIZERS; + std::shared_mutex _mutex; Policys _policys; diff --git a/be/src/vec/exprs/vmatch_predicate.cpp b/be/src/vec/exprs/vmatch_predicate.cpp index 83698f1e43db5e..33781be820f0a8 100644 --- a/be/src/vec/exprs/vmatch_predicate.cpp +++ 
b/be/src/vec/exprs/vmatch_predicate.cpp @@ -58,7 +58,7 @@ using namespace doris::segment_v2; VMatchPredicate::VMatchPredicate(const TExprNode& node) : VExpr(node) { _inverted_index_ctx = std::make_shared(); - _inverted_index_ctx->custom_analyzer = node.match_predicate.custom_analyzer; + _inverted_index_ctx->analyzer_name = node.match_predicate.analyzer_name; _inverted_index_ctx->parser_type = get_inverted_index_parser_type_from_string(node.match_predicate.parser_type); _inverted_index_ctx->parser_mode = node.match_predicate.parser_mode; diff --git a/be/src/vec/functions/function_string.cpp b/be/src/vec/functions/function_string.cpp index 17a660da377f27..01524ccf0ed0a9 100644 --- a/be/src/vec/functions/function_string.cpp +++ b/be/src/vec/functions/function_string.cpp @@ -1445,6 +1445,7 @@ void register_function_string(SimpleFunctionFactory& factory) { factory.register_function(); factory.register_function(); factory.register_function(); + factory.register_function(); factory.register_alias(FunctionLeft::name, "strleft"); factory.register_alias(FunctionRight::name, "strright"); diff --git a/be/src/vec/functions/function_string.h b/be/src/vec/functions/function_string.h index 343d0d28d45e81..fd8378b9d443e8 100644 --- a/be/src/vec/functions/function_string.h +++ b/be/src/vec/functions/function_string.h @@ -81,6 +81,9 @@ #endif #include +#include +#include +#include #include #include @@ -5258,5 +5261,168 @@ class FunctionCrc32Internal : public IFunction { } };
+// unicode_normalize(str, mode): returns `str` normalized with the ICU
+// Normalizer2 selected by the constant `mode` argument
+// (NFC, NFD, NFKC, NFKD or NFKC_CF; case-insensitive, surrounding blanks ignored).
+class FunctionUnicodeNormalize : public IFunction {
+public:
+    static constexpr auto name = "unicode_normalize";
+
+    static FunctionPtr create() { return std::make_shared<FunctionUnicodeNormalize>(); }
+
+    String get_name() const override { return name; }
+
+    size_t get_number_of_arguments() const override { return 2; }
+
+    // Both arguments must be string types; the result has the type of the first one.
+    DataTypePtr get_return_type_impl(const DataTypes& arguments) const override {
+        if (arguments.size() != 2 || !is_string_type(arguments[0]->get_primitive_type()) ||
+            !is_string_type(arguments[1]->get_primitive_type())) {
+            throw doris::Exception(ErrorCode::INVALID_ARGUMENT,
+                                   "Illegal type {} and {} of arguments of function {}",
+                                   arguments[0]->get_name(), arguments[1]->get_name(), get_name());
+        }
+        return arguments[0];
+    }
+
+    // The mode (argument index 1) has to be a constant so it can be resolved once in open().
+    ColumnNumbers get_arguments_that_are_always_constant() const override { return {1}; }
+
+    // Resolves the mode string to an ICU normalizer and stores it as function
+    // state for `scope`. THREAD_LOCAL scope is a no-op; execute_impl() reads the
+    // FRAGMENT_LOCAL state. Returns InvalidArgument for a non-constant or
+    // unsupported mode, or when ICU cannot provide the instance.
+    Status open(FunctionContext* context, FunctionContext::FunctionStateScope scope) override {
+        if (scope == FunctionContext::THREAD_LOCAL) {
+            return Status::OK();
+        }
+
+        if (!context->is_col_constant(1)) {
+            return Status::InvalidArgument(
+                    "The second argument 'mode' of function {} must be constant", get_name());
+        }
+
+        auto* const_col = context->get_constant_col(1);
+        auto mode_ref = const_col->column_ptr->get_data_at(0);
+        std::string mode = mode_ref.to_string();
+        trim(mode);
+
+        // Compare case-insensitively; `mode` keeps the user's spelling for messages.
+        std::string lower_mode;
+        lower_mode.reserve(mode.size());
+        for (char c : mode) {
+            lower_mode.push_back(static_cast<char>(std::tolower(static_cast<unsigned char>(c))));
+        }
+
+        UErrorCode status = U_ZERO_ERROR;
+        const icu::Normalizer2* normalizer = nullptr;
+
+        if (lower_mode == "nfc") {
+            normalizer = icu::Normalizer2::getInstance(nullptr, "nfc", UNORM2_COMPOSE, status);
+        } else if (lower_mode == "nfd") {
+            normalizer = icu::Normalizer2::getNFDInstance(status);
+        } else if (lower_mode == "nfkc") {
+            normalizer = icu::Normalizer2::getInstance(nullptr, "nfkc", UNORM2_COMPOSE, status);
+        } else if (lower_mode == "nfkd") {
+            normalizer = icu::Normalizer2::getNFKDInstance(status);
+        } else if (lower_mode == "nfkc_cf") {
+            normalizer = icu::Normalizer2::getInstance(nullptr, "nfkc_cf", UNORM2_COMPOSE, status);
+        } else {
+            return Status::InvalidArgument(
+                    "Invalid normalization mode '{}' for function {}. "
+                    "Supported modes: NFC, NFD, NFKC, NFKD, NFKC_CF",
+                    mode, get_name());
+        }
+
+        if (U_FAILURE(status) || normalizer == nullptr) {
+            return Status::InvalidArgument(
+                    "Failed to get normalizer instance for mode '{}' in function {}: {}", mode,
+                    get_name(), u_errorName(status));
+        }
+
+        auto state = std::make_shared<UnicodeNormalizeState>();
+        state->normalizer = normalizer;
+        context->set_function_state(scope, state);
+        return Status::OK();
+    }
+
+    // Normalizes every row of the first argument into a new ColumnString placed
+    // at `result`. Returns RuntimeError when open() did not install the state or
+    // the input is not a ColumnString.
+    Status execute_impl(FunctionContext* context, Block& block, const ColumnNumbers& arguments,
+                        uint32_t result, size_t input_rows_count) const override {
+        auto* state = reinterpret_cast<UnicodeNormalizeState*>(
+                context->get_function_state(FunctionContext::FRAGMENT_LOCAL));
+        if (state == nullptr || state->normalizer == nullptr) {
+            return Status::RuntimeError("unicode_normalize function state is not initialized");
+        }
+
+        ColumnPtr col =
+                block.get_by_position(arguments[0]).column->convert_to_full_column_if_const();
+        const auto* col_str = check_and_get_column<ColumnString>(col.get());
+        if (col_str == nullptr) {
+            return Status::RuntimeError("Illegal column {} of argument of function {}",
+                                        block.get_by_position(arguments[0]).column->get_name(),
+                                        get_name());
+        }
+
+        const auto& data = col_str->get_chars();
+        const auto& offsets = col_str->get_offsets();
+
+        auto res = ColumnString::create();
+        auto& res_data = res->get_chars();
+        auto& res_offsets = res->get_offsets();
+
+        size_t rows = offsets.size();
+        res_offsets.resize(rows);
+
+        // One scratch string is reused for all rows; normalize_one() overwrites it.
+        std::string tmp;
+        for (size_t i = 0; i < rows; ++i) {
+            // NOTE(review): for i == 0 this reads offsets[-1]; assumes the offsets
+            // array is padded so that slot reads as 0 -- confirm against ColumnString.
+            const char* begin = reinterpret_cast<const char*>(&data[offsets[i - 1]]);
+            size_t len = offsets[i] - offsets[i - 1];
+
+            normalize_one(state->normalizer, begin, len, tmp);
+            StringOP::push_value_string(tmp, i, res_data, res_offsets);
+        }
+
+        block.replace_by_position(result, std::move(res));
+        return Status::OK();
+    }
+
+private:
+    // Function state installed by open(); the pointer is not owned by the state.
+    struct UnicodeNormalizeState {
+        const icu::Normalizer2* normalizer = nullptr;
+    };
+
+    // Strips leading and trailing whitespace (per std::isspace) from `s` in place.
+    static void trim(std::string& s) {
+        size_t l = 0;
+        while (l < s.size() && std::isspace(static_cast<unsigned char>(s[l]))) {
+            ++l;
+        }
+        size_t r = s.size();
+        while (r > l && std::isspace(static_cast<unsigned char>(s[r - 1]))) {
+            --r;
+        }
+        if (l != 0 || r != s.size()) {
+            s = s.substr(l, r - l);
+        }
+    }
+
+    // Writes the normalized form of the UTF-8 bytes [input, input + length) into
+    // `output`, replacing whatever it held before. Input that is already
+    // normalized, or that ICU fails to normalize, is copied through unchanged.
+    static void normalize_one(const icu::Normalizer2* normalizer, const char* input, size_t length,
+                              std::string& output) {
+        if (length == 0) {
+            output.clear();
+            return;
+        }
+
+        icu::StringPiece sp(input, static_cast<int32_t>(length));
+        icu::UnicodeString src16 = icu::UnicodeString::fromUTF8(sp);
+
+        UErrorCode status = U_ZERO_ERROR;
+        UNormalizationCheckResult quick = normalizer->quickCheck(src16, status);
+        if (U_SUCCESS(status) && quick == UNORM_YES) {
+            output.assign(input, length);
+            return;
+        }
+
+        icu::UnicodeString result16;
+        status = U_ZERO_ERROR;
+        normalizer->normalize(src16, result16, status);
+        if (U_FAILURE(status)) {
+            output.assign(input, length);
+            return;
+        }
+
+        // toUTF8String() appends to its argument. The caller reuses one buffer
+        // across rows, so without this clear() each normalized row would be
+        // prefixed with the previous row's bytes.
+        output.clear();
+        result16.toUTF8String(output);
+    }
+};
+
#include "common/compile_check_avoid_end.h" } // namespace doris::vectorized diff --git a/be/src/vec/functions/function_tokenize.cpp b/be/src/vec/functions/function_tokenize.cpp index 02e597169ef636..7364947abecc38 100644 --- a/be/src/vec/functions/function_tokenize.cpp +++ b/be/src/vec/functions/function_tokenize.cpp @@ -152,8 +152,7 @@ Status FunctionTokenize::execute_impl(FunctionContext* /*context*/, Block& block if (!st.ok()) { return st; } - inverted_index_ctx.custom_analyzer = - get_custom_analyzer_string_from_properties(properties); + inverted_index_ctx.analyzer_name = get_analyzer_name_from_properties(properties); inverted_index_ctx.parser_type = get_inverted_index_parser_type_from_string( get_parser_string_from_properties(properties)); if (inverted_index_ctx.parser_type == InvertedIndexParserType::PARSER_UNKNOWN) { @@ -163,7 +162,7 @@ Status FunctionTokenize::execute_impl(FunctionContext* /*context*/, Block& block } // Special handling for PARSER_NONE: return original string as single token - if (inverted_index_ctx.custom_analyzer.empty() &&
+ if (inverted_index_ctx.analyzer_name.empty() && inverted_index_ctx.parser_type == InvertedIndexParserType::PARSER_NONE) { _do_tokenize_none(*col_left, dest_column_ptr); block.replace_by_position(result, std::move(dest_column_ptr)); diff --git a/be/src/vec/functions/match.cpp b/be/src/vec/functions/match.cpp index 8009374e4af3a9..c3fec54ddfa49c 100644 --- a/be/src/vec/functions/match.cpp +++ b/be/src/vec/functions/match.cpp @@ -181,7 +181,7 @@ std::vector FunctionMatchBase::analyse_query_str_token( } // parse is none and custom analyzer is empty mean no analyzer is set if (inverted_index_ctx->parser_type == InvertedIndexParserType::PARSER_NONE && - inverted_index_ctx->custom_analyzer.empty()) { + inverted_index_ctx->analyzer_name.empty()) { query_tokens.emplace_back(match_query_str); return query_tokens; } @@ -204,7 +204,7 @@ inline std::vector FunctionMatchBase::analyse_data_token( const auto& str_ref = string_col->get_data_at(current_src_array_offset); // parse is none and custom analyzer is empty mean no analyzer is set if (inverted_index_ctx->parser_type == InvertedIndexParserType::PARSER_NONE && - inverted_index_ctx->custom_analyzer.empty()) { + inverted_index_ctx->analyzer_name.empty()) { data_tokens.emplace_back(str_ref.to_string()); continue; } @@ -220,7 +220,7 @@ inline std::vector FunctionMatchBase::analyse_data_token( const auto& str_ref = string_col->get_data_at(current_block_row_idx); // parse is none and custom analyzer is empty mean no analyzer is set if (inverted_index_ctx->parser_type == InvertedIndexParserType::PARSER_NONE && - inverted_index_ctx->custom_analyzer.empty()) { + inverted_index_ctx->analyzer_name.empty()) { data_tokens.emplace_back(str_ref.to_string()); } else { auto reader = doris::segment_v2::inverted_index::InvertedIndexAnalyzer::create_reader( diff --git a/be/test/olap/inverted_index_parser_test.cpp b/be/test/olap/inverted_index_parser_test.cpp index 5b62b8fc4b3d9d..07520c2ef27b91 100644 --- 
a/be/test/olap/inverted_index_parser_test.cpp +++ b/be/test/olap/inverted_index_parser_test.cpp @@ -246,6 +246,23 @@ TEST_F(InvertedIndexParserTest, TestGetParserDictCompressionFromProperties) { EXPECT_EQ(get_parser_dict_compression_from_properties(properties), "false"); } +TEST_F(InvertedIndexParserTest, TestGetAnalyzerNameFromProperties) { + std::map properties; + + EXPECT_EQ(get_analyzer_name_from_properties(properties), ""); + + properties[INVERTED_INDEX_ANALYZER_NAME_KEY] = "my_analyzer"; + EXPECT_EQ(get_analyzer_name_from_properties(properties), "my_analyzer"); + + properties[INVERTED_INDEX_ANALYZER_NAME_KEY] = ""; + properties[INVERTED_INDEX_NORMALIZER_NAME_KEY] = "my_normalizer"; + EXPECT_EQ(get_analyzer_name_from_properties(properties), "my_normalizer"); + + properties[INVERTED_INDEX_ANALYZER_NAME_KEY] = "another_analyzer"; + properties[INVERTED_INDEX_NORMALIZER_NAME_KEY] = "another_normalizer"; + EXPECT_EQ(get_analyzer_name_from_properties(properties), "another_analyzer"); +} + // Test InvertedIndexCtx structure TEST_F(InvertedIndexParserTest, TestInvertedIndexCtxStructure) { InvertedIndexCtx ctx; diff --git a/be/test/olap/rowset/segment_v2/inverted_index/ananlyzer/analyzer_test.cpp b/be/test/olap/rowset/segment_v2/inverted_index/ananlyzer/analyzer_test.cpp index 46fc3077419df9..8c6e2328b934fb 100644 --- a/be/test/olap/rowset/segment_v2/inverted_index/ananlyzer/analyzer_test.cpp +++ b/be/test/olap/rowset/segment_v2/inverted_index/ananlyzer/analyzer_test.cpp @@ -159,7 +159,7 @@ TEST_F(AnalyzerTest, TestCreateAnalyzer) { // Test Case 1: Empty custom_analyzer, use builtin parser_type { InvertedIndexCtx ctx; - ctx.custom_analyzer = ""; + ctx.analyzer_name = ""; ctx.parser_type = InvertedIndexParserType::PARSER_STANDARD; ctx.parser_mode = ""; ctx.lower_case = INVERTED_INDEX_PARSER_TRUE; @@ -172,7 +172,7 @@ TEST_F(AnalyzerTest, TestCreateAnalyzer) { // Test Case 2: custom_analyzer is a builtin name (using one that doesn't need dict) { InvertedIndexCtx ctx; - 
ctx.custom_analyzer = INVERTED_INDEX_PARSER_ENGLISH; + ctx.analyzer_name = INVERTED_INDEX_PARSER_ENGLISH; ctx.parser_type = InvertedIndexParserType::PARSER_UNKNOWN; ctx.parser_mode = ""; ctx.lower_case = INVERTED_INDEX_PARSER_FALSE; @@ -195,7 +195,7 @@ TEST_F(AnalyzerTest, TestCreateAnalyzer) { for (const auto& [name, requires_dict] : builtin_names) { InvertedIndexCtx ctx; - ctx.custom_analyzer = name; + ctx.analyzer_name = name; ctx.parser_type = InvertedIndexParserType::PARSER_UNKNOWN; ctx.parser_mode = ""; ctx.lower_case = ""; @@ -227,7 +227,7 @@ TEST_F(AnalyzerTest, TestCreateAnalyzer) { for (const auto& [parser_type, requires_dict] : parser_types) { InvertedIndexCtx ctx; - ctx.custom_analyzer = ""; + ctx.analyzer_name = ""; ctx.parser_type = parser_type; ctx.parser_mode = ""; ctx.lower_case = ""; @@ -256,7 +256,7 @@ TEST_F(AnalyzerTest, TestCreateAnalyzerWithCustomPolicy) { // Test when index_policy_mgr is null - should throw exception { InvertedIndexCtx ctx; - ctx.custom_analyzer = "non_existent_custom"; + ctx.analyzer_name = "non_existent_custom"; ctx.parser_type = InvertedIndexParserType::PARSER_UNKNOWN; ctx.parser_mode = ""; ctx.lower_case = ""; @@ -287,7 +287,7 @@ TEST_F(AnalyzerTest, TestCreateAnalyzerWithCustomPolicy) { // Test successful custom analyzer retrieval { InvertedIndexCtx ctx; - ctx.custom_analyzer = "test_custom_analyzer"; + ctx.analyzer_name = "test_custom_analyzer"; ctx.parser_type = InvertedIndexParserType::PARSER_UNKNOWN; ctx.parser_mode = ""; ctx.lower_case = ""; @@ -300,7 +300,7 @@ TEST_F(AnalyzerTest, TestCreateAnalyzerWithCustomPolicy) { // Test non-existent custom analyzer throws exception { InvertedIndexCtx ctx; - ctx.custom_analyzer = "non_existent_analyzer"; + ctx.analyzer_name = "non_existent_analyzer"; ctx.parser_type = InvertedIndexParserType::PARSER_UNKNOWN; ctx.parser_mode = ""; ctx.lower_case = ""; @@ -316,7 +316,7 @@ TEST_F(AnalyzerTest, TestCreateAnalyzerWithCustomPolicy) { TEST_F(AnalyzerTest, TestAnalyzerFunctionality) 
{ // Create an analyzer and test it can tokenize text properly InvertedIndexCtx ctx; - ctx.custom_analyzer = ""; + ctx.analyzer_name = ""; ctx.parser_type = InvertedIndexParserType::PARSER_STANDARD; ctx.parser_mode = ""; ctx.lower_case = INVERTED_INDEX_PARSER_TRUE; diff --git a/be/test/olap/rowset/segment_v2/inverted_index/char_filter/icu_normalizer_char_filter_factory_test.cpp b/be/test/olap/rowset/segment_v2/inverted_index/char_filter/icu_normalizer_char_filter_factory_test.cpp new file mode 100644 index 00000000000000..40f6aabd59d3ab --- /dev/null +++ b/be/test/olap/rowset/segment_v2/inverted_index/char_filter/icu_normalizer_char_filter_factory_test.cpp @@ -0,0 +1,268 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include "olap/rowset/segment_v2/inverted_index/char_filter/icu_normalizer_char_filter_factory.h" + +#include +#include +#include + +#include +#include + +using namespace lucene::analysis; + +namespace doris::segment_v2::inverted_index { + +namespace { + +ReaderPtr make_reader(const std::string& text) { + auto reader = std::make_shared>(); + reader->init(text.data(), static_cast(text.size()), false); + return reader; +} + +std::string read_all(const ReaderPtr& reader) { + const void* data = nullptr; + int32_t len = reader->read(&data, 0, static_cast(reader->size())); + if (len <= 0 || data == nullptr) { + return {}; + } + return std::string(static_cast(data), len); +} + +std::string normalize_with_nfkc_cf(const std::string& text) { + UErrorCode status = U_ZERO_ERROR; + const icu::Normalizer2* normalizer = + icu::Normalizer2::getInstance(nullptr, "nfkc_cf", UNORM2_COMPOSE, status); + if (U_FAILURE(status) || normalizer == nullptr) { + return text; + } + + icu::UnicodeString src = icu::UnicodeString::fromUTF8(text); + icu::UnicodeString dst; + status = U_ZERO_ERROR; + normalizer->normalize(src, dst, status); + if (U_FAILURE(status)) { + return text; + } + + std::string result; + dst.toUTF8String(result); + return result; +} + +} // namespace + +class ICUNormalizerCharFilterFactoryTest : public ::testing::Test {}; + +TEST_F(ICUNormalizerCharFilterFactoryTest, DefaultNormalizationMatchesICU) { + std::string input = "Cafe\u0301 and co\uFB03ee"; + Settings settings; + + ICUNormalizerCharFilterFactory factory; + factory.initialize(settings); + + auto reader = make_reader(input); + auto filter = factory.create(reader); + filter->init(input.data(), static_cast(input.size()), false); + + std::string result = read_all(filter); + std::string expected = normalize_with_nfkc_cf(input); + + EXPECT_EQ(result, expected); +} + +TEST_F(ICUNormalizerCharFilterFactoryTest, AlreadyNormalizedQuickCheck) { + std::string input = "cafe"; + Settings settings; + + 
ICUNormalizerCharFilterFactory factory; + factory.initialize(settings); + + auto reader = make_reader(input); + auto filter = factory.create(reader); + filter->init(input.data(), static_cast(input.size()), false); + + std::string result = read_all(filter); + EXPECT_EQ(result, input); +} + +TEST_F(ICUNormalizerCharFilterFactoryTest, ComposeAndDecomposeModes) { + std::string input = "Cafe\u0301"; + + { + Settings settings; + settings.set("name", "nfc"); + settings.set("mode", "compose"); + + ICUNormalizerCharFilterFactory factory; + EXPECT_NO_THROW(factory.initialize(settings)); + + auto reader = make_reader(input); + auto filter = factory.create(reader); + filter->init(input.data(), static_cast(input.size()), false); + std::string result = read_all(filter); + EXPECT_FALSE(result.empty()); + } + + { + Settings settings; + settings.set("name", "nfc"); + settings.set("mode", "decompose"); + + ICUNormalizerCharFilterFactory factory; + EXPECT_NO_THROW(factory.initialize(settings)); + + auto reader = make_reader(input); + auto filter = factory.create(reader); + filter->init(input.data(), static_cast(input.size()), false); + std::string result = read_all(filter); + EXPECT_FALSE(result.empty()); + } +} + +TEST_F(ICUNormalizerCharFilterFactoryTest, InvalidModeThrows) { + Settings settings; + settings.set("mode", "invalid_mode"); + + ICUNormalizerCharFilterFactory factory; + EXPECT_THROW(factory.initialize(settings), Exception); +} + +TEST_F(ICUNormalizerCharFilterFactoryTest, InvalidNameThrows) { + Settings settings; + settings.set("name", "unknown_normalizer"); + + ICUNormalizerCharFilterFactory factory; + EXPECT_THROW(factory.initialize(settings), Exception); +} + +TEST_F(ICUNormalizerCharFilterFactoryTest, EmptyUnicodeSetFilterUsesBaseNormalizer) { + std::string input = "Cafe\u0301"; + + Settings settings; + settings.set("name", "nfkc_cf"); + settings.set("mode", "compose"); + settings.set("unicode_set_filter", ""); + + ICUNormalizerCharFilterFactory factory; + 
factory.initialize(settings); + + auto reader = make_reader(input); + auto filter = factory.create(reader); + filter->init(input.data(), static_cast(input.size()), false); + + std::string result = read_all(filter); + std::string expected = normalize_with_nfkc_cf(input); + EXPECT_EQ(result, expected); +} + +TEST_F(ICUNormalizerCharFilterFactoryTest, InvalidUnicodeSetFilterThrows) { + Settings settings; + settings.set("unicode_set_filter", "[invalid"); + + ICUNormalizerCharFilterFactory factory; + EXPECT_THROW(factory.initialize(settings), Exception); +} + +TEST_F(ICUNormalizerCharFilterFactoryTest, EmptyUnicodeSetFallsBackToBase) { + std::string input = "Cafe\u0301"; + + Settings settings; + settings.set("unicode_set_filter", "[]"); + + ICUNormalizerCharFilterFactory factory; + factory.initialize(settings); + + auto reader = make_reader(input); + auto filter = factory.create(reader); + filter->init(input.data(), static_cast(input.size()), false); + + std::string result = read_all(filter); + std::string expected = normalize_with_nfkc_cf(input); + EXPECT_EQ(result, expected); +} + +TEST_F(ICUNormalizerCharFilterFactoryTest, NonEmptyUnicodeSetFilterCreatesFilteredNormalizer) { + std::string input = "Cafe\u0301 123"; + + Settings settings; + settings.set("unicode_set_filter", "[A-Za-z]"); + + ICUNormalizerCharFilterFactory factory; + factory.initialize(settings); + + auto reader = make_reader(input); + auto filter = factory.create(reader); + filter->init(input.data(), static_cast(input.size()), false); + + std::string result = read_all(filter); + EXPECT_FALSE(result.empty()); +} + +TEST_F(ICUNormalizerCharFilterFactoryTest, CreateWithoutInitializeThrows) { + ICUNormalizerCharFilterFactory factory; + + auto reader = make_reader("test"); + EXPECT_THROW(factory.create(reader), Exception); +} + +TEST_F(ICUNormalizerCharFilterFactoryTest, EmptyInput) { + std::string input; + + Settings settings; + ICUNormalizerCharFilterFactory factory; + factory.initialize(settings); + + 
auto reader = make_reader(input); + auto filter = factory.create(reader); + + filter->init(input.data(), static_cast(input.size()), false); + + const void* data = nullptr; + int32_t len = filter->read(&data, 0, static_cast(filter->size())); + EXPECT_EQ(len, -1); + EXPECT_TRUE(data == nullptr || filter->size() == 0); +} + +TEST_F(ICUNormalizerCharFilterFactoryTest, NullNormalizerInFilterThrows) { + auto reader = make_reader("test"); + std::shared_ptr normalizer = nullptr; + EXPECT_THROW(ICUNormalizerCharFilter(reader, normalizer), Exception); +} + +TEST_F(ICUNormalizerCharFilterFactoryTest, InitializeOnlyFillsOnce) { + std::string input = "Cafe\u0301"; + + Settings settings; + ICUNormalizerCharFilterFactory factory; + factory.initialize(settings); + + auto reader = make_reader(input); + auto filter = std::dynamic_pointer_cast(factory.create(reader)); + ASSERT_NE(filter, nullptr); + + filter->initialize(); + std::string first = read_all(filter); + EXPECT_FALSE(first.empty()); + EXPECT_EQ(first, "café"); + + filter->initialize(); +} + +} // namespace doris::segment_v2::inverted_index \ No newline at end of file diff --git a/be/test/olap/rowset/segment_v2/inverted_index/token_filter/icu_normalizer_filter_factory_test.cpp b/be/test/olap/rowset/segment_v2/inverted_index/token_filter/icu_normalizer_filter_factory_test.cpp new file mode 100644 index 00000000000000..771d63adcd5063 --- /dev/null +++ b/be/test/olap/rowset/segment_v2/inverted_index/token_filter/icu_normalizer_filter_factory_test.cpp @@ -0,0 +1,295 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "olap/rowset/segment_v2/inverted_index/token_filter/icu_normalizer_filter_factory.h" + +#include +#include +#include + +#include +#include +#include + +#include "CLucene.h" +#include "olap/rowset/segment_v2/inverted_index/tokenizer/keyword/keyword_tokenizer_factory.h" +#include "olap/rowset/segment_v2/inverted_index/tokenizer/standard/standard_tokenizer_factory.h" + +using namespace lucene::analysis; + +namespace doris::segment_v2::inverted_index { + +namespace { + +TokenizerPtr create_tokenizer(const std::string& tokenizer_type, const std::string& text) { + auto reader = std::make_shared>(); + reader->init(text.data(), static_cast(text.size()), false); + + TokenizerPtr tokenizer; + Settings settings; + + if (tokenizer_type == "standard") { + StandardTokenizerFactory factory; + factory.initialize(settings); + tokenizer = factory.create(); + } else if (tokenizer_type == "keyword") { + KeywordTokenizerFactory factory; + factory.initialize(settings); + tokenizer = factory.create(); + } else { + throw std::invalid_argument("Unknown tokenizer type: " + tokenizer_type); + } + + tokenizer->set_reader(reader); + tokenizer->reset(); + return tokenizer; +} + +std::vector collect_tokens(TokenFilterPtr filter) { + std::vector tokens; + Token t; + while (filter->next(&t) != nullptr) { + tokens.emplace_back(t.termBuffer(), t.termLength()); + } + return tokens; +} + +std::string normalize_with_nfkc_cf(const std::string& text) { + UErrorCode status = U_ZERO_ERROR; + const icu::Normalizer2* normalizer = + icu::Normalizer2::getInstance(nullptr, 
"nfkc_cf", UNORM2_COMPOSE, status); + if (U_FAILURE(status) || normalizer == nullptr) { + return text; + } + + icu::UnicodeString src = icu::UnicodeString::fromUTF8(text); + icu::UnicodeString dst; + status = U_ZERO_ERROR; + normalizer->normalize(src, dst, status); + if (U_FAILURE(status)) { + return text; + } + + std::string result; + dst.toUTF8String(result); + return result; +} + +std::vector split_on_space(const std::string& text) { + std::vector parts; + std::string current; + for (char c : text) { + if (c == ' ') { + if (!current.empty()) { + parts.push_back(current); + current.clear(); + } + } else { + current.push_back(c); + } + } + if (!current.empty()) { + parts.push_back(current); + } + return parts; +} + +} // namespace + +class ICUNormalizerFilterFactoryTest : public ::testing::Test {}; + +TEST_F(ICUNormalizerFilterFactoryTest, DefaultNormalizationKeywordTokenizer) { + std::string input = "Cafe\u0301 and co\uFB03ee"; + + Settings settings; + ICUNormalizerFilterFactory factory; + factory.initialize(settings); + + auto tokenizer = create_tokenizer("keyword", input); + auto filter = factory.create(tokenizer); + + auto tokens = collect_tokens(filter); + ASSERT_EQ(tokens.size(), 1); + + std::string expected = normalize_with_nfkc_cf(input); + EXPECT_EQ(tokens[0], expected); +} + +TEST_F(ICUNormalizerFilterFactoryTest, DefaultNormalizationStandardTokenizer) { + std::string input = "Cafe\u0301 resume\u0301"; + + Settings settings; + ICUNormalizerFilterFactory factory; + factory.initialize(settings); + + auto tokenizer = create_tokenizer("standard", input); + auto filter = factory.create(tokenizer); + + auto tokens = collect_tokens(filter); + ASSERT_EQ(tokens.size(), 2); + + std::string normalized = normalize_with_nfkc_cf(input); + auto expected_tokens = split_on_space(normalized); + ASSERT_EQ(expected_tokens.size(), 2); + + EXPECT_EQ(tokens[0], expected_tokens[0]); + EXPECT_EQ(tokens[1], expected_tokens[1]); +} + +TEST_F(ICUNormalizerFilterFactoryTest, 
QuickCheckReturnsOriginal) { + std::string input = "abc123"; + + Settings settings; + ICUNormalizerFilterFactory factory; + factory.initialize(settings); + + auto tokenizer = create_tokenizer("keyword", input); + auto filter = factory.create(tokenizer); + + auto tokens = collect_tokens(filter); + ASSERT_EQ(tokens.size(), 1); + EXPECT_EQ(tokens[0], input); +} + +TEST_F(ICUNormalizerFilterFactoryTest, EmptyInputProducesNoTokens) { + std::string input; + + Settings settings; + ICUNormalizerFilterFactory factory; + factory.initialize(settings); + + auto tokenizer = create_tokenizer("keyword", input); + auto filter = factory.create(tokenizer); + + auto tokens = collect_tokens(filter); + EXPECT_TRUE(tokens.empty()); +} + +TEST_F(ICUNormalizerFilterFactoryTest, InvalidNameThrows) { + Settings settings; + settings.set("name", "unknown_normalizer"); + + ICUNormalizerFilterFactory factory; + EXPECT_THROW(factory.initialize(settings), Exception); +} + +TEST_F(ICUNormalizerFilterFactoryTest, EmptyUnicodeSetFilterUsesBaseNormalizer) { + std::string input = "Cafe\u0301"; + + Settings settings; + settings.set("unicode_set_filter", ""); + + ICUNormalizerFilterFactory factory; + factory.initialize(settings); + + auto tokenizer = create_tokenizer("keyword", input); + auto filter = factory.create(tokenizer); + + auto tokens = collect_tokens(filter); + ASSERT_EQ(tokens.size(), 1); + + std::string expected = normalize_with_nfkc_cf(input); + EXPECT_EQ(tokens[0], expected); +} + +TEST_F(ICUNormalizerFilterFactoryTest, InvalidUnicodeSetFilterThrows) { + Settings settings; + settings.set("unicode_set_filter", "[invalid"); + + ICUNormalizerFilterFactory factory; + EXPECT_THROW(factory.initialize(settings), Exception); +} + +TEST_F(ICUNormalizerFilterFactoryTest, EmptyUnicodeSetFallsBackToBase) { + std::string input = "Cafe\u0301"; + + Settings settings; + settings.set("unicode_set_filter", "[]"); + + ICUNormalizerFilterFactory factory; + factory.initialize(settings); + + auto tokenizer = 
create_tokenizer("keyword", input); + auto filter = factory.create(tokenizer); + + auto tokens = collect_tokens(filter); + ASSERT_EQ(tokens.size(), 1); + + std::string expected = normalize_with_nfkc_cf(input); + EXPECT_EQ(tokens[0], expected); +} + +TEST_F(ICUNormalizerFilterFactoryTest, NonEmptyUnicodeSetCreatesFilteredNormalizer) { + std::string input = "Cafe\u0301 123"; + + Settings settings; + settings.set("unicode_set_filter", "[A-Za-z]"); + + ICUNormalizerFilterFactory factory; + factory.initialize(settings); + + auto tokenizer = create_tokenizer("standard", input); + auto filter = factory.create(tokenizer); + + auto tokens = collect_tokens(filter); + EXPECT_GE(tokens.size(), 1u); +} + +TEST_F(ICUNormalizerFilterFactoryTest, CreateWithoutInitializeThrows) { + ICUNormalizerFilterFactory factory; + + auto tokenizer = create_tokenizer("keyword", "test"); + EXPECT_THROW(factory.create(tokenizer), Exception); +} + +TEST_F(ICUNormalizerFilterFactoryTest, NullNormalizerInFilterThrows) { + auto tokenizer = create_tokenizer("keyword", "test"); + std::shared_ptr normalizer = nullptr; + + EXPECT_THROW(ICUNormalizerFilter(tokenizer, normalizer), Exception); +} + +TEST_F(ICUNormalizerFilterFactoryTest, ResetResetsUnderlyingStream) { + std::string input = "Cafe\u0301 resume\u0301"; + + Settings settings; + ICUNormalizerFilterFactory factory; + factory.initialize(settings); + + auto tokenizer = create_tokenizer("standard", input); + auto filter = factory.create(tokenizer); + + Token t; + std::vector first_pass; + while (filter->next(&t) != nullptr) { + first_pass.emplace_back(t.termBuffer(), t.termLength()); + } + + auto reader2 = std::make_shared>(); + reader2->init(input.data(), static_cast(input.size()), false); + tokenizer->set_reader(reader2); + filter->reset(); + + std::vector second_pass; + while (filter->next(&t) != nullptr) { + second_pass.emplace_back(t.termBuffer(), t.termLength()); + } + + EXPECT_EQ(first_pass, second_pass); +} + +} // namespace 
doris::segment_v2::inverted_index \ No newline at end of file diff --git a/be/test/olap/rowset/segment_v2/inverted_index/util/reader_test.cpp b/be/test/olap/rowset/segment_v2/inverted_index/util/reader_test.cpp index 38b17251b8729f..8a3450e1e8423d 100644 --- a/be/test/olap/rowset/segment_v2/inverted_index/util/reader_test.cpp +++ b/be/test/olap/rowset/segment_v2/inverted_index/util/reader_test.cpp @@ -39,7 +39,7 @@ TEST(ReaderTest, ArrayFieldTokenStreamWorkflow) { // 正确创建 InvertedIndexCtx auto inverted_index_ctx = std::make_shared(); - inverted_index_ctx->custom_analyzer = ""; + inverted_index_ctx->analyzer_name = ""; inverted_index_ctx->parser_type = InvertedIndexParserType::PARSER_STANDARD; inverted_index_ctx->parser_mode = "standard"; inverted_index_ctx->support_phrase = "yes"; diff --git a/be/test/olap/tablet_schema_index_test.cpp b/be/test/olap/tablet_schema_index_test.cpp index 33472e9c4f0807..9402eba87d35fa 100644 --- a/be/test/olap/tablet_schema_index_test.cpp +++ b/be/test/olap/tablet_schema_index_test.cpp @@ -166,18 +166,18 @@ TEST_F(TabletSchemaIndexTest, TestUpdateIndexWithMultipleColumns) { TEST_F(TabletSchemaIndexTest, TestRemoveParserAndAnalyzer) { std::map properties = { {INVERTED_INDEX_PARSER_KEY, "english"}, - {INVERTED_INDEX_CUSTOM_ANALYZER_KEY, "my_analyzer"}}; + {INVERTED_INDEX_ANALYZER_NAME_KEY, "my_analyzer"}}; TabletIndex index = create_test_index_with_pb(1, IndexType::INVERTED, {100}, "suffix1", properties); EXPECT_TRUE(index.properties().contains(INVERTED_INDEX_PARSER_KEY)); - EXPECT_TRUE(index.properties().contains(INVERTED_INDEX_CUSTOM_ANALYZER_KEY)); + EXPECT_TRUE(index.properties().contains(INVERTED_INDEX_ANALYZER_NAME_KEY)); index.remove_parser_and_analyzer(); EXPECT_FALSE(index.properties().contains(INVERTED_INDEX_PARSER_KEY)); - EXPECT_FALSE(index.properties().contains(INVERTED_INDEX_CUSTOM_ANALYZER_KEY)); + EXPECT_FALSE(index.properties().contains(INVERTED_INDEX_ANALYZER_NAME_KEY)); } TEST_F(TabletSchemaIndexTest, 
TestIsSameExceptId) { diff --git a/be/test/vec/function/function_match_test.cpp b/be/test/vec/function/function_match_test.cpp index a6a8cc2e9e3de7..59868492dbe5d9 100644 --- a/be/test/vec/function/function_match_test.cpp +++ b/be/test/vec/function/function_match_test.cpp @@ -582,12 +582,12 @@ TEST(FunctionMatchTest, custom_analyzer_handling) { auto ctx = create_inverted_index_ctx(InvertedIndexParserType::PARSER_ENGLISH); // Test without custom analyzer - ctx.ctx->custom_analyzer = ""; + ctx.ctx->analyzer_name = ""; auto tokens1 = match_any.analyse_query_str_token(ctx.ctx.get(), "test query", "test_col"); EXPECT_GT(tokens1.size(), 0); // Test with custom analyzer (should be handled appropriately) - ctx.ctx->custom_analyzer = "custom_analyzer_name"; + ctx.ctx->analyzer_name = "custom_analyzer_name"; auto tokens2 = match_any.analyse_query_str_token(ctx.ctx.get(), "test query", "test_col"); // Custom analyzer handling would depend on implementation details EXPECT_GE(tokens2.size(), 0); diff --git a/be/test/vec/function/function_string_test.cpp b/be/test/vec/function/function_string_test.cpp index 6ce498cf6302cd..a0dbaa6a471534 100644 --- a/be/test/vec/function/function_string_test.cpp +++ b/be/test/vec/function/function_string_test.cpp @@ -3815,4 +3815,95 @@ TEST(function_string_test, function_sha1_test) { } } +TEST(function_string_test, function_unicode_normalize_nfc_basic) { + std::string func_name = "unicode_normalize"; + + InputTypeSet input_types = { + PrimitiveType::TYPE_VARCHAR, + Consted {PrimitiveType::TYPE_VARCHAR}, + }; + + std::string cafe_decomposed = std::string("Cafe\xCC\x81"); + std::string cafe_composed = std::string("Caf\xC3\xA9"); + + { + DataSet data_set = { + {{cafe_decomposed, std::string("NFC")}, cafe_composed}, + }; + static_cast(check_function(func_name, input_types, data_set)); + } + + { + DataSet data_set = { + {{cafe_composed, std::string("NFC")}, cafe_composed}, + }; + static_cast(check_function(func_name, input_types, data_set)); + } +} + 
+TEST(function_string_test, function_unicode_normalize_modes_and_trim) { + std::string func_name = "unicode_normalize"; + + InputTypeSet input_types = { + PrimitiveType::TYPE_VARCHAR, + Consted {PrimitiveType::TYPE_VARCHAR}, + }; + + std::string cafe_decomposed = std::string("Cafe\xCC\x81"); + std::string cafe_composed = std::string("Caf\xC3\xA9"); + + { + DataSet data_set = { + {{cafe_composed, std::string(" nFd ")}, cafe_decomposed}, + }; + static_cast(check_function(func_name, input_types, data_set)); + } + + { + DataSet data_set = { + {{std::string("ABC 123"), std::string(" nfkc_cf ")}, std::string("abc 123")}, + }; + static_cast(check_function(func_name, input_types, data_set)); + } + + { + DataSet data_set = { + {{std::string("plain-ascii"), std::string("NFKD")}, std::string("plain-ascii")}, + }; + static_cast(check_function(func_name, input_types, data_set)); + } +} + +TEST(function_string_test, function_unicode_normalize_mode_not_const) { + std::string func_name = "unicode_normalize"; + + InputTypeSet input_types = { + PrimitiveType::TYPE_VARCHAR, + PrimitiveType::TYPE_VARCHAR, + }; + + DataSet data_set = { + {{std::string("abc"), std::string("NFC")}, std::string("abc")}, + }; + + Status st = check_function(func_name, input_types, data_set); + EXPECT_NE(Status::OK(), st); +} + +TEST(function_string_test, function_unicode_normalize_invalid_mode) { + std::string func_name = "unicode_normalize"; + + InputTypeSet input_types = { + PrimitiveType::TYPE_VARCHAR, + Consted {PrimitiveType::TYPE_VARCHAR}, + }; + + DataSet data_set = { + {{std::string("abc"), std::string("INVALID_MODE")}, std::string("abc")}, + }; + + Status st = check_function(func_name, input_types, data_set); + EXPECT_NE(Status::OK(), st); +} + } // namespace doris::vectorized diff --git a/fe/fe-core/src/main/antlr4/org/apache/doris/nereids/DorisLexer.g4 b/fe/fe-core/src/main/antlr4/org/apache/doris/nereids/DorisLexer.g4 index 8cf376907f0881..c775fb866e17ca 100644 --- 
a/fe/fe-core/src/main/antlr4/org/apache/doris/nereids/DorisLexer.g4 +++ b/fe/fe-core/src/main/antlr4/org/apache/doris/nereids/DorisLexer.g4 @@ -376,6 +376,7 @@ ANN: 'ANN'; NO: 'NO'; NO_USE_MV: 'NO_USE_MV'; NON_NULLABLE: 'NON_NULLABLE'; +NORMALIZER: 'NORMALIZER'; NOT: 'NOT'; NULL: 'NULL'; NULLS: 'NULLS'; diff --git a/fe/fe-core/src/main/antlr4/org/apache/doris/nereids/DorisParser.g4 b/fe/fe-core/src/main/antlr4/org/apache/doris/nereids/DorisParser.g4 index 116264b1f8120f..34569aefbefc62 100644 --- a/fe/fe-core/src/main/antlr4/org/apache/doris/nereids/DorisParser.g4 +++ b/fe/fe-core/src/main/antlr4/org/apache/doris/nereids/DorisParser.g4 @@ -258,6 +258,8 @@ supportedCreateStatement name=identifier properties=propertyClause? #createIndexTokenFilter | CREATE INVERTED INDEX CHAR_FILTER (IF NOT EXISTS)? name=identifier properties=propertyClause? #createIndexCharFilter + | CREATE INVERTED INDEX NORMALIZER (IF NOT EXISTS)? + name=identifier properties=propertyClause? #createIndexNormalizer ; dictionaryColumnDefs: @@ -344,6 +346,7 @@ supportedDropStatement | DROP INVERTED INDEX TOKENIZER (IF EXISTS)? name=identifier #dropIndexTokenizer | DROP INVERTED INDEX TOKEN_FILTER (IF EXISTS)? name=identifier #dropIndexTokenFilter | DROP INVERTED INDEX CHAR_FILTER (IF EXISTS)? name=identifier #dropIndexCharFilter + | DROP INVERTED INDEX NORMALIZER (IF EXISTS)? 
name=identifier #dropIndexNormalizer ; supportedShowStatement @@ -483,6 +486,7 @@ supportedLoadStatement | SHOW INVERTED INDEX TOKENIZER #showIndexTokenizer | SHOW INVERTED INDEX TOKEN_FILTER #showIndexTokenFilter | SHOW INVERTED INDEX CHAR_FILTER #showIndexCharFilter + | SHOW INVERTED INDEX NORMALIZER #showIndexNormalizer ; supportedKillStatement @@ -2101,6 +2105,7 @@ nonReserved | NGRAM_BF | NO | NON_NULLABLE + | NORMALIZER | NULLS | OF | OFF diff --git a/fe/fe-core/src/main/java/org/apache/doris/analysis/InvertedIndexUtil.java b/fe/fe-core/src/main/java/org/apache/doris/analysis/InvertedIndexUtil.java index 40965d3b7b8304..43cef5f3dcdca4 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/analysis/InvertedIndexUtil.java +++ b/fe/fe-core/src/main/java/org/apache/doris/analysis/InvertedIndexUtil.java @@ -68,7 +68,8 @@ public class InvertedIndexUtil { public static String INVERTED_INDEX_DICT_COMPRESSION_KEY = "dict_compression"; - public static String INVERTED_INDEX_CUSTOM_ANALYZER_KEY = "analyzer"; + public static String INVERTED_INDEX_ANALYZER_NAME_KEY = "analyzer"; + public static String INVERTED_INDEX_NORMALIZER_NAME_KEY = "normalizer"; public static String INVERTED_INDEX_PARSER_FIELD_PATTERN_KEY = "field_pattern"; @@ -105,11 +106,6 @@ public static boolean getInvertedIndexSupportPhrase(Map properti return supportPhrase != null ? Boolean.parseBoolean(supportPhrase) : true; } - public static String getCustomAnalyzer(Map properties) { - String customAnalyzer = properties == null ? null : properties.get(INVERTED_INDEX_CUSTOM_ANALYZER_KEY); - return customAnalyzer != null ? customAnalyzer : ""; - } - public static Map getInvertedIndexCharFilter(Map properties) { if (properties == null) { return new HashMap<>(); @@ -157,9 +153,18 @@ public static String getInvertedIndexParserStopwords(Map propert return stopwrods != null ? stopwrods : ""; } - public static String getInvertedIndexCustomAnalyzer(Map properties) { - String customAnalyzer = properties == null ? 
null : properties.get(INVERTED_INDEX_CUSTOM_ANALYZER_KEY); - return customAnalyzer != null ? customAnalyzer : ""; + public static String getInvertedIndexAnalyzerName(Map properties) { + if (properties == null) { + return ""; + } + + String analyzerName = properties.get(INVERTED_INDEX_ANALYZER_NAME_KEY); + if (analyzerName != null && !analyzerName.isEmpty()) { + return analyzerName; + } + + String normalizerName = properties.get(INVERTED_INDEX_NORMALIZER_NAME_KEY); + return normalizerName != null ? normalizerName : ""; } public static void checkInvertedIndexParser(String indexColName, PrimitiveType colType, @@ -226,7 +231,8 @@ public static void checkInvertedIndexProperties(Map properties, INVERTED_INDEX_PARSER_LOWERCASE_KEY, INVERTED_INDEX_PARSER_STOPWORDS_KEY, INVERTED_INDEX_DICT_COMPRESSION_KEY, - INVERTED_INDEX_CUSTOM_ANALYZER_KEY, + INVERTED_INDEX_ANALYZER_NAME_KEY, + INVERTED_INDEX_NORMALIZER_NAME_KEY, INVERTED_INDEX_PARSER_FIELD_PATTERN_KEY )); @@ -249,20 +255,45 @@ public static void checkInvertedIndexProperties(Map properties, String lowerCase = properties.get(INVERTED_INDEX_PARSER_LOWERCASE_KEY); String stopWords = properties.get(INVERTED_INDEX_PARSER_STOPWORDS_KEY); String dictCompression = properties.get(INVERTED_INDEX_DICT_COMPRESSION_KEY); - String customAnalyzer = properties.get(INVERTED_INDEX_CUSTOM_ANALYZER_KEY); + String analyzerName = properties.get(INVERTED_INDEX_ANALYZER_NAME_KEY); + String normalizerName = properties.get(INVERTED_INDEX_NORMALIZER_NAME_KEY); - if (customAnalyzer != null && !customAnalyzer.isEmpty() && parser != null && !parser.isEmpty()) { - throw new AnalysisException("Cannot specify both 'parser' and 'custom_analyzer' properties"); + int configCount = 0; + if (analyzerName != null && !analyzerName.isEmpty()) { + configCount++; + } + if (parser != null && !parser.isEmpty()) { + configCount++; + } + if (normalizerName != null && !normalizerName.isEmpty()) { + configCount++; } - if (customAnalyzer != null && 
!customAnalyzer.isEmpty()) { + if (configCount > 1) { + throw new AnalysisException( + "Cannot specify more than one of 'analyzer', 'parser', or 'normalizer' properties. " + + "Please choose only one: " + + "'analyzer' for custom analyzer, " + + "'parser' for built-in parser, " + + "or 'normalizer' for text normalization without tokenization."); + } + + if (analyzerName != null && !analyzerName.isEmpty()) { try { - Env.getCurrentEnv().getIndexPolicyMgr().validateAnalyzerExists(customAnalyzer); + Env.getCurrentEnv().getIndexPolicyMgr().validateAnalyzerExists(analyzerName); } catch (DdlException e) { throw new AnalysisException("Invalid custom analyzer: " + e.getMessage()); } } + if (normalizerName != null && !normalizerName.isEmpty()) { + try { + Env.getCurrentEnv().getIndexPolicyMgr().validateNormalizerExists(normalizerName); + } catch (DdlException e) { + throw new AnalysisException("Invalid normalizer: " + e.getMessage()); + } + } + if (parser != null && !parser.matches("none|english|unicode|chinese|standard|icu|basic|ik")) { throw new AnalysisException("Invalid inverted index 'parser' value: " + parser + ", parser must be none, english, unicode, chinese, icu, basic or ik"); diff --git a/fe/fe-core/src/main/java/org/apache/doris/analysis/MatchPredicate.java b/fe/fe-core/src/main/java/org/apache/doris/analysis/MatchPredicate.java index 772f7fcbc33187..116ffc99201aea 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/analysis/MatchPredicate.java +++ b/fe/fe-core/src/main/java/org/apache/doris/analysis/MatchPredicate.java @@ -80,7 +80,7 @@ public TExprOpcode getOpcode() { private Map invertedIndexCharFilter; private boolean invertedIndexParserLowercase = true; private String invertedIndexParserStopwords = ""; - private String invertedIndexCustomAnalyzer = ""; + private String invertedIndexAnalyzerName = ""; private MatchPredicate() { // use for serde only @@ -105,7 +105,7 @@ public MatchPredicate(Operator op, Expr e1, Expr e2, Type retType, 
this.invertedIndexCharFilter = invertedIndex.getInvertedIndexCharFilter(); this.invertedIndexParserLowercase = invertedIndex.getInvertedIndexParserLowercase(); this.invertedIndexParserStopwords = invertedIndex.getInvertedIndexParserStopwords(); - this.invertedIndexCustomAnalyzer = invertedIndex.getInvertedIndexCustomAnalyzer(); + this.invertedIndexAnalyzerName = invertedIndex.getInvertedIndexAnalyzerName(); } fn = new Function(new FunctionName(op.name), Lists.newArrayList(e1.getType(), e2.getType()), retType, false, true, nullableMode); @@ -119,7 +119,7 @@ protected MatchPredicate(MatchPredicate other) { invertedIndexCharFilter = other.invertedIndexCharFilter; invertedIndexParserLowercase = other.invertedIndexParserLowercase; invertedIndexParserStopwords = other.invertedIndexParserStopwords; - invertedIndexCustomAnalyzer = other.invertedIndexCustomAnalyzer; + invertedIndexAnalyzerName = other.invertedIndexAnalyzerName; } @Override @@ -159,7 +159,7 @@ protected void toThrift(TExprNode msg) { msg.match_predicate.setCharFilterMap(invertedIndexCharFilter); msg.match_predicate.setParserLowercase(invertedIndexParserLowercase); msg.match_predicate.setParserStopwords(invertedIndexParserStopwords); - msg.match_predicate.setCustomAnalyzer(invertedIndexCustomAnalyzer); + msg.match_predicate.setAnalyzerName(invertedIndexAnalyzerName); } @Override diff --git a/fe/fe-core/src/main/java/org/apache/doris/catalog/BuiltinScalarFunctions.java b/fe/fe-core/src/main/java/org/apache/doris/catalog/BuiltinScalarFunctions.java index c07bdb3bfe87ea..4d3e0e07ac12c7 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/catalog/BuiltinScalarFunctions.java +++ b/fe/fe-core/src/main/java/org/apache/doris/catalog/BuiltinScalarFunctions.java @@ -510,6 +510,7 @@ import org.apache.doris.nereids.trees.expressions.functions.scalar.Uncompress; import org.apache.doris.nereids.trees.expressions.functions.scalar.Unhex; import org.apache.doris.nereids.trees.expressions.functions.scalar.UnhexNull; +import 
org.apache.doris.nereids.trees.expressions.functions.scalar.UnicodeNormalize; import org.apache.doris.nereids.trees.expressions.functions.scalar.Uniform; import org.apache.doris.nereids.trees.expressions.functions.scalar.UnixTimestamp; import org.apache.doris.nereids.trees.expressions.functions.scalar.Upper; @@ -1059,6 +1060,7 @@ public class BuiltinScalarFunctions implements FunctionHelper { scalar(UnixTimestamp.class, "unix_timestamp"), scalar(Upper.class, "ucase", "upper"), scalar(Uncompress.class, "uncompress"), + scalar(UnicodeNormalize.class, "unicode_normalize"), scalar(Uniform.class, "uniform"), scalar(UrlDecode.class, "url_decode"), scalar(UrlEncode.class, "url_encode"), diff --git a/fe/fe-core/src/main/java/org/apache/doris/catalog/Index.java b/fe/fe-core/src/main/java/org/apache/doris/catalog/Index.java index 35f4c2a14e4350..4e0254d6dc6ebc 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/catalog/Index.java +++ b/fe/fe-core/src/main/java/org/apache/doris/catalog/Index.java @@ -79,7 +79,8 @@ public Index(long indexId, String indexName, List columns, if (this.properties != null && !this.properties.isEmpty()) { if (this.properties.containsKey(InvertedIndexUtil.INVERTED_INDEX_PARSER_KEY) || this.properties.containsKey(InvertedIndexUtil.INVERTED_INDEX_PARSER_KEY_ALIAS) - || this.properties.containsKey(InvertedIndexUtil.INVERTED_INDEX_CUSTOM_ANALYZER_KEY)) { + || this.properties.containsKey(InvertedIndexUtil.INVERTED_INDEX_ANALYZER_NAME_KEY) + || this.properties.containsKey(InvertedIndexUtil.INVERTED_INDEX_NORMALIZER_NAME_KEY)) { String supportPhraseKey = InvertedIndexUtil .INVERTED_INDEX_SUPPORT_PHRASE_KEY; if (!this.properties.containsKey(supportPhraseKey)) { @@ -210,8 +211,8 @@ public boolean isLightAddIndexSupported(boolean enableAddIndexForNewData) { || (indexType == IndexDefinition.IndexType.INVERTED) || (indexType == IndexDefinition.IndexType.ANN); } - public String getInvertedIndexCustomAnalyzer() { - return 
InvertedIndexUtil.getInvertedIndexCustomAnalyzer(properties); + public String getInvertedIndexAnalyzerName() { + return InvertedIndexUtil.getInvertedIndexAnalyzerName(properties); } public String getComment() { @@ -382,11 +383,16 @@ public static void checkConflict(Collection indices, Set bloomFil } } + /** + * Returns whether this index is an analyzed inverted index, + * i.e. an inverted index with parser/analyzer/normalizer properties. + */ public boolean isAnalyzedInvertedIndex() { return indexType == IndexDefinition.IndexType.INVERTED && properties != null && (properties.containsKey(InvertedIndexUtil.INVERTED_INDEX_PARSER_KEY) || properties.containsKey(InvertedIndexUtil.INVERTED_INDEX_PARSER_KEY_ALIAS) - || properties.containsKey(InvertedIndexUtil.INVERTED_INDEX_CUSTOM_ANALYZER_KEY)); + || properties.containsKey(InvertedIndexUtil.INVERTED_INDEX_ANALYZER_NAME_KEY) + || properties.containsKey(InvertedIndexUtil.INVERTED_INDEX_NORMALIZER_NAME_KEY)); } } diff --git a/fe/fe-core/src/main/java/org/apache/doris/indexpolicy/ICUNormalizerCharFilterValidator.java b/fe/fe-core/src/main/java/org/apache/doris/indexpolicy/ICUNormalizerCharFilterValidator.java new file mode 100644 index 00000000000000..5dd070d312c119 --- /dev/null +++ b/fe/fe-core/src/main/java/org/apache/doris/indexpolicy/ICUNormalizerCharFilterValidator.java @@ -0,0 +1,63 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +package org.apache.doris.indexpolicy; + +import org.apache.doris.common.DdlException; + +import com.google.common.collect.ImmutableSet; + +import java.util.Map; +import java.util.Set; + +public class ICUNormalizerCharFilterValidator extends BasePolicyValidator { + private static final Set ALLOWED_PROPS = ImmutableSet.of( + "type", "name", "mode", "unicode_set_filter"); + + private static final Set VALID_NAMES = ImmutableSet.of( + "nfc", "nfd", "nfkc", "nfkd", "nfkc_cf"); + + private static final Set VALID_MODES = ImmutableSet.of( + "compose", "decompose"); + + public ICUNormalizerCharFilterValidator() { + super(ALLOWED_PROPS); + } + + @Override + protected String getTypeName() { + return "ICU normalizer char filter"; + } + + @Override + protected void validateSpecific(Map props) throws DdlException { + if (props.containsKey("name")) { + String name = props.get("name").toLowerCase(); + if (!VALID_NAMES.contains(name)) { + throw new DdlException("Invalid name '" + name + "' for ICU normalizer char filter. " + + "Supported names: " + VALID_NAMES + " (default: nfkc_cf)"); + } + } + if (props.containsKey("mode")) { + String mode = props.get("mode").toLowerCase(); + if (!VALID_MODES.contains(mode)) { + throw new DdlException("Invalid mode '" + mode + "' for ICU normalizer char filter. 
" + + "Supported modes: " + VALID_MODES + " (default: compose)"); + } + } + } +} diff --git a/fe/fe-core/src/main/java/org/apache/doris/indexpolicy/ICUNormalizerTokenFilterValidator.java b/fe/fe-core/src/main/java/org/apache/doris/indexpolicy/ICUNormalizerTokenFilterValidator.java new file mode 100644 index 00000000000000..5de774055c974c --- /dev/null +++ b/fe/fe-core/src/main/java/org/apache/doris/indexpolicy/ICUNormalizerTokenFilterValidator.java @@ -0,0 +1,52 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +package org.apache.doris.indexpolicy; + +import org.apache.doris.common.DdlException; + +import com.google.common.collect.ImmutableSet; + +import java.util.Map; +import java.util.Set; + +public class ICUNormalizerTokenFilterValidator extends BasePolicyValidator { + private static final Set ALLOWED_PROPS = + ImmutableSet.of("type", "name", "unicode_set_filter"); + private static final Set VALID_NAMES = ImmutableSet.of( + "nfc", "nfd", "nfkc", "nfkd", "nfkc_cf"); + + public ICUNormalizerTokenFilterValidator() { + super(ALLOWED_PROPS); + } + + @Override + protected String getTypeName() { + return "ICU normalizer filter"; + } + + @Override + protected void validateSpecific(Map props) throws DdlException { + if (props.containsKey("name")) { + String name = props.get("name").toLowerCase(); + if (!VALID_NAMES.contains(name)) { + throw new DdlException("Invalid name '" + name + "' for ICU normalizer filter. " + + "Supported names: " + VALID_NAMES + " (default: nfkc_cf)"); + } + } + } +} diff --git a/fe/fe-core/src/main/java/org/apache/doris/indexpolicy/IndexPolicy.java b/fe/fe-core/src/main/java/org/apache/doris/indexpolicy/IndexPolicy.java index 5242fdbf9dcf00..fb0338c8ccf460 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/indexpolicy/IndexPolicy.java +++ b/fe/fe-core/src/main/java/org/apache/doris/indexpolicy/IndexPolicy.java @@ -54,6 +54,7 @@ public class IndexPolicy implements Writable, GsonPostProcessable { public static final String PROP_TYPE = "type"; public static final String PROP_ANALYZER = "analyzer"; + public static final String PROP_NORMALIZER = "normalizer"; public static final String PROP_TOKENIZER = "tokenizer"; public static final String PROP_TOKEN_FILTER = "token_filter"; public static final String PROP_CHAR_FILTER = "char_filter"; @@ -62,14 +63,16 @@ public class IndexPolicy implements Writable, GsonPostProcessable { "empty", "ngram", "edge_ngram", "keyword", "standard", "char_group", "basic", "icu", "pinyin"); public static final Set 
BUILTIN_TOKEN_FILTERS = ImmutableSet.of( - "empty", "asciifolding", "word_delimiter", "lowercase", "pinyin"); + "empty", "asciifolding", "word_delimiter", "lowercase", "pinyin", "icu_normalizer"); public static final Set BUILTIN_CHAR_FILTERS = ImmutableSet.of( - "empty", "char_replace"); + "empty", "char_replace", "icu_normalizer"); public static final Set BUILTIN_ANALYZERS = ImmutableSet.of( "none", "standard", "unicode", "english", "chinese", "icu", "basic", "ik"); + public static final Set BUILTIN_NORMALIZERS = ImmutableSet.of("lowercase"); + private static final Logger LOG = LogManager.getLogger(IndexPolicy.class); @SerializedName(value = "id") diff --git a/fe/fe-core/src/main/java/org/apache/doris/indexpolicy/IndexPolicyMgr.java b/fe/fe-core/src/main/java/org/apache/doris/indexpolicy/IndexPolicyMgr.java index fb84f7392e0bb8..49f6ac985e5ade 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/indexpolicy/IndexPolicyMgr.java +++ b/fe/fe-core/src/main/java/org/apache/doris/indexpolicy/IndexPolicyMgr.java @@ -83,7 +83,6 @@ public List getCopiedIndexPolicies() { } public void validateAnalyzerExists(String analyzerName) throws DdlException { - // Allow built-in analyzers if (IndexPolicy.BUILTIN_ANALYZERS.contains(analyzerName)) { return; } @@ -105,6 +104,28 @@ public void validateAnalyzerExists(String analyzerName) throws DdlException { } } + public void validateNormalizerExists(String normalizerName) throws DdlException { + if (IndexPolicy.BUILTIN_NORMALIZERS.contains(normalizerName)) { + return; + } + + readLock(); + try { + IndexPolicy policy = nameToIndexPolicy.get(normalizerName); + if (policy == null) { + throw new DdlException("Normalizer '" + normalizerName + "' does not exist"); + } + if (policy.getType() != IndexPolicyTypeEnum.NORMALIZER) { + throw new DdlException("Policy '" + normalizerName + "' is not a normalizer"); + } + if (policy.isInvalid()) { + throw new DdlException("Normalizer '" + normalizerName + "' is invalid"); + } + } finally { + 
readUnlock(); + } + } + public void createIndexPolicy(boolean ifNotExists, String policyName, IndexPolicyTypeEnum type, Map properties) throws UserException { if (policyName == null || policyName.trim().isEmpty()) { @@ -177,6 +198,9 @@ private void validatePolicyProperties(IndexPolicyTypeEnum type, Map properties) throws D } } + private void validateNormalizerProperties(Map properties) throws DdlException { + if (properties.containsKey(IndexPolicy.PROP_TOKENIZER)) { + throw new DdlException("Normalizer cannot contain 'tokenizer' field"); + } + + String charFilters = properties.get(IndexPolicy.PROP_CHAR_FILTER); + String tokenFilters = properties.get(IndexPolicy.PROP_TOKEN_FILTER); + + if ((charFilters == null || charFilters.isEmpty()) + && (tokenFilters == null || tokenFilters.isEmpty())) { + throw new DdlException("Normalizer must contain at least one 'char_filter' or 'token_filter'"); + } + + if (charFilters != null && !charFilters.isEmpty()) { + for (String filter : charFilters.split(",\\s*")) { + validatePolicyReference(filter, IndexPolicyTypeEnum.CHAR_FILTER); + } + } + + if (tokenFilters != null && !tokenFilters.isEmpty()) { + for (String filter : tokenFilters.split(",\\s*")) { + validatePolicyReference(filter, IndexPolicyTypeEnum.TOKEN_FILTER); + } + } + + for (String key : properties.keySet()) { + if (!key.equals(IndexPolicy.PROP_CHAR_FILTER) + && !key.equals(IndexPolicy.PROP_TOKEN_FILTER)) { + throw new DdlException("Invalid normalizer property: '" + key + "'. 
Only '" + + IndexPolicy.PROP_CHAR_FILTER + "' and '" + IndexPolicy.PROP_TOKEN_FILTER + + "' are allowed."); + } + } + } + private void validatePolicyReference(String name, IndexPolicyTypeEnum expectedType) throws DdlException { if (expectedType == IndexPolicyTypeEnum.TOKENIZER @@ -299,6 +358,9 @@ private void validateTokenFilterProperties(Map properties) throw case "pinyin": validator = new PinyinTokenFilterValidator(); break; + case "icu_normalizer": + validator = new ICUNormalizerTokenFilterValidator(); + break; default: Set userFacingTypes = IndexPolicy.BUILTIN_TOKEN_FILTERS.stream() .filter(t -> !t.equals("empty")) @@ -322,6 +384,9 @@ private void validateCharFilterProperties(Map properties) throws case "char_replace": validator = new CharReplaceCharFilterValidator(); break; + case "icu_normalizer": + validator = new ICUNormalizerCharFilterValidator(); + break; default: Set userFacingTypes = IndexPolicy.BUILTIN_CHAR_FILTERS.stream() .filter(t -> !t.equals("empty")) @@ -345,6 +410,8 @@ public void dropIndexPolicy(boolean isIfExists, String indexPolicyName, } if (policyToDrop.getType() == IndexPolicyTypeEnum.ANALYZER) { checkAnalyzerNotUsedByIndex(policyToDrop.getName()); + } else if (policyToDrop.getType() == IndexPolicyTypeEnum.NORMALIZER) { + checkNormalizerNotUsedByIndex(policyToDrop.getName()); } if (policyToDrop.getType() == IndexPolicyTypeEnum.TOKENIZER || policyToDrop.getType() == IndexPolicyTypeEnum.TOKEN_FILTER @@ -382,41 +449,67 @@ private void checkAnalyzerNotUsedByIndex(String analyzerName) throws DdlExceptio } } + private void checkNormalizerNotUsedByIndex(String normalizerName) throws DdlException { + List databases = Env.getCurrentEnv().getInternalCatalog().getDbs(); + for (Database db : databases) { + List tables = db.getTables(); + for (Table table : tables) { + if (table instanceof OlapTable) { + OlapTable olapTable = (OlapTable) table; + for (Index index : olapTable.getIndexes()) { + Map properties = index.getProperties(); + if (properties != 
null + && normalizerName.equals(properties.get(IndexPolicy.PROP_NORMALIZER))) { + throw new DdlException("the normalizer " + normalizerName + " is used by index: " + + index.getIndexName() + " in table: " + + db.getFullName() + "." + table.getName()); + } + } + } + } + } + } + private void checkPolicyNotReferenced(IndexPolicy policy) throws DdlException { String policyName = policy.getName(); IndexPolicyTypeEnum policyType = policy.getType(); - for (IndexPolicy analyzerPolicy : idToIndexPolicy.values()) { - if (analyzerPolicy.getType() == IndexPolicyTypeEnum.ANALYZER) { - Map properties = analyzerPolicy.getProperties(); - if (policyType == IndexPolicyTypeEnum.TOKENIZER) { - String tokenizer = properties.get(IndexPolicy.PROP_TOKENIZER); - if (policyName.equals(tokenizer)) { - throw new DdlException("Cannot drop " + policyType + " policy '" + policyName - + "' as it is referenced by ANALYZER policy '" - + analyzerPolicy.getName() + "'"); - } - } else if (policyType == IndexPolicyTypeEnum.TOKEN_FILTER) { - String tokenFilters = properties.get(IndexPolicy.PROP_TOKEN_FILTER); - if (tokenFilters != null && !tokenFilters.isEmpty()) { - for (String filter : tokenFilters.split(",\\s*")) { - if (policyName.equals(filter)) { - throw new DdlException("Cannot drop " + policyType + " policy '" - + policyName + "' as it is referenced by ANALYZER policy '" - + analyzerPolicy.getName() + "'"); - } - } - } - } else if (policyType == IndexPolicyTypeEnum.CHAR_FILTER) { - String charFilters = properties.get(IndexPolicy.PROP_CHAR_FILTER); - if (charFilters != null && !charFilters.isEmpty()) { - for (String filter : charFilters.split(",\\s*")) { - if (policyName.equals(filter)) { - throw new DdlException("Cannot drop " + policyType + " policy '" - + policyName + "' as it is referenced by ANALYZER policy '" - + analyzerPolicy.getName() + "'"); - } - } - } + + for (IndexPolicy otherPolicy : idToIndexPolicy.values()) { + IndexPolicyTypeEnum otherType = otherPolicy.getType(); + + if 
(otherType != IndexPolicyTypeEnum.ANALYZER + && otherType != IndexPolicyTypeEnum.NORMALIZER) { + continue; + } + + Map properties = otherPolicy.getProperties(); + if (policyType == IndexPolicyTypeEnum.TOKENIZER + && otherType == IndexPolicyTypeEnum.ANALYZER) { + String tokenizer = properties.get(IndexPolicy.PROP_TOKENIZER); + if (policyName.equals(tokenizer)) { + throw new DdlException("Cannot drop " + policyType + " policy '" + policyName + + "' as it is referenced by " + otherType + " policy '" + + otherPolicy.getName() + "'"); + } + } else if (policyType == IndexPolicyTypeEnum.TOKEN_FILTER) { + checkFilterReference(policyName, policyType, otherType, otherPolicy, + properties.get(IndexPolicy.PROP_TOKEN_FILTER)); + } else if (policyType == IndexPolicyTypeEnum.CHAR_FILTER) { + checkFilterReference(policyName, policyType, otherType, otherPolicy, + properties.get(IndexPolicy.PROP_CHAR_FILTER)); + } + } + } + + private void checkFilterReference(String policyName, IndexPolicyTypeEnum policyType, + IndexPolicyTypeEnum referencingType, IndexPolicy referencingPolicy, + String filterList) throws DdlException { + if (filterList != null && !filterList.isEmpty()) { + for (String filter : filterList.split(",\\s*")) { + if (policyName.equals(filter)) { + throw new DdlException("Cannot drop " + policyType + " policy '" + policyName + + "' as it is referenced by " + referencingType + " policy '" + + referencingPolicy.getName() + "'"); } } } diff --git a/fe/fe-core/src/main/java/org/apache/doris/indexpolicy/IndexPolicyTypeEnum.java b/fe/fe-core/src/main/java/org/apache/doris/indexpolicy/IndexPolicyTypeEnum.java index acda67c9b8c0a9..86420e3c0fa133 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/indexpolicy/IndexPolicyTypeEnum.java +++ b/fe/fe-core/src/main/java/org/apache/doris/indexpolicy/IndexPolicyTypeEnum.java @@ -23,7 +23,7 @@ * Index policy type enum. 
**/ public enum IndexPolicyTypeEnum { - ANALYZER, TOKENIZER, TOKEN_FILTER, CHAR_FILTER; + ANALYZER, TOKENIZER, TOKEN_FILTER, CHAR_FILTER, NORMALIZER; public TIndexPolicyType toThrift() { switch (this) { @@ -31,6 +31,7 @@ public TIndexPolicyType toThrift() { case TOKENIZER: return TIndexPolicyType.TOKENIZER; case TOKEN_FILTER: return TIndexPolicyType.TOKEN_FILTER; case CHAR_FILTER: return TIndexPolicyType.CHAR_FILTER; + case NORMALIZER: return TIndexPolicyType.NORMALIZER; default: throw new IllegalStateException("Unknown type: " + this); } } diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/parser/LogicalPlanBuilder.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/parser/LogicalPlanBuilder.java index f9c3b2e2b2e753..dd6ff3d97b9ea4 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/nereids/parser/LogicalPlanBuilder.java +++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/parser/LogicalPlanBuilder.java @@ -191,6 +191,7 @@ import org.apache.doris.nereids.DorisParser.DropIndexCharFilterContext; import org.apache.doris.nereids.DorisParser.DropIndexClauseContext; import org.apache.doris.nereids.DorisParser.DropIndexContext; +import org.apache.doris.nereids.DorisParser.DropIndexNormalizerContext; import org.apache.doris.nereids.DorisParser.DropIndexTokenFilterContext; import org.apache.doris.nereids.DorisParser.DropIndexTokenizerContext; import org.apache.doris.nereids.DorisParser.DropMVContext; @@ -387,6 +388,7 @@ import org.apache.doris.nereids.DorisParser.ShowGrantsForUserContext; import org.apache.doris.nereids.DorisParser.ShowIndexAnalyzerContext; import org.apache.doris.nereids.DorisParser.ShowIndexCharFilterContext; +import org.apache.doris.nereids.DorisParser.ShowIndexNormalizerContext; import org.apache.doris.nereids.DorisParser.ShowIndexTokenFilterContext; import org.apache.doris.nereids.DorisParser.ShowIndexTokenizerContext; import org.apache.doris.nereids.DorisParser.ShowLastInsertContext; @@ -670,6 +672,7 @@ import 
org.apache.doris.nereids.trees.plans.commands.CreateFunctionCommand; import org.apache.doris.nereids.trees.plans.commands.CreateIndexAnalyzerCommand; import org.apache.doris.nereids.trees.plans.commands.CreateIndexCharFilterCommand; +import org.apache.doris.nereids.trees.plans.commands.CreateIndexNormalizerCommand; import org.apache.doris.nereids.trees.plans.commands.CreateIndexTokenFilterCommand; import org.apache.doris.nereids.trees.plans.commands.CreateIndexTokenizerCommand; import org.apache.doris.nereids.trees.plans.commands.CreateJobCommand; @@ -706,6 +709,7 @@ import org.apache.doris.nereids.trees.plans.commands.DropFunctionCommand; import org.apache.doris.nereids.trees.plans.commands.DropIndexAnalyzerCommand; import org.apache.doris.nereids.trees.plans.commands.DropIndexCharFilterCommand; +import org.apache.doris.nereids.trees.plans.commands.DropIndexNormalizerCommand; import org.apache.doris.nereids.trees.plans.commands.DropIndexTokenFilterCommand; import org.apache.doris.nereids.trees.plans.commands.DropIndexTokenizerCommand; import org.apache.doris.nereids.trees.plans.commands.DropJobCommand; @@ -807,6 +811,7 @@ import org.apache.doris.nereids.trees.plans.commands.ShowIndexAnalyzerCommand; import org.apache.doris.nereids.trees.plans.commands.ShowIndexCharFilterCommand; import org.apache.doris.nereids.trees.plans.commands.ShowIndexCommand; +import org.apache.doris.nereids.trees.plans.commands.ShowIndexNormalizerCommand; import org.apache.doris.nereids.trees.plans.commands.ShowIndexStatsCommand; import org.apache.doris.nereids.trees.plans.commands.ShowIndexTokenFilterCommand; import org.apache.doris.nereids.trees.plans.commands.ShowIndexTokenizerCommand; @@ -9168,6 +9173,17 @@ public LogicalPlan visitCreateIndexAnalyzer(CreateIndexAnalyzerContext ctx) { return new CreateIndexAnalyzerCommand(ifNotExists, policyName, properties); } + @Override + public LogicalPlan visitCreateIndexNormalizer(DorisParser.CreateIndexNormalizerContext ctx) { + boolean 
ifNotExists = ctx.IF() != null && ctx.NOT() != null && ctx.EXISTS() != null; + String normalizerName = ctx.name.getText(); + Map properties = ctx.properties != null + ? visitPropertyClause(ctx.properties) + : Maps.newHashMap(); + + return new CreateIndexNormalizerCommand(ifNotExists, normalizerName, properties); + } + @Override public LogicalPlan visitCreateIndexTokenizer(CreateIndexTokenizerContext ctx) { boolean ifNotExists = ctx.IF() != null; @@ -9203,6 +9219,14 @@ public LogicalPlan visitDropIndexAnalyzer(DropIndexAnalyzerContext ctx) { return new DropIndexAnalyzerCommand(policyName, ifExists); } + @Override + public LogicalPlan visitDropIndexNormalizer(DropIndexNormalizerContext ctx) { + String policyName = ctx.name.getText(); + boolean ifExists = ctx.IF() != null; + + return new DropIndexNormalizerCommand(policyName, ifExists); + } + @Override public LogicalPlan visitDropIndexTokenizer(DropIndexTokenizerContext ctx) { String policyName = ctx.name.getText(); @@ -9232,6 +9256,11 @@ public LogicalPlan visitShowIndexAnalyzer(ShowIndexAnalyzerContext ctx) { return new ShowIndexAnalyzerCommand(); } + @Override + public LogicalPlan visitShowIndexNormalizer(ShowIndexNormalizerContext ctx) { + return new ShowIndexNormalizerCommand(); + } + @Override public LogicalPlan visitShowIndexTokenizer(ShowIndexTokenizerContext ctx) { return new ShowIndexTokenizerCommand(); diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/scalar/UnicodeNormalize.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/scalar/UnicodeNormalize.java new file mode 100644 index 00000000000000..92e630fdcb873b --- /dev/null +++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/scalar/UnicodeNormalize.java @@ -0,0 +1,70 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. 
See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +package org.apache.doris.nereids.trees.expressions.functions.scalar; + +import org.apache.doris.catalog.FunctionSignature; +import org.apache.doris.nereids.trees.expressions.Expression; +import org.apache.doris.nereids.trees.expressions.functions.ExplicitlyCastableSignature; +import org.apache.doris.nereids.trees.expressions.functions.PropagateNullable; +import org.apache.doris.nereids.trees.expressions.shape.BinaryExpression; +import org.apache.doris.nereids.trees.expressions.visitor.ExpressionVisitor; +import org.apache.doris.nereids.types.StringType; +import org.apache.doris.nereids.types.VarcharType; + +import com.google.common.base.Preconditions; +import com.google.common.collect.ImmutableList; + +import java.util.List; + +/** + * ScalarFunction 'unicode_normalize'. 
+ */
+public class UnicodeNormalize extends ScalarFunction
+        implements BinaryExpression, ExplicitlyCastableSignature, PropagateNullable {
+
+    public static final List<FunctionSignature> SIGNATURES = ImmutableList.of(
+            FunctionSignature.ret(VarcharType.SYSTEM_DEFAULT)
+                    .args(VarcharType.SYSTEM_DEFAULT, VarcharType.SYSTEM_DEFAULT),
+            FunctionSignature.ret(StringType.INSTANCE)
+                    .args(StringType.INSTANCE, StringType.INSTANCE)
+    );
+
+    public UnicodeNormalize(Expression arg0, Expression arg1) {
+        super("unicode_normalize", arg0, arg1);
+    }
+
+    private UnicodeNormalize(ScalarFunctionParams functionParams) {
+        super(functionParams);
+    }
+
+    @Override
+    public UnicodeNormalize withChildren(List<Expression> children) {
+        Preconditions.checkArgument(children.size() == 2);
+        return new UnicodeNormalize(getFunctionParams(children));
+    }
+
+    @Override
+    public List<FunctionSignature> getSignatures() {
+        return SIGNATURES;
+    }
+
+    @Override
+    public <R, C> R accept(ExpressionVisitor<R, C> visitor, C context) {
+        return visitor.visitUnicodeNormalize(this, context);
+    }
+}
diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/visitor/ScalarFunctionVisitor.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/visitor/ScalarFunctionVisitor.java
index 9e2a9708f3742c..cb382bc1409aa3 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/visitor/ScalarFunctionVisitor.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/visitor/ScalarFunctionVisitor.java
@@ -511,6 +511,7 @@
 import org.apache.doris.nereids.trees.expressions.functions.scalar.Uncompress;
 import org.apache.doris.nereids.trees.expressions.functions.scalar.Unhex;
 import org.apache.doris.nereids.trees.expressions.functions.scalar.UnhexNull;
+import org.apache.doris.nereids.trees.expressions.functions.scalar.UnicodeNormalize;
 import org.apache.doris.nereids.trees.expressions.functions.scalar.Uniform;
 import org.apache.doris.nereids.trees.expressions.functions.scalar.UnixTimestamp;
 import 
org.apache.doris.nereids.trees.expressions.functions.scalar.Upper; @@ -2680,4 +2681,8 @@ default R visitPeriodAdd(PeriodAdd periodAdd, C context) { default R visitPeriodDiff(PeriodDiff periodDiff, C context) { return visitScalarFunction(periodDiff, context); } + + default R visitUnicodeNormalize(UnicodeNormalize func, C context) { + return visitScalarFunction(func, context); + } } diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/plans/PlanType.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/plans/PlanType.java index 181b7e4083a578..6eec63cc2c47d6 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/plans/PlanType.java +++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/plans/PlanType.java @@ -461,5 +461,8 @@ public enum PlanType { CREATE_INDEX_CHAR_FILTER_COMMAND, DROP_INDEX_CHAR_FILTER_COMMAND, SHOW_INDEX_CHAR_FILTER_COMMAND, + CREATE_INDEX_NORMALIZER_COMMAND, + DROP_INDEX_NORMALIZER_COMMAND, + SHOW_INDEX_NORMALIZER_COMMAND, EMPTY_COMMAND } diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/plans/commands/CreateIndexNormalizerCommand.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/plans/commands/CreateIndexNormalizerCommand.java new file mode 100644 index 00000000000000..932daae8c04585 --- /dev/null +++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/plans/commands/CreateIndexNormalizerCommand.java @@ -0,0 +1,67 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +package org.apache.doris.nereids.trees.plans.commands; + +import org.apache.doris.catalog.Env; +import org.apache.doris.common.ErrorCode; +import org.apache.doris.common.ErrorReport; +import org.apache.doris.common.FeNameFormat; +import org.apache.doris.indexpolicy.IndexPolicyTypeEnum; +import org.apache.doris.mysql.privilege.PrivPredicate; +import org.apache.doris.nereids.trees.plans.PlanType; +import org.apache.doris.nereids.trees.plans.visitor.PlanVisitor; +import org.apache.doris.qe.ConnectContext; +import org.apache.doris.qe.StmtExecutor; + +import java.util.Map; + +/** + * CREATE INVERTED INDEX NORMALIZER [IF NOT EXISTS] policy_name PROPERTIES (key1 = value1, ...) 
+ */
+public class CreateIndexNormalizerCommand extends Command implements ForwardWithSync {
+    private final boolean ifNotExists;
+    private final String normalizerName;
+    private final Map<String, String> properties;
+
+    public CreateIndexNormalizerCommand(boolean ifNotExists, String normalizerName,
+            Map<String, String> properties) {
+        super(PlanType.CREATE_INDEX_NORMALIZER_COMMAND);
+        this.ifNotExists = ifNotExists;
+        this.normalizerName = normalizerName;
+        this.properties = properties;
+    }
+
+    @Override
+    public void run(ConnectContext ctx, StmtExecutor executor) throws Exception {
+        if (!Env.getCurrentEnv().getAccessManager().checkGlobalPriv(ConnectContext.get(),
+                PrivPredicate.ADMIN)) {
+            ErrorReport.reportAnalysisException(ErrorCode.ERR_SPECIFIC_ACCESS_DENIED_ERROR,
+                    "ADMIN");
+        }
+
+        FeNameFormat.checkCommonName("normalizer", normalizerName);
+
+        Env.getCurrentEnv().getIndexPolicyMgr().createIndexPolicy(
+                ifNotExists, normalizerName, IndexPolicyTypeEnum.NORMALIZER, properties);
+    }
+
+    @Override
+    public <R, C> R accept(PlanVisitor<R, C> visitor, C context) {
+        return visitor.visitCreateIndexNormalizerCommand(this, context);
+    }
+}
diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/plans/commands/DropIndexNormalizerCommand.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/plans/commands/DropIndexNormalizerCommand.java
new file mode 100644
index 00000000000000..1ac7d738e65e9c
--- /dev/null
+++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/plans/commands/DropIndexNormalizerCommand.java
@@ -0,0 +1,63 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. 
You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+package org.apache.doris.nereids.trees.plans.commands;
+
+import org.apache.doris.catalog.Env;
+import org.apache.doris.common.ErrorCode;
+import org.apache.doris.common.ErrorReport;
+import org.apache.doris.common.FeNameFormat;
+import org.apache.doris.indexpolicy.IndexPolicyTypeEnum;
+import org.apache.doris.mysql.privilege.PrivPredicate;
+import org.apache.doris.nereids.trees.plans.PlanType;
+import org.apache.doris.nereids.trees.plans.visitor.PlanVisitor;
+import org.apache.doris.qe.ConnectContext;
+import org.apache.doris.qe.StmtExecutor;
+
+/**
+ * DROP INVERTED INDEX NORMALIZER [IF EXISTS] policy_name
+ **/
+public class DropIndexNormalizerCommand extends DropCommand {
+    private final boolean ifExists;
+    private final String name;
+
+    public DropIndexNormalizerCommand(String name, boolean ifExists) {
+        super(PlanType.DROP_INDEX_NORMALIZER_COMMAND);
+        this.name = name;
+        this.ifExists = ifExists;
+    }
+
+    @Override
+    public void doRun(ConnectContext ctx, StmtExecutor executor) throws Exception {
+        // check auth: the global ADMIN privilege is required
+        if (!Env.getCurrentEnv().getAccessManager().checkGlobalPriv(ConnectContext.get(),
+                PrivPredicate.ADMIN)) {
+            ErrorReport.reportAnalysisException(ErrorCode.ERR_SPECIFIC_ACCESS_DENIED_ERROR, "ADMIN");
+        }
+
+        // check name
+        FeNameFormat.checkIndexPolicyName(name);
+
+        Env.getCurrentEnv().getIndexPolicyMgr().dropIndexPolicy(ifExists, name,
+                IndexPolicyTypeEnum.NORMALIZER);
+    }
+
+    @Override
+    public <R, C> R accept(PlanVisitor<R, C> visitor, C context) {
+        return visitor.visitDropIndexNormalizerCommand(this, context);
+    }
+}
diff --git 
a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/plans/commands/ShowIndexNormalizerCommand.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/plans/commands/ShowIndexNormalizerCommand.java new file mode 100644 index 00000000000000..322f535a0c7a6c --- /dev/null +++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/plans/commands/ShowIndexNormalizerCommand.java @@ -0,0 +1,61 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+
+package org.apache.doris.nereids.trees.plans.commands;
+
+import org.apache.doris.catalog.Env;
+import org.apache.doris.common.ErrorCode;
+import org.apache.doris.common.ErrorReport;
+import org.apache.doris.indexpolicy.IndexPolicy;
+import org.apache.doris.indexpolicy.IndexPolicyTypeEnum;
+import org.apache.doris.mysql.privilege.PrivPredicate;
+import org.apache.doris.nereids.trees.plans.PlanType;
+import org.apache.doris.nereids.trees.plans.visitor.PlanVisitor;
+import org.apache.doris.qe.ConnectContext;
+import org.apache.doris.qe.ShowResultSet;
+import org.apache.doris.qe.ShowResultSetMetaData;
+import org.apache.doris.qe.StmtExecutor;
+
+/**
+ * SHOW INVERTED INDEX NORMALIZER;
+ **/
+public class ShowIndexNormalizerCommand extends ShowCommand {
+    public ShowIndexNormalizerCommand() {
+        super(PlanType.SHOW_INDEX_NORMALIZER_COMMAND);
+    }
+
+    @Override
+    public ShowResultSet doRun(ConnectContext ctx, StmtExecutor executor) throws Exception {
+        // check auth: the global ADMIN privilege is required
+        if (!Env.getCurrentEnv().getAccessManager().checkGlobalPriv(ConnectContext.get(),
+                PrivPredicate.ADMIN)) {
+            ErrorReport.reportAnalysisException(ErrorCode.ERR_SPECIFIC_ACCESS_DENIED_ERROR, "ADMIN");
+        }
+
+        return Env.getCurrentEnv().getIndexPolicyMgr().showIndexPolicy(IndexPolicyTypeEnum.NORMALIZER);
+    }
+
+    @Override
+    public <R, C> R accept(PlanVisitor<R, C> visitor, C context) {
+        return visitor.visitShowIndexNormalizerCommand(this, context);
+    }
+
+    @Override
+    public ShowResultSetMetaData getMetaData() {
+        return IndexPolicy.INDEX_POLICY_META_DATA;
+    }
+}
diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/plans/commands/info/IndexDefinition.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/plans/commands/info/IndexDefinition.java
index 828b7c4e58202c..4a9f604c9cfdce 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/plans/commands/info/IndexDefinition.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/plans/commands/info/IndexDefinition.java
@@ 
-212,7 +212,8 @@ public void checkColumn(ColumnDefinition column, KeysType keysType, && indexType == IndexType.INVERTED && properties != null && (properties.containsKey(InvertedIndexUtil.INVERTED_INDEX_PARSER_KEY) || properties.containsKey(InvertedIndexUtil.INVERTED_INDEX_PARSER_KEY_ALIAS) - || properties.containsKey(InvertedIndexUtil.INVERTED_INDEX_CUSTOM_ANALYZER_KEY))) { + || properties.containsKey(InvertedIndexUtil.INVERTED_INDEX_ANALYZER_NAME_KEY) + || properties.containsKey(InvertedIndexUtil.INVERTED_INDEX_NORMALIZER_NAME_KEY))) { throw new AnalysisException("INVERTED index with parser can NOT be used in value columns of" + " UNIQUE_KEYS table with merge_on_write disable. invalid index: " + name); } @@ -491,11 +492,16 @@ private void parseAndValidateProperty(Map properties, String key } } + /** + * Returns whether this index is an analyzed inverted index, + * i.e. an inverted index with parser/analyzer/normalizer properties. + */ public boolean isAnalyzedInvertedIndex() { return indexType == IndexType.INVERTED && properties != null - && (properties.containsKey(InvertedIndexUtil.INVERTED_INDEX_PARSER_KEY) - || properties.containsKey(InvertedIndexUtil.INVERTED_INDEX_PARSER_KEY_ALIAS) - || properties.containsKey(InvertedIndexUtil.INVERTED_INDEX_CUSTOM_ANALYZER_KEY)); + && (properties.containsKey(InvertedIndexUtil.INVERTED_INDEX_PARSER_KEY) + || properties.containsKey(InvertedIndexUtil.INVERTED_INDEX_PARSER_KEY_ALIAS) + || properties.containsKey(InvertedIndexUtil.INVERTED_INDEX_ANALYZER_NAME_KEY) + || properties.containsKey(InvertedIndexUtil.INVERTED_INDEX_NORMALIZER_NAME_KEY)); } } diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/plans/visitor/CommandVisitor.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/plans/visitor/CommandVisitor.java index 3fb4b826d83acc..aafc928ab8b053 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/plans/visitor/CommandVisitor.java +++ 
b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/plans/visitor/CommandVisitor.java @@ -80,6 +80,7 @@ import org.apache.doris.nereids.trees.plans.commands.CreateFunctionCommand; import org.apache.doris.nereids.trees.plans.commands.CreateIndexAnalyzerCommand; import org.apache.doris.nereids.trees.plans.commands.CreateIndexCharFilterCommand; +import org.apache.doris.nereids.trees.plans.commands.CreateIndexNormalizerCommand; import org.apache.doris.nereids.trees.plans.commands.CreateIndexTokenFilterCommand; import org.apache.doris.nereids.trees.plans.commands.CreateIndexTokenizerCommand; import org.apache.doris.nereids.trees.plans.commands.CreateJobCommand; @@ -115,6 +116,7 @@ import org.apache.doris.nereids.trees.plans.commands.DropFunctionCommand; import org.apache.doris.nereids.trees.plans.commands.DropIndexAnalyzerCommand; import org.apache.doris.nereids.trees.plans.commands.DropIndexCharFilterCommand; +import org.apache.doris.nereids.trees.plans.commands.DropIndexNormalizerCommand; import org.apache.doris.nereids.trees.plans.commands.DropIndexTokenFilterCommand; import org.apache.doris.nereids.trees.plans.commands.DropIndexTokenizerCommand; import org.apache.doris.nereids.trees.plans.commands.DropJobCommand; @@ -216,6 +218,7 @@ import org.apache.doris.nereids.trees.plans.commands.ShowIndexAnalyzerCommand; import org.apache.doris.nereids.trees.plans.commands.ShowIndexCharFilterCommand; import org.apache.doris.nereids.trees.plans.commands.ShowIndexCommand; +import org.apache.doris.nereids.trees.plans.commands.ShowIndexNormalizerCommand; import org.apache.doris.nereids.trees.plans.commands.ShowIndexStatsCommand; import org.apache.doris.nereids.trees.plans.commands.ShowIndexTokenFilterCommand; import org.apache.doris.nereids.trees.plans.commands.ShowIndexTokenizerCommand; @@ -1423,6 +1426,11 @@ default R visitCreateIndexAnalyzerCommand( return visitCommand(createIndexAnalyzerCommand, context); } + default R visitCreateIndexNormalizerCommand( + 
CreateIndexNormalizerCommand createIndexNormalizerCommand, C context) { + return visitCommand(createIndexNormalizerCommand, context); + } + default R visitCreateIndexCharFilterCommand( CreateIndexCharFilterCommand createIndexCharFilterCommand, C context) { return visitCommand(createIndexCharFilterCommand, context); @@ -1443,6 +1451,11 @@ default R visitDropIndexAnalyzerCommand( return visitCommand(dropIndexAnalyzerCommand, context); } + default R visitDropIndexNormalizerCommand( + DropIndexNormalizerCommand dropIndexNormalizerCommand, C context) { + return visitCommand(dropIndexNormalizerCommand, context); + } + default R visitDropIndexCharFilterCommand( DropIndexCharFilterCommand dropIndexCharFilterCommand, C context) { return visitCommand(dropIndexCharFilterCommand, context); @@ -1467,6 +1480,11 @@ default R visitShowIndexAnalyzerCommand( return visitCommand(showIndexAnalyzerCommand, context); } + default R visitShowIndexNormalizerCommand( + ShowIndexNormalizerCommand showIndexNormalizerCommand, C context) { + return visitCommand(showIndexNormalizerCommand, context); + } + default R visitShowIndexCharFilterCommand( ShowIndexCharFilterCommand showIndexCharFilterCommand, C context) { return visitCommand(showIndexCharFilterCommand, context); diff --git a/gensrc/thrift/AgentService.thrift b/gensrc/thrift/AgentService.thrift index 6870263c51d9fc..f32273dc73e6a8 100644 --- a/gensrc/thrift/AgentService.thrift +++ b/gensrc/thrift/AgentService.thrift @@ -139,7 +139,8 @@ enum TIndexPolicyType { ANALYZER, TOKENIZER, TOKEN_FILTER, - CHAR_FILTER + CHAR_FILTER, + NORMALIZER } struct TIndexPolicy { diff --git a/gensrc/thrift/Exprs.thrift b/gensrc/thrift/Exprs.thrift index 3c69243d537f04..137818634cbc47 100644 --- a/gensrc/thrift/Exprs.thrift +++ b/gensrc/thrift/Exprs.thrift @@ -171,7 +171,7 @@ struct TMatchPredicate { 3: optional map char_filter_map; 4: optional bool parser_lowercase = true; 5: optional string parser_stopwords = ""; - 6: optional string custom_analyzer = ""; + 
6: optional string analyzer_name = ""; } struct TLiteralPredicate { diff --git a/regression-test/data/inverted_index_p0/analyzer/test_custom_normalizer.out b/regression-test/data/inverted_index_p0/analyzer/test_custom_normalizer.out new file mode 100644 index 00000000000000..ebccdec7789ed0 --- /dev/null +++ b/regression-test/data/inverted_index_p0/analyzer/test_custom_normalizer.out @@ -0,0 +1,15 @@ +-- This file is automatically generated. You should know what you did if you want to edit this +-- !tokenize_normalizer_nfc_1 -- +[{\n "token": "café"\n }] + +-- !tokenize_normalizer_nfc_2 -- +[{\n "token": "café"\n }] + +-- !tokenize_normalizer_alpha -- +[{\n "token": "café 123"\n }] + +-- !sql_match_cafe -- +1 Café +2 Café +3 CAFÉ + diff --git a/regression-test/data/inverted_index_p0/analyzer/test_unicode_normalize.out b/regression-test/data/inverted_index_p0/analyzer/test_unicode_normalize.out new file mode 100644 index 00000000000000..f2245845988f9f --- /dev/null +++ b/regression-test/data/inverted_index_p0/analyzer/test_unicode_normalize.out @@ -0,0 +1,19 @@ +-- This file is automatically generated. You should know what you did if you want to edit this +-- !unicode_normalize_nfc_1 -- +436166C3A9 + +-- !unicode_normalize_nfc_2 -- +436166C3A9 + +-- !unicode_normalize_nfd -- +43616665CC81 + +-- !unicode_normalize_nfkc_cf -- +abc 123 + +-- !unicode_normalize_nfkd_ascii -- +plain-ascii + +-- !unicode_normalize_nulls -- +\N + diff --git a/regression-test/suites/inverted_index_p0/analyzer/test_custom_normalizer.groovy b/regression-test/suites/inverted_index_p0/analyzer/test_custom_normalizer.groovy new file mode 100644 index 00000000000000..bac9b0ff4d8e0b --- /dev/null +++ b/regression-test/suites/inverted_index_p0/analyzer/test_custom_normalizer.groovy @@ -0,0 +1,131 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. 
See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +import java.sql.SQLException + +suite("test_custom_normalizer", "p0") { + sql """ + CREATE INVERTED INDEX TOKEN_FILTER IF NOT EXISTS nfkc_cf_token_filter + PROPERTIES + ( + "type" = "icu_normalizer", + "name" = "nfkc_cf" + ); + """ + + sql """ + CREATE INVERTED INDEX TOKEN_FILTER IF NOT EXISTS nfkc_cf_alpha_token_filter + PROPERTIES + ( + "type" = "icu_normalizer", + "name" = "nfkc_cf", + "unicode_set_filter" = "[A-Za-z]" + ); + """ + + sql """ + CREATE INVERTED INDEX NORMALIZER IF NOT EXISTS nfkc_cf_normalizer + PROPERTIES + ( + "token_filter" = "nfkc_cf_token_filter" + ); + """ + + sql """ + CREATE INVERTED INDEX NORMALIZER IF NOT EXISTS nfkc_cf_alpha_normalizer + PROPERTIES + ( + "token_filter" = "nfkc_cf_alpha_token_filter" + ); + """ + + sql """ select sleep(10) """ + + qt_tokenize_normalizer_nfc_1 """ + select tokenize( + cast(unhex('43616665CC81') as string), + '"normalizer"="nfkc_cf_normalizer"' + ); + """ + + qt_tokenize_normalizer_nfc_2 """ + select tokenize( + cast(unhex('436166C3A9') as string), + '"normalizer"="nfkc_cf_normalizer"' + ); + """ + + qt_tokenize_normalizer_alpha """ + select tokenize( + 'Café 123', + '"normalizer"="nfkc_cf_alpha_normalizer"' + ); + """ + + sql "DROP TABLE IF EXISTS test_custom_normalizer_tbl" + sql """ + CREATE TABLE 
test_custom_normalizer_tbl ( + `id` bigint NOT NULL, + `content` text NULL, + INDEX idx_content (`content`) USING INVERTED + PROPERTIES("normalizer" = "nfkc_cf_normalizer") + ) ENGINE=OLAP + DUPLICATE KEY(`id`) + DISTRIBUTED BY RANDOM BUCKETS 1 + PROPERTIES ( + "replication_allocation" = "tag.location.default: 1" + ); + """ + + sql """ + INSERT INTO test_custom_normalizer_tbl VALUES + (1, cast(unhex('43616665CC81') as string)), -- Cafe + 组合重音 + (2, cast(unhex('436166C3A9') as string)), -- Caf + 预组合 é + (3, 'CAFÉ'); + """ + + try { + sql "sync" + sql """ set enable_common_expr_pushdown = true; """ + + qt_sql_match_cafe """ + SELECT id, content + FROM test_custom_normalizer_tbl + WHERE content MATCH 'café' + ORDER BY id; + """ + } finally { + } + + test { + sql """ + CREATE TABLE test_custom_normalizer_invalid ( + `id` int, + `content` text, + INDEX idx (`content`) USING INVERTED + PROPERTIES( + "analyzer" = "basic", + "normalizer" = "nfkc_cf_normalizer" + ) + ) ENGINE=OLAP + DUPLICATE KEY(`id`) + DISTRIBUTED BY RANDOM BUCKETS 1 + PROPERTIES("replication_allocation" = "tag.location.default: 1"); + """ + exception "Cannot specify more than one of 'analyzer', 'parser', or 'normalizer'" + } +} \ No newline at end of file diff --git a/regression-test/suites/inverted_index_p0/analyzer/test_unicode_normalize.groovy b/regression-test/suites/inverted_index_p0/analyzer/test_unicode_normalize.groovy new file mode 100644 index 00000000000000..aecef862b3ef94 --- /dev/null +++ b/regression-test/suites/inverted_index_p0/analyzer/test_unicode_normalize.groovy @@ -0,0 +1,79 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +suite("test_unicode_normalize", "p0") { + sql "set batch_size = 4096;" + + qt_unicode_normalize_nfc_1 """ + select hex(unicode_normalize(cast(unhex('43616665CC81') as string), 'NFC')); + """ + qt_unicode_normalize_nfc_2 """ + select hex(unicode_normalize(cast(unhex('436166C3A9') as string), 'NFC')); + """ + + qt_unicode_normalize_nfd """ + select hex(unicode_normalize(cast(unhex('436166C3A9') as string), ' nFd ')); + """ + + qt_unicode_normalize_nfkc_cf """ + select unicode_normalize('ABC 123', ' nfkc_cf '); + """ + + qt_unicode_normalize_nfkd_ascii """ + select unicode_normalize('plain-ascii', 'NFKD'); + """ + + qt_unicode_normalize_nulls """ + select + unicode_normalize(NULL, 'NFC'), + unicode_normalize('', 'NFC'); + """ + + sql "DROP TABLE IF EXISTS test_unicode_normalize_not_const" + + sql """ + CREATE TABLE test_unicode_normalize_not_const ( + id int, + col string, + mode string + ) ENGINE=OLAP + DUPLICATE KEY(id) + DISTRIBUTED BY RANDOM BUCKETS 1 + PROPERTIES("replication_allocation" = "tag.location.default: 1"); + """ + + sql """ + INSERT INTO test_unicode_normalize_not_const VALUES + (1, 'Abc', 'NFKC_CF'), + (2, 'Def', 'NFD'); + """ + + test { + sql """ + select unicode_normalize(col, mode) + from test_unicode_normalize_not_const; + """ + exception "must be constant" + } + + test { + sql """ + select unicode_normalize('abc', 'INVALID_MODE'); + """ + exception "Invalid normalization mode" + } +} \ No newline at end of file