Cherry-pick release:1.23.0 PRs to rel-1.23.0 (microsoft#25985)

snnn · yuslepukhin · adrianlizarraga · web-flow · commit fa3f6e3625e4 · 2025-09-08T12:48:11.000-07:00
This PR cherry-picks the following PRs to the rel-1.23.0 branch: * microsoft#25938 * microsoft#25957 * microsoft#25960 * microsoft#25968 * microsoft#25971 --------- Co-authored-by: Dmitri Smirnov <yuslepukhin@users.noreply.github.com> Co-authored-by: Adrian Lizarraga <adlizarraga@microsoft.com> Co-authored-by: Hariharan Seshadri <shariharan91@gmail.com>
diff --git a/.github/workflows/android.yml b/.github/workflows/android.yml
@@ -78,8 +78,8 @@ jobs:
       run: |
         set -e -x
         BINARY_SIZE_THRESHOLD_ARGS=""
-        echo "Binary size threshold in bytes: 1306224"
-        BINARY_SIZE_THRESHOLD_ARGS="--threshold_size_in_bytes 1306224"
+        echo "Binary size threshold in bytes: 1436672"
+        BINARY_SIZE_THRESHOLD_ARGS="--threshold_size_in_bytes 1436672"
 
         # Ensure ANDROID_NDK_HOME is available and get its real path
         if [ -z "$ANDROID_NDK_HOME" ]; then
diff --git a/include/onnxruntime/core/framework/ortmemoryinfo.h b/include/onnxruntime/core/framework/ortmemoryinfo.h
@@ -13,18 +13,14 @@ struct OrtMemoryInfo {
   OrtMemoryInfo() = default;  // to allow default construction of Tensor
 
   // use string for name, so we could have customized allocator in execution provider.
-  const char* name = nullptr;
+  std::string name;
   OrtMemType mem_type = OrtMemTypeDefault;
   OrtAllocatorType alloc_type = OrtInvalidAllocator;
   OrtDevice device;
 
-  constexpr OrtMemoryInfo(const char* name_, OrtAllocatorType type_, OrtDevice device_ = OrtDevice(),
-                          OrtMemType mem_type_ = OrtMemTypeDefault)
-#if ((defined(__GNUC__) && __GNUC__ > 4) || defined(__clang__))
-      // this causes a spurious error in CentOS gcc 4.8 build so disable if GCC version < 5
-      __attribute__((nonnull))
-#endif
-      : name(name_),
+  OrtMemoryInfo(std::string name_, OrtAllocatorType type_, OrtDevice device_ = OrtDevice(),
+                OrtMemType mem_type_ = OrtMemTypeDefault)
+      : name(std::move(name_)),
         mem_type(mem_type_),
         alloc_type(type_),
         device(device_) {
@@ -39,7 +35,7 @@ struct OrtMemoryInfo {
     if (device != other.device)
       return device < other.device;
 
-    return strcmp(name, other.name) < 0;
+    return name < other.name;
   }
 
   // This is to make OrtMemoryInfo a valid key in hash tables
@@ -68,7 +64,7 @@ inline bool operator==(const OrtMemoryInfo& left, const OrtMemoryInfo& other) {
   return left.mem_type == other.mem_type &&
          left.alloc_type == other.alloc_type &&
          left.device == other.device &&
-         strcmp(left.name, other.name) == 0;
+         left.name == other.name;
 }
 
 inline bool operator!=(const OrtMemoryInfo& lhs, const OrtMemoryInfo& rhs) { return !(lhs == rhs); }
diff --git a/onnxruntime/core/framework/allocator.cc b/onnxruntime/core/framework/allocator.cc
@@ -6,6 +6,7 @@
 #include "core/common/safeint.h"
 #include "core/common/status.h"
 #include "core/framework/allocator.h"
+#include "core/framework/error_code_helper.h"
 #include "core/mlas/inc/mlas.h"
 #include "core/framework/utils.h"
 #include "core/session/ort_apis.h"
@@ -185,22 +186,32 @@ std::ostream& operator<<(std::ostream& out, const OrtMemoryInfo& info) { return
 #endif
 ORT_API_STATUS_IMPL(OrtApis::CreateMemoryInfo, _In_ const char* name1, enum OrtAllocatorType type, int id1,
                     enum OrtMemType mem_type1, _Outptr_ OrtMemoryInfo** out) {
+  API_IMPL_BEGIN
+
+  if (name1 == nullptr) {
+    return OrtApis::CreateStatus(ORT_INVALID_ARGUMENT, "MemoryInfo name cannot be null.");
+  }
+
+  if (out == nullptr) {
+    return OrtApis::CreateStatus(ORT_INVALID_ARGUMENT, "Output memory info cannot be null.");
+  }
+
   auto device_id = static_cast<OrtDevice::DeviceId>(id1);
   if (strcmp(name1, onnxruntime::CPU) == 0) {
     *out = new OrtMemoryInfo(onnxruntime::CPU, type, OrtDevice(), mem_type1);
   } else if (strcmp(name1, onnxruntime::CUDA) == 0) {
     *out = new OrtMemoryInfo(
-        name1, type,
+        onnxruntime::CUDA, type,
         OrtDevice(OrtDevice::GPU, OrtDevice::MemType::DEFAULT, OrtDevice::VendorIds::NVIDIA, device_id),
         mem_type1);
   } else if (strcmp(name1, onnxruntime::OpenVINO_GPU) == 0) {
     *out = new OrtMemoryInfo(
-        name1, type,
+        onnxruntime::OpenVINO_GPU, type,
         OrtDevice(OrtDevice::GPU, OrtDevice::MemType::DEFAULT, OrtDevice::VendorIds::INTEL, device_id),
         mem_type1);
   } else if (strcmp(name1, onnxruntime::HIP) == 0) {
     *out = new OrtMemoryInfo(
-        name1, type,
+        onnxruntime::HIP, type,
         OrtDevice(OrtDevice::GPU, OrtDevice::MemType::DEFAULT, OrtDevice::VendorIds::AMD, device_id),
         mem_type1);
   } else if (strcmp(name1, onnxruntime::WEBGPU_BUFFER) == 0 ||
@@ -212,45 +223,56 @@ ORT_API_STATUS_IMPL(OrtApis::CreateMemoryInfo, _In_ const char* name1, enum OrtA
 
   } else if (strcmp(name1, onnxruntime::DML) == 0) {
     *out = new OrtMemoryInfo(
-        name1, type,
+        onnxruntime::DML, type,
         OrtDevice(OrtDevice::GPU, OrtDevice::MemType::DEFAULT, OrtDevice::VendorIds::MICROSOFT, device_id),
         mem_type1);
   } else if (strcmp(name1, onnxruntime::OpenVINO_RT_NPU) == 0) {
     *out = new OrtMemoryInfo(
-        name1, type,
+        onnxruntime::OpenVINO_RT_NPU, type,
         OrtDevice(OrtDevice::NPU, OrtDevice::MemType::DEFAULT, OrtDevice::VendorIds::INTEL, device_id),
         mem_type1);
   } else if (strcmp(name1, onnxruntime::CUDA_PINNED) == 0) {
     *out = new OrtMemoryInfo(
-        name1, type,
+        onnxruntime::CUDA_PINNED, type,
         OrtDevice(OrtDevice::GPU, OrtDevice::MemType::HOST_ACCESSIBLE, OrtDevice::VendorIds::NVIDIA, device_id),
         mem_type1);
   } else if (strcmp(name1, onnxruntime::HIP_PINNED) == 0) {
     *out = new OrtMemoryInfo(
-        name1, type,
+        onnxruntime::HIP_PINNED, type,
         OrtDevice(OrtDevice::GPU, OrtDevice::MemType::HOST_ACCESSIBLE, OrtDevice::VendorIds::AMD, device_id),
         mem_type1);
   } else if (strcmp(name1, onnxruntime::QNN_HTP_SHARED) == 0) {
     *out = new OrtMemoryInfo(
-        name1, type,
+        onnxruntime::QNN_HTP_SHARED, type,
         OrtDevice(OrtDevice::CPU, OrtDevice::MemType::HOST_ACCESSIBLE, OrtDevice::VendorIds::QUALCOMM, device_id),
         mem_type1);
   } else if (strcmp(name1, onnxruntime::CPU_ALIGNED_4K) == 0) {
     *out = new OrtMemoryInfo(
-        name1, type,
+        onnxruntime::CPU_ALIGNED_4K, type,
         OrtDevice(OrtDevice::CPU, OrtDevice::MemType::DEFAULT, OrtDevice::VendorIds::NONE, device_id,
                   onnxruntime::kAlloc4KAlignment),
         mem_type1);
   } else {
     return OrtApis::CreateStatus(ORT_INVALID_ARGUMENT, "Specified device is not supported. Try CreateMemoryInfo_V2.");
   }
+  API_IMPL_END
   return nullptr;
 }
 
 ORT_API_STATUS_IMPL(OrtApis::CreateMemoryInfo_V2, _In_ const char* name, _In_ enum OrtMemoryInfoDeviceType device_type,
                     _In_ uint32_t vendor_id, _In_ int32_t device_id, _In_ enum OrtDeviceMemoryType mem_type,
                     _In_ size_t alignment, enum OrtAllocatorType type,
                     _Outptr_ OrtMemoryInfo** out) {
+  API_IMPL_BEGIN
+
+  if (name == nullptr) {
+    return OrtApis::CreateStatus(ORT_INVALID_ARGUMENT, "MemoryInfo name cannot be null.");
+  }
+
+  if (out == nullptr) {
+    return OrtApis::CreateStatus(ORT_INVALID_ARGUMENT, "Output memory info cannot be null.");
+  }
+
   // map the public enum values to internal OrtDevice values
   OrtDevice::MemoryType mt = mem_type == OrtDeviceMemoryType_DEFAULT ? OrtDevice::MemType::DEFAULT
                                                                      : OrtDevice::MemType::HOST_ACCESSIBLE;
@@ -275,6 +297,7 @@ ORT_API_STATUS_IMPL(OrtApis::CreateMemoryInfo_V2, _In_ const char* name, _In_ en
 
   *out = new OrtMemoryInfo(name, type, OrtDevice{dt, mt, vendor_id, narrow<int16_t>(device_id), alignment},
                            mem_type == OrtDeviceMemoryType_DEFAULT ? OrtMemTypeDefault : OrtMemTypeCPU);
+  API_IMPL_END
   return nullptr;
 }
 
@@ -283,7 +306,7 @@ ORT_API(void, OrtApis::ReleaseMemoryInfo, _Frees_ptr_opt_ OrtMemoryInfo* p) { de
 #pragma warning(pop)
 #endif
 ORT_API_STATUS_IMPL(OrtApis::MemoryInfoGetName, _In_ const OrtMemoryInfo* ptr, _Out_ const char** out) {
-  *out = ptr->name;
+  *out = ptr->name.c_str();
   return nullptr;
 }
 
diff --git a/onnxruntime/core/framework/bfc_arena.cc b/onnxruntime/core/framework/bfc_arena.cc
@@ -13,7 +13,7 @@ BFCArena::BFCArena(std::unique_ptr<IAllocator> resource_allocator,
                    int max_dead_bytes_per_chunk,
                    int initial_growth_chunk_size_bytes,
                    int64_t max_power_of_two_extend_bytes)
-    : IAllocator(OrtMemoryInfo(resource_allocator->Info().name,
+    : IAllocator(OrtMemoryInfo(resource_allocator->Info().name.c_str(),
                                OrtAllocatorType::OrtArenaAllocator,
                                resource_allocator->Info().device,
                                resource_allocator->Info().mem_type)),
diff --git a/onnxruntime/core/mlas/lib/qnbitgemm.h b/onnxruntime/core/mlas/lib/qnbitgemm.h
@@ -53,16 +53,25 @@ struct PackedQuantBDataStruct {
     {
         const size_t PackedQuantBDataSize = N * BlockCountK * MlasQNBitBlkDataSizeInBytes(BlkBitWidth, BlkLen);
         size_t BlkSumSize = MlasDivRoundup(N, 16) * BlockCountK * 16 * sizeof(T);
-        if constexpr (BlkBitWidth == 8) {
-            PackedQuantBData = (std::byte*)MlasAlignAddress(PackedQuantBWorkspace, 32);
-        } else {
 #if defined(MLAS_TARGET_AMD64_IX86)
         // avx512 requires alignment on a 64-byte boundary
         PackedQuantBData = (std::byte*)MlasAlignAddress(PackedQuantBWorkspace, 64);
+#elif defined (MLAS_TARGET_ARM64)
+        // Only for 8-bit Gemms does `PackedQuantBData` need to be 32-byte aligned, and
+        // there is enough memory allocated to support this alignment.
+        // See QNBitGemmPackQuantBDataSize().
+        // When bit width is 4, there is no alignment guarantee.
+        // TODO(hasesh): Can we unify the alignment for 4-bit and 8-bit ARM64 Gemms so as to
+        // simplify this logic and make the code here cleaner?
+        if constexpr (BlkBitWidth == 8) {
+            PackedQuantBData = (std::byte*)MlasAlignAddress(PackedQuantBWorkspace, 32);        
+        }
+        else {
+            PackedQuantBData = (std::byte*)PackedQuantBWorkspace;
+        }
 #else
         PackedQuantBData = (std::byte*)PackedQuantBWorkspace;
 #endif
-        }
 
         QuantBBlkSum = (T*)(PackedQuantBData + PackedQuantBDataSize);
         QuantBBlkSum = (T*)MlasAlignAddress(QuantBBlkSum, MlasQNBitQuantBBlkSumAlignment());
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.cpp
@@ -927,7 +927,7 @@ namespace Dml
 
     bool IsGpuTensor(const onnxruntime::Tensor& tensor)
     {
-        return strcmp(tensor.Location().name, onnxruntime::CPU) &&
+        return strcmp(tensor.Location().name.c_str(), onnxruntime::CPU) &&
             !(tensor.Location().mem_type == ::OrtMemType::OrtMemTypeCPUOutput || tensor.Location().mem_type == ::OrtMemType::OrtMemTypeCPUInput);
     }
 
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/MLOperatorAuthorImpl.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/MLOperatorAuthorImpl.cpp
@@ -98,7 +98,7 @@ namespace Windows::AI::MachineLearning::Adapter
 
     bool IsAllocationInterface(const ::OrtMemoryInfo& info)
     {
-        return strcmp(info.name, onnxruntime::CPU) && !(info.mem_type == ::OrtMemType::OrtMemTypeCPUOutput || info.mem_type == ::OrtMemType::OrtMemTypeCPUInput);
+        return strcmp(info.name.c_str(), onnxruntime::CPU) && !(info.mem_type == ::OrtMemType::OrtMemTypeCPUOutput || info.mem_type == ::OrtMemType::OrtMemTypeCPUInput);
     }
 
     // Translate the data object stored in a tensor to the type which will be returned through
@@ -1774,7 +1774,9 @@ namespace Windows::AI::MachineLearning::Adapter
         }
 
         // tells caller whether this tensor is in CPU memory
-        return !strcmp(m_impl->Location().name, onnxruntime::CPU) || m_impl->Location().mem_type == ::OrtMemType::OrtMemTypeCPUOutput || m_impl->Location().mem_type == ::OrtMemType::OrtMemTypeCPUInput;
+        return !strcmp(m_impl->Location().name.c_str(), onnxruntime::CPU) 
+            || m_impl->Location().mem_type == ::OrtMemType::OrtMemTypeCPUOutput
+            || m_impl->Location().mem_type == ::OrtMemType::OrtMemTypeCPUInput;
     }
 
     bool STDMETHODCALLTYPE TensorWrapper::IsDataInterface() const noexcept
diff --git a/onnxruntime/core/providers/qnn/qnn_provider_factory.cc b/onnxruntime/core/providers/qnn/qnn_provider_factory.cc
@@ -219,9 +219,11 @@ struct QnnEpFactory : OrtEpFactory {
         OrtKeyValuePairs* ep_options = nullptr;
         factory->ort_api.CreateKeyValuePairs(&ep_options);
         factory->ort_api.AddKeyValuePair(ep_options, "backend_path", factory->qnn_backend_path.c_str());
-        ORT_API_RETURN_IF_ERROR(
-            factory->ort_api.GetEpApi()->CreateEpDevice(factory, &device, nullptr, ep_options,
-                                                        &ep_devices[num_ep_devices++]));
+        OrtStatus* status = factory->ort_api.GetEpApi()->CreateEpDevice(factory, &device, nullptr, ep_options,
+                                                                        &ep_devices[num_ep_devices++]);
+
+        factory->ort_api.ReleaseKeyValuePairs(ep_options);
+        ORT_API_RETURN_IF_ERROR(status);
       }
     }
 
diff --git a/onnxruntime/core/providers/webgpu/webgpu_context.cc b/onnxruntime/core/providers/webgpu/webgpu_context.cc
@@ -210,7 +210,7 @@ Status WebGpuContext::Run(ComputeContext& context, const ProgramBase& program) {
                   return tensor != nullptr &&
                          tensor->Location().mem_type == OrtMemType::OrtMemTypeDefault &&
                          tensor->Location().device.Type() == OrtDevice::GPU &&
-                         !strcmp(tensor->Location().name, WEBGPU_BUFFER);
+                         !strcmp(tensor->Location().name.c_str(), WEBGPU_BUFFER);
                 }),
                 "All inputs must be tensors on WebGPU buffers.");
 
@@ -219,7 +219,7 @@ Status WebGpuContext::Run(ComputeContext& context, const ProgramBase& program) {
                   return tensor != nullptr &&
                          tensor->Location().mem_type == OrtMemType::OrtMemTypeDefault &&
                          tensor->Location().device.Type() == OrtDevice::GPU &&
-                         !strcmp(tensor->Location().name, WEBGPU_BUFFER);
+                         !strcmp(tensor->Location().name.c_str(), WEBGPU_BUFFER);
                 }),
                 "All outputs must be tensors on WebGPU buffers.");
   }
diff --git a/onnxruntime/core/session/environment.cc b/onnxruntime/core/session/environment.cc
@@ -79,7 +79,7 @@ static bool AreOrtMemoryInfosEquivalent(
     bool ignore_alignment = false) {
   return left.mem_type == right.mem_type &&
          (ignore_alignment ? left.device.EqualIgnoringAlignment(right.device) : left.device == right.device) &&
-         (!match_name || strcmp(left.name, right.name) == 0);
+         (!match_name || left.name == right.name);
 }
 
 std::vector<AllocatorPtr>::const_iterator FindExistingAllocator(const std::vector<AllocatorPtr>& allocators,
diff --git a/onnxruntime/core/session/lora_adapters.cc b/onnxruntime/core/session/lora_adapters.cc
@@ -53,11 +53,11 @@ void LoraAdapter::MemoryMap(const std::filesystem::path& file_path) {
 static std::unique_ptr<IDataTransfer> GetDataTransfer(const OrtMemoryInfo& mem_info) {
   std::unique_ptr<IDataTransfer> data_transfer;
 
-  if (strcmp(mem_info.name, onnxruntime::CPU) == 0) {
+  if (mem_info.name == onnxruntime::CPU) {
     return data_transfer;
   }
 
-  if (strcmp(mem_info.name, onnxruntime::CUDA) == 0) {
+  if (mem_info.name == onnxruntime::CUDA) {
 #if defined(USE_CUDA) || defined(USE_CUDA_PROVIDER_INTERFACE)
     auto* cuda_provider_info = TryGetProviderInfo_CUDA();
     if (cuda_provider_info != nullptr) {
diff --git a/onnxruntime/test/framework/TestAllocatorManager.cc b/onnxruntime/test/framework/TestAllocatorManager.cc
@@ -10,7 +10,7 @@ namespace test {
 class DummyArena : public IAllocator {
  public:
   explicit DummyArena(std::unique_ptr<IAllocator> resource_allocator)
-      : IAllocator(OrtMemoryInfo(resource_allocator->Info().name,
+      : IAllocator(OrtMemoryInfo(resource_allocator->Info().name.c_str(),
                                  OrtAllocatorType::OrtDeviceAllocator,
                                  resource_allocator->Info().device,
                                  resource_allocator->Info().mem_type)),
diff --git a/onnxruntime/test/framework/allocator_test.cc b/onnxruntime/test/framework/allocator_test.cc
@@ -13,7 +13,7 @@ namespace test {
 TEST(AllocatorTest, CPUAllocatorTest) {
   auto cpu_arena = TestCPUExecutionProvider()->CreatePreferredAllocators()[0];
 
-  ASSERT_STREQ(cpu_arena->Info().name, CPU);
+  ASSERT_STREQ(cpu_arena->Info().name.c_str(), CPU);
   EXPECT_EQ(cpu_arena->Info().device.Id(), 0);
 
   const auto expected_allocator_type = DoesCpuAllocatorSupportArenaUsage()
diff --git a/onnxruntime/test/framework/tensor_test.cc b/onnxruntime/test/framework/tensor_test.cc
@@ -29,7 +29,7 @@ void CPUTensorTest(std::vector<int64_t> dims, const int offset_elements = 0) {
   EXPECT_EQ(shape.GetDims(), tensor_shape.GetDims());
   EXPECT_EQ(t.DataType(), DataTypeImpl::GetType<T>());
   auto& location = t.Location();
-  EXPECT_STREQ(location.name, CPU);
+  EXPECT_STREQ(location.name.c_str(), CPU);
   EXPECT_EQ(location.device.Id(), 0);
 
   const T* t_data = t.Data<T>();
@@ -47,7 +47,7 @@ void CPUTensorTest(std::vector<int64_t> dims, const int offset_elements = 0) {
     EXPECT_EQ(shape.GetDims(), tensor_shape.GetDims());
     EXPECT_EQ(new_t.DataType(), DataTypeImpl::GetType<T>());
     auto& new_location = new_t.Location();
-    ASSERT_STREQ(new_location.name, CPU);
+    ASSERT_STREQ(new_location.name.c_str(), CPU);
     EXPECT_EQ(new_location.device.Id(), 0);
   }
 }
@@ -135,7 +135,7 @@ TEST(TensorTest, EmptyTensorTest) {
   EXPECT_TRUE(!data);
 
   auto& location = t.Location();
-  ASSERT_STREQ(location.name, CPU);
+  ASSERT_STREQ(location.name.c_str(), CPU);
   EXPECT_EQ(location.device.Id(), 0);
 
   const auto expected_allocator_type = DoesCpuAllocatorSupportArenaUsage()
@@ -160,7 +160,7 @@ TEST(TensorTest, StringTensorTest) {
     EXPECT_EQ(shape, tensor_shape);
     EXPECT_EQ(t.DataType(), DataTypeImpl::GetType<std::string>());
     auto& location = t.Location();
-    ASSERT_STREQ(location.name, CPU);
+    ASSERT_EQ(location.name, CPU);
     EXPECT_EQ(location.device.Id(), 0);
 
     std::string* new_data = t.MutableData<std::string>();
diff --git a/onnxruntime/test/lora/lora_test.cc b/onnxruntime/test/lora/lora_test.cc
@@ -216,7 +216,7 @@ TEST(LoraAdapterTest, VerifyDeviceCopy) {
   for (; begin != end; ++begin) {
     const auto& [_, param] = *begin;
     const auto& tensor_device = param.GetDeviceOrMapped().Get<Tensor>();
-    ASSERT_EQ(0, strcmp(tensor_device.Location().name, onnxruntime::CUDA));
+    ASSERT_EQ(0, strcmp(tensor_device.Location().name.c_str(), onnxruntime::CUDA));
 
     const auto& tensor_cpu = param.GetMapped().Get<Tensor>();
     ASSERT_EQ(tensor_cpu.Shape().Size(), tensor_device.Shape().Size());
diff --git a/onnxruntime/test/mlas/unittest/test_sq8bitgemm.cpp b/onnxruntime/test/mlas/unittest/test_sq8bitgemm.cpp
@@ -773,7 +773,8 @@ class MlasSQ8BitGemmKernelTest : public MlasTestBase {
         N, K, 8, BlkLen, MLAS_QNBIT_GEMM_COMPUTE_TYPE::SQNBIT_CompInt8, nullptr, packedBuffer,
         nullptr, HasZp, inputZp, nullptr);
 
-    PackedQuantBDataStruct<float, 8> packedQuantB(packedBuffer, N, BlkCount, BlkLen, true);
+    const bool isQuantAUnsigned = GetMlasPlatform().ArmNeonIsQuantActivationsUnsigned;
+    PackedQuantBDataStruct<float, 8> packedQuantB(packedBuffer, N, BlkCount, BlkLen, isQuantAUnsigned);
 
     auto* C = C_.GetBuffer(M * ldc, true);
     auto* ref = ref_.GetBuffer(M * ldc, true);
@@ -825,7 +826,9 @@ class MlasSQ8BitGemmKernelTest : public MlasTestBase {
 
   void ExecuteShort(void) override {
     Execute<1, 16, 1, 16>();
+    Execute<1, 1, 1, 16>();
     Execute<7, 2, 4, 16>();
+    Execute<7, 128, 4, 16>();
     Execute<8, 497, 5, 16>();
     Execute<1, 3072, 128, 16>();
     Execute<2, 3072, 128, 16>();
diff --git a/onnxruntime/test/providers/cuda/test_cases/allocator_cuda_test.cc b/onnxruntime/test/providers/cuda/test_cases/allocator_cuda_test.cc
diff --git a/tools/ci_build/github/azure-pipelines/custom-nuget-packaging-pipeline.yml b/tools/ci_build/github/azure-pipelines/custom-nuget-packaging-pipeline.yml
diff --git a/tools/ci_build/github/azure-pipelines/qnn-ep-nuget-packaging-pipeline.yml b/tools/ci_build/github/azure-pipelines/qnn-ep-nuget-packaging-pipeline.yml
diff --git a/tools/ci_build/github/azure-pipelines/templates/qnn-ep-win.yml b/tools/ci_build/github/azure-pipelines/templates/qnn-ep-win.yml
diff --git a/tools/ci_build/github/linux/docker/scripts/requirements.txt b/tools/ci_build/github/linux/docker/scripts/requirements.txt
diff --git a/tools/python/run_packaging_pipelines.py b/tools/python/run_packaging_pipelines.py

Original file line number	Diff line number	Diff line change
`@@ -927,7 +927,7 @@ namespace Dml`
`927`	`927`
`928`	`928`	`bool IsGpuTensor(const onnxruntime::Tensor& tensor)`
`929`	`929`	`{`
`930`		`- return strcmp(tensor.Location().name, onnxruntime::CPU) &&`
	`930`	`+ return strcmp(tensor.Location().name.c_str(), onnxruntime::CPU) &&`
`931`	`931`	`!(tensor.Location().mem_type == ::OrtMemType::OrtMemTypeCPUOutput \|\| tensor.Location().mem_type == ::OrtMemType::OrtMemTypeCPUInput);`
`932`	`932`	`}`
`933`	`933`
Original file line number	Diff line number	Diff line change
`@@ -98,7 +98,7 @@ namespace Windows::AI::MachineLearning::Adapter`
`98`	`98`
`99`	`99`	`bool IsAllocationInterface(const ::OrtMemoryInfo& info)`
`100`	`100`	`{`
`101`		`- return strcmp(info.name, onnxruntime::CPU) && !(info.mem_type == ::OrtMemType::OrtMemTypeCPUOutput \|\| info.mem_type == ::OrtMemType::OrtMemTypeCPUInput);`
	`101`	`+ return strcmp(info.name.c_str(), onnxruntime::CPU) && !(info.mem_type == ::OrtMemType::OrtMemTypeCPUOutput \|\| info.mem_type == ::OrtMemType::OrtMemTypeCPUInput);`
`102`	`102`	`}`
`103`	`103`
`104`	`104`	`// Translate the data object stored in a tensor to the type which will be returned through`
`@@ -1774,7 +1774,9 @@ namespace Windows::AI::MachineLearning::Adapter`
`1774`	`1774`	`}`
`1775`	`1775`
`1776`	`1776`	`// tells caller whether this tensor is in CPU memory`
`1777`		`- return !strcmp(m_impl->Location().name, onnxruntime::CPU) \|\| m_impl->Location().mem_type == ::OrtMemType::OrtMemTypeCPUOutput \|\| m_impl->Location().mem_type == ::OrtMemType::OrtMemTypeCPUInput;`
	`1777`	`+ return !strcmp(m_impl->Location().name.c_str(), onnxruntime::CPU)`
	`1778`	`+ \|\| m_impl->Location().mem_type == ::OrtMemType::OrtMemTypeCPUOutput`
	`1779`	`+ \|\| m_impl->Location().mem_type == ::OrtMemType::OrtMemTypeCPUInput;`
`1778`	`1780`	`}`
`1779`	`1781`
`1780`	`1782`	`bool STDMETHODCALLTYPE TensorWrapper::IsDataInterface() const noexcept`
Original file line number	Diff line number	Diff line change
`@@ -219,9 +219,11 @@ struct QnnEpFactory : OrtEpFactory {`
`219`	`219`	`OrtKeyValuePairs* ep_options = nullptr;`
`220`	`220`	`factory->ort_api.CreateKeyValuePairs(&ep_options);`
`221`	`221`	`factory->ort_api.AddKeyValuePair(ep_options, "backend_path", factory->qnn_backend_path.c_str());`
`222`		`- ORT_API_RETURN_IF_ERROR(`
`223`		`- factory->ort_api.GetEpApi()->CreateEpDevice(factory, &device, nullptr, ep_options,`
`224`		`- &ep_devices[num_ep_devices++]));`
	`222`	`+ OrtStatus* status = factory->ort_api.GetEpApi()->CreateEpDevice(factory, &device, nullptr, ep_options,`
	`223`	`+ &ep_devices[num_ep_devices++]);`
	`224`	`+`
	`225`	`+ factory->ort_api.ReleaseKeyValuePairs(ep_options);`
	`226`	`+ ORT_API_RETURN_IF_ERROR(status);`
`225`	`227`	`}`
`226`	`228`	`}`
`227`	`229`
Original file line number	Diff line number	Diff line change
`@@ -79,7 +79,7 @@ static bool AreOrtMemoryInfosEquivalent(`
`79`	`79`	`bool ignore_alignment = false) {`
`80`	`80`	`return left.mem_type == right.mem_type &&`
`81`	`81`	`(ignore_alignment ? left.device.EqualIgnoringAlignment(right.device) : left.device == right.device) &&`
`82`		`- (!match_name \|\| strcmp(left.name, right.name) == 0);`
	`82`	`+ (!match_name \|\| left.name == right.name);`
`83`	`83`	`}`
`84`	`84`
`85`	`85`	`std::vector<AllocatorPtr>::const_iterator FindExistingAllocator(const std::vector<AllocatorPtr>& allocators,`