Commit 19f2973

Minimize concat memory usage (#10866)
* Minimize concat memory usage

  Closes #10864

  ```
  | Change | Before [b5e4b0e] <main> | After [c9432cfc] <min-concat-mem> | Ratio | Benchmark (Parameter)           |
  |--------|-------------------------|-----------------------------------|-------|---------------------------------|
  | -      | 4.82G                   | 920M                              | 0.19  | combine.Concat1d.peakmem_concat |
  | -      | 574±20ms                | 54.0±0.6ms                        | 0.09  | combine.Concat1d.time_concat    |
  ```

* reduce bench size
* try getting asv mamba to work
* Revert "try getting asv mamba to work"

  This reverts commit f1dab89.

* use conda for asv
* Use rattler instead
* address comments
* add whats-new

---------

Co-authored-by: Kai Mühlbauer <[email protected]>
1 parent f1d7cc2 commit 19f2973

5 files changed (+43, -9 lines)

asv_bench/asv.conf.json

Lines changed: 2 additions & 2 deletions
@@ -29,7 +29,7 @@
     // If missing or the empty string, the tool will be automatically
     // determined by looking for tools on the PATH environment
     // variable.
-    "environment_type": "mamba",
+    "environment_type": "rattler",
     "conda_channels": ["conda-forge"],

     // timeout in seconds for installing any dependencies in environment
@@ -76,7 +76,7 @@
     // https://github.com/airspeed-velocity/asv/issues/1389#issuecomment-2076131185
     "build_command": [
         "python -m build",
-        "python -mpip wheel --no-deps --no-build-isolation --no-index -w {build_cache_dir} {build_dir}"
+        "python -m pip wheel --no-deps --no-build-isolation --no-index -w {build_cache_dir} {build_dir}"
     ],
     // Combinations of libraries/python versions can be excluded/included
     // from the set to test. Each entry is a dictionary containing additional

asv_bench/benchmarks/combine.py

Lines changed: 16 additions & 0 deletions
@@ -5,6 +5,22 @@
 from . import requires_dask


+class Concat1d:
+    """Benchmark concatenating large datasets"""
+
+    def setup(self) -> None:
+        self.data_arrays = [
+            xr.DataArray(data=np.zeros(4 * 1024 * 1024, dtype=np.int8), dims=["x"])
+            for _ in range(10)
+        ]
+
+    def time_concat(self) -> None:
+        xr.concat(self.data_arrays, dim="x")
+
+    def peakmem_concat(self) -> None:
+        xr.concat(self.data_arrays, dim="x")
+
+
 class Combine1d:
     """Benchmark concatenating and merging large datasets"""

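
For context, each run of the new benchmark concatenates ten 4 MiB int8 arrays (about 40 MiB of payload) along `x`; asv drives `setup()` and then times `time_concat` and records peak memory for `peakmem_concat`. Below is a minimal sketch of the same workload run standalone, assuming only numpy and xarray are available; the printed size checks are illustrative and not something asv itself reports:

```python
# Standalone sketch of the Concat1d workload (not part of the commit);
# asv normally calls setup()/time_concat()/peakmem_concat() itself.
import numpy as np
import xarray as xr

data_arrays = [
    xr.DataArray(data=np.zeros(4 * 1024 * 1024, dtype=np.int8), dims=["x"])
    for _ in range(10)
]

result = xr.concat(data_arrays, dim="x")
print(result.sizes)   # Frozen({'x': 41943040}), i.e. 10 * 4 * 1024 * 1024 elements
print(result.nbytes)  # 41943040 bytes (~40 MiB) of int8 data in the result
```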

ci/requirements/environment-benchmark.yml

Lines changed: 1 addition & 0 deletions
@@ -12,6 +12,7 @@ dependencies:
   - numba
   - numbagg
   - numexpr
+  - py-rattler
   - numpy>=2.2,<2.3 # https://github.com/numba/numba/issues/10105
   - opt_einsum
   - packaging

doc/whats-new.rst

Lines changed: 7 additions & 0 deletions
@@ -40,6 +40,13 @@ Bug Fixes
 - Fix indexing with empty arrays for scipy & h5netcdf backends which now resolves to empty slices (:issue:`10867`, :pull:`10870`).
   By `Kai Mühlbauer <https://github.com/kmuehlbauer>`_

+Performance
+~~~~~~~~~~~
+
+- Speedup and reduce memory usage of :py:func:`concat`. Magnitude of improvement scales
+  with size of the concatenation dimension. By `Deepak Cherian <https://github.com/dcherian>`_.
+  :issue:`10864` :pull:`10866`.
+
 Documentation
 ~~~~~~~~~~~~~


xarray/structure/concat.py

Lines changed: 17 additions & 7 deletions
@@ -745,10 +745,11 @@ def get_indexes(name):
             yield PandasIndex(data, dim_name, coord_dtype=var.dtype)

     # create concatenation index, needed for later reindexing
+    # use np.cumulative_sum(concat_dim_lengths, include_initial=True) when we support numpy>=2
     file_start_indexes = np.append(0, np.cumsum(concat_dim_lengths))
-    concat_index = np.arange(file_start_indexes[-1])
-    concat_index_size = concat_index.size
+    concat_index_size = file_start_indexes[-1]
     variable_index_mask = np.ones(concat_index_size, dtype=bool)
+    variable_reindexer = None

     # stack up each variable and/or index to fill-out the dataset (in order)
     # n.b. this loop preserves variable order, needed for groupby.
@@ -776,7 +777,6 @@ def get_indexes(name):
                     end = file_start_indexes[i + 1]
                     variable_index_mask[slice(start, end)] = False

-            variable_index = concat_index[variable_index_mask]
             vars = ensure_common_dims(variables, var_concat_dim_length)

             # Try to concatenate the indexes, concatenate the variables when no index
@@ -807,12 +807,22 @@ def get_indexes(name):
                     vars, dim_name, positions, combine_attrs=combine_attrs
                 )
                 # reindex if variable is not present in all datasets
-                if len(variable_index) < concat_index_size:
+                if not variable_index_mask.all():
+                    if variable_reindexer is None:
+                        # allocate only once
+                        variable_reindexer = np.empty(
+                            concat_index_size,
+                            # cannot use uint since we need -1 as a sentinel for reindexing
+                            dtype=np.min_scalar_type(-concat_index_size),
+                        )
+                    np.cumsum(variable_index_mask, out=variable_reindexer)
+                    # variable_index_mask is boolean, so the first element is 1.
+                    # offset by 1 to start at 0.
+                    variable_reindexer -= 1
+                    variable_reindexer[~variable_index_mask] = -1
                     combined_var = reindex_variables(
                         variables={name: combined_var},
-                        dim_pos_indexers={
-                            dim_name: pd.Index(variable_index).get_indexer(concat_index)
-                        },
+                        dim_pos_indexers={dim_name: variable_reindexer},
                         fill_value=fill_value,
                     )[name]
                 result_vars[name] = combined_var
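
The core of this change is that the positional reindexer for a variable missing from some datasets is now built directly from the boolean mask: a single `np.cumsum` into a preallocated integer buffer, with `-1` as the "missing" sentinel, replaces materializing the full `concat_index` and calling `pd.Index(...).get_indexer(...)` for every such variable. Below is a minimal standalone sketch of that mask-to-indexer construction; the toy lengths and the final equivalence check are illustrative additions, while the array names mirror the diff above:

```python
# Toy example: three "files" of lengths 3, 2, 3 along the concat dimension;
# the variable is missing from the second file.
import numpy as np
import pandas as pd

concat_dim_lengths = [3, 2, 3]
file_start_indexes = np.append(0, np.cumsum(concat_dim_lengths))  # [0, 3, 5, 8]
concat_index_size = file_start_indexes[-1]                        # 8

# True wherever the variable actually has values along the concat dimension.
variable_index_mask = np.ones(concat_index_size, dtype=bool)
variable_index_mask[file_start_indexes[1] : file_start_indexes[2]] = False

# Smallest signed dtype that can hold -concat_index_size, so -1 fits as a sentinel.
variable_reindexer = np.empty(
    concat_index_size, dtype=np.min_scalar_type(-concat_index_size)
)
np.cumsum(variable_index_mask, out=variable_reindexer)
variable_reindexer -= 1                        # first present position becomes 0
variable_reindexer[~variable_index_mask] = -1  # -1 marks positions to fill with fill_value

print(variable_reindexer)  # [ 0  1  2 -1 -1  3  4  5]

# The pandas-based formulation this replaces allocates the full concat_index
# and an intermediate Index, but yields the same positional indexer:
concat_index = np.arange(concat_index_size)
variable_index = concat_index[variable_index_mask]
assert (pd.Index(variable_index).get_indexer(concat_index) == variable_reindexer).all()
```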
