Commit 19f2973

Minimize concat memory usage (#10866)
* Minimize concat memory usage

  Closes #10864

  ```
  | Change | Before [b5e4b0e] <main> | After [c9432cfc] <min-concat-mem> | Ratio | Benchmark (Parameter)           |
  |--------|-------------------------|-----------------------------------|-------|---------------------------------|
  | -      | 4.82G                   | 920M                              | 0.19  | combine.Concat1d.peakmem_concat |
  | -      | 574±20ms                | 54.0±0.6ms                        | 0.09  | combine.Concat1d.time_concat    |
  ```

* reduce bench size
* try getting asv mamba to work
* Revert "try getting asv mamba to work"

  This reverts commit f1dab89.

* use conda for asv
* Use rattler instead
* address comments
* add whats-new

---------

Co-authored-by: Kai Mühlbauer <[email protected]>
1 parent f1d7cc2 commit 19f2973

5 files changed (+43, -9 lines)

asv_bench/asv.conf.json

Lines changed: 2 additions & 2 deletions
@@ -29,7 +29,7 @@
     // If missing or the empty string, the tool will be automatically
     // determined by looking for tools on the PATH environment
     // variable.
-    "environment_type": "mamba",
+    "environment_type": "rattler",
     "conda_channels": ["conda-forge"],

     // timeout in seconds for installing any dependencies in environment
@@ -76,7 +76,7 @@
     // https://github.com/airspeed-velocity/asv/issues/1389#issuecomment-2076131185
     "build_command": [
         "python -m build",
-        "python -mpip wheel --no-deps --no-build-isolation --no-index -w {build_cache_dir} {build_dir}"
+        "python -m pip wheel --no-deps --no-build-isolation --no-index -w {build_cache_dir} {build_dir}"
     ],
     // Combinations of libraries/python versions can be excluded/included
     // from the set to test. Each entry is a dictionary containing additional

asv_bench/benchmarks/combine.py

Lines changed: 16 additions & 0 deletions
@@ -5,6 +5,22 @@
 from . import requires_dask


+class Concat1d:
+    """Benchmark concatenating large datasets"""
+
+    def setup(self) -> None:
+        self.data_arrays = [
+            xr.DataArray(data=np.zeros(4 * 1024 * 1024, dtype=np.int8), dims=["x"])
+            for _ in range(10)
+        ]
+
+    def time_concat(self) -> None:
+        xr.concat(self.data_arrays, dim="x")
+
+    def peakmem_concat(self) -> None:
+        xr.concat(self.data_arrays, dim="x")
+
+
 class Combine1d:
     """Benchmark concatenating and merging large datasets"""

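
For context, each run of the new benchmark concatenates ten 4 MiB int8 arrays (about 40 MiB of payload) along `x`; asv drives `setup()` and then times `time_concat` and records peak memory for `peakmem_concat`. Below is a minimal sketch of the same workload run standalone, assuming only numpy and xarray are available; the printed size checks are illustrative and not something asv itself reports:

```python
# Standalone sketch of the Concat1d workload (not part of the commit);
# asv normally calls setup()/time_concat()/peakmem_concat() itself.
import numpy as np
import xarray as xr

data_arrays = [
    xr.DataArray(data=np.zeros(4 * 1024 * 1024, dtype=np.int8), dims=["x"])
    for _ in range(10)
]

result = xr.concat(data_arrays, dim="x")
print(result.sizes)   # Frozen({'x': 41943040}), i.e. 10 * 4 * 1024 * 1024 elements
print(result.nbytes)  # 41943040 bytes (~40 MiB) of int8 data in the result
```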

ci/requirements/environment-benchmark.yml

Lines changed: 1 addition & 0 deletions
@@ -12,6 +12,7 @@ dependencies:
   - numba
   - numbagg
   - numexpr
+  - py-rattler
   - numpy>=2.2,<2.3 # https://github.com/numba/numba/issues/10105
   - opt_einsum
   - packaging

doc/whats-new.rst

Lines changed: 7 additions & 0 deletions
@@ -40,6 +40,13 @@ Bug Fixes
 - Fix indexing with empty arrays for scipy & h5netcdf backends which now resolves to empty slices (:issue:`10867`, :pull:`10870`).
   By `Kai Mühlbauer <https://github.com/kmuehlbauer>`_

+Performance
+~~~~~~~~~~~
+
+- Speedup and reduce memory usage of :py:func:`concat`. Magnitude of improvement scales
+  with size of the concatenation dimension. By `Deepak Cherian <https://github.com/dcherian>`_.
+  :issue:`10864` :pull:`10866`.
+
 Documentation
 ~~~~~~~~~~~~~


xarray/structure/concat.py

Lines changed: 17 additions & 7 deletions
@@ -745,10 +745,11 @@ def get_indexes(name):
             yield PandasIndex(data, dim_name, coord_dtype=var.dtype)

     # create concatenation index, needed for later reindexing
+    # use np.cumulative_sum(concat_dim_lengths, include_initial=True) when we support numpy>=2
     file_start_indexes = np.append(0, np.cumsum(concat_dim_lengths))
-    concat_index = np.arange(file_start_indexes[-1])
-    concat_index_size = concat_index.size
+    concat_index_size = file_start_indexes[-1]
     variable_index_mask = np.ones(concat_index_size, dtype=bool)
+    variable_reindexer = None

     # stack up each variable and/or index to fill-out the dataset (in order)
     # n.b. this loop preserves variable order, needed for groupby.
@@ -776,7 +777,6 @@ def get_indexes(name):
                     end = file_start_indexes[i + 1]
                     variable_index_mask[slice(start, end)] = False

-            variable_index = concat_index[variable_index_mask]
             vars = ensure_common_dims(variables, var_concat_dim_length)

             # Try to concatenate the indexes, concatenate the variables when no index
@@ -807,12 +807,22 @@ def get_indexes(name):
                     vars, dim_name, positions, combine_attrs=combine_attrs
                 )
                 # reindex if variable is not present in all datasets
-                if len(variable_index) < concat_index_size:
+                if not variable_index_mask.all():
+                    if variable_reindexer is None:
+                        # allocate only once
+                        variable_reindexer = np.empty(
+                            concat_index_size,
+                            # cannot use uint since we need -1 as a sentinel for reindexing
+                            dtype=np.min_scalar_type(-concat_index_size),
+                        )
+                    np.cumsum(variable_index_mask, out=variable_reindexer)
+                    # variable_index_mask is boolean, so the first element is 1.
+                    # offset by 1 to start at 0.
+                    variable_reindexer -= 1
+                    variable_reindexer[~variable_index_mask] = -1
                     combined_var = reindex_variables(
                         variables={name: combined_var},
-                        dim_pos_indexers={
-                            dim_name: pd.Index(variable_index).get_indexer(concat_index)
-                        },
+                        dim_pos_indexers={dim_name: variable_reindexer},
                         fill_value=fill_value,
                     )[name]
                 result_vars[name] = combined_var
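
The core of this change is that the positional reindexer for a variable missing from some datasets is now built directly from the boolean mask: a single `np.cumsum` into a preallocated integer buffer, with `-1` as the "missing" sentinel, replaces materializing the full `concat_index` and calling `pd.Index(...).get_indexer(...)` for every such variable. Below is a minimal standalone sketch of that mask-to-indexer construction; the toy lengths and the final equivalence check are illustrative additions, while the array names mirror the diff above:

```python
# Toy example: three "files" of lengths 3, 2, 3 along the concat dimension;
# the variable is missing from the second file.
import numpy as np
import pandas as pd

concat_dim_lengths = [3, 2, 3]
file_start_indexes = np.append(0, np.cumsum(concat_dim_lengths))  # [0, 3, 5, 8]
concat_index_size = file_start_indexes[-1]                        # 8

# True wherever the variable actually has values along the concat dimension.
variable_index_mask = np.ones(concat_index_size, dtype=bool)
variable_index_mask[file_start_indexes[1] : file_start_indexes[2]] = False

# Smallest signed dtype that can hold -concat_index_size, so -1 fits as a sentinel.
variable_reindexer = np.empty(
    concat_index_size, dtype=np.min_scalar_type(-concat_index_size)
)
np.cumsum(variable_index_mask, out=variable_reindexer)
variable_reindexer -= 1                        # first present position becomes 0
variable_reindexer[~variable_index_mask] = -1  # -1 marks positions to fill with fill_value

print(variable_reindexer)  # [ 0  1  2 -1 -1  3  4  5]

# The pandas-based formulation this replaces allocates the full concat_index
# and an intermediate Index, but yields the same positional indexer:
concat_index = np.arange(concat_index_size)
variable_index = concat_index[variable_index_mask]
assert (pd.Index(variable_index).get_indexer(concat_index) == variable_reindexer).all()
```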
