Skip to content

Commit 80b37e7

Browse files
Fix DR baselines (openproblems-bio#816)
* fix DR baselines * add density test * fix dataset prep * bugfix * can't have >500 comps * ignore missing parametricumap * typo * account for arpack convergence
1 parent 5490687 commit 80b37e7

File tree

9 files changed

+137
-50
lines changed

9 files changed

+137
-50
lines changed
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
11
from .mouse_blood_olsson_labelled import olsson_2016_mouse_blood
22
from .mouse_hspc_nestorowa2016 import mouse_hspc_nestorowa2016
33
from .tenx_5k_pbmc import tenx_5k_pbmc
4+
from .zebrafish import zebrafish_labs
Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,18 @@
1+
from ....data.zebrafish import load_zebrafish
2+
from ....tools.decorators import dataset
3+
from ....tools.normalize import log_cp10k
4+
5+
6+
@dataset(
7+
"Zebrafish",
8+
data_url=load_zebrafish.metadata["data_url"],
9+
data_reference=load_zebrafish.metadata["data_reference"],
10+
dataset_summary="90k cells from zebrafish embryos throughout the first day of "
11+
"development, with and without a knockout of chordin, an important developmental "
12+
"gene. Dimensions: 26022 cells, 25258 genes. 24 cell types "
13+
"(avg. 1084±1156 cells per cell type).",
14+
)
15+
def zebrafish_labs(test=False):
16+
adata = load_zebrafish(test=test)
17+
adata.uns["n_genes"] = adata.shape[1]
18+
return log_cp10k(adata)

openproblems/tasks/dimensionality_reduction/methods/__init__.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
from .baseline import random_features
2+
from .baseline import spectral_features
23
from .baseline import true_features
3-
from .baseline import true_features_log_cp10k
4-
from .baseline import true_features_log_cp10k_hvg
4+
from .diffusion_map import diffusion_map
55
from .neuralee import neuralee_default
66
from .neuralee import neuralee_logCP10k_1kHVG
77
from .pca import pca_logCP10k

openproblems/tasks/dimensionality_reduction/methods/baseline.py

Lines changed: 11 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,8 @@
11
from ....tools.decorators import method
22
from ....tools.normalize import log_cp10k
3-
from ....tools.normalize import log_cp10k_hvg
43
from ....tools.utils import check_version
4+
from .diffusion_map import diffusion_map
5+
from typing import Optional
56

67
import functools
78
import numpy as np
@@ -29,19 +30,6 @@ def random_features(adata, test=False):
2930
method_name="True Features",
3031
)
3132
def true_features(adata, test=False):
32-
adata.obsm["X_emb"] = adata.X
33-
if test:
34-
adata.obsm["X_emb"] = adata.obsm["X_emb"][:, :100]
35-
36-
adata.obsm["X_emb"] = adata.obsm["X_emb"].toarray()
37-
adata.uns["method_code_version"] = check_version("openproblems")
38-
return adata
39-
40-
41-
@_baseline_method(
42-
method_name="True Features (logCP10k)",
43-
)
44-
def true_features_log_cp10k(adata, test=False):
4533
adata = log_cp10k(adata)
4634
adata.obsm["X_emb"] = adata.X
4735
if test:
@@ -53,14 +41,15 @@ def true_features_log_cp10k(adata, test=False):
5341

5442

5543
@_baseline_method(
56-
method_name="True Features (logCP10k, 1kHVG)",
44+
method_name="Spectral Features",
5745
)
58-
def true_features_log_cp10k_hvg(adata, test=False):
59-
adata = log_cp10k_hvg(adata)
60-
adata.obsm["X_emb"] = adata[:, adata.var["highly_variable"]].copy().X
46+
def spectral_features(adata, test=False, n_comps: Optional[int] = None):
47+
6148
if test:
62-
adata.obsm["X_emb"] = adata.obsm["X_emb"][:, :100]
49+
n_comps = n_comps or 20
50+
else:
51+
n_comps = n_comps or 1000
6352

64-
adata.obsm["X_emb"] = adata.obsm["X_emb"].toarray()
65-
adata.uns["method_code_version"] = check_version("openproblems")
66-
return adata
53+
n_comps = min(n_comps, min(adata.shape) - 2)
54+
55+
return diffusion_map(adata, n_comps=n_comps)
Lines changed: 61 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,61 @@
1+
from ....tools.decorators import method
2+
from ....tools.normalize import log_cp10k
3+
from ....tools.utils import check_version
4+
5+
6+
def _diffusion_map(graph, n_comps, t, n_retries=1):
7+
import numpy as np
8+
import scipy.sparse.linalg
9+
10+
diag_data = np.asarray(graph.sum(axis=0))
11+
identity = scipy.sparse.identity(graph.shape[0], dtype=np.float64)
12+
diag = scipy.sparse.spdiags(
13+
1.0 / np.sqrt(diag_data), 0, graph.shape[0], graph.shape[0]
14+
)
15+
laplacian = identity - diag * graph * diag
16+
num_lanczos_vectors = max(2 * n_comps + 1, int(np.sqrt(graph.shape[0])))
17+
try:
18+
eigenvalues, eigenvectors = scipy.sparse.linalg.eigsh(
19+
laplacian,
20+
n_comps,
21+
which="SM",
22+
ncv=num_lanczos_vectors,
23+
tol=1e-4,
24+
v0=np.ones(laplacian.shape[0]),
25+
maxiter=graph.shape[0] * 5,
26+
)
27+
return (eigenvalues**t) * eigenvectors
28+
except scipy.sparse.linalg.ArpackNoConvergence:
29+
if n_retries > 0:
30+
# add some noise and try again
31+
graph_rand = graph.copy().tocoo()
32+
graph_rand.row = np.random.choice(
33+
graph_rand.shape[0], len(graph_rand.row), replace=True
34+
)
35+
graph_rand.data *= 0.01
36+
return _diffusion_map(
37+
graph + graph_rand, n_comps, t, n_retries=n_retries - 1
38+
)
39+
else:
40+
raise
41+
42+
43+
@method(
44+
method_name="Diffusion maps",
45+
paper_reference="coifman2006diffusion",
46+
paper_name="Diffusion maps",
47+
paper_year=2006,
48+
code_url="https://github.com/openproblems-bio/openproblems",
49+
)
50+
def diffusion_map(
51+
adata, n_comps: int = 2, t: int = 1, test: bool = False, n_retries: int = 1
52+
):
53+
import umap
54+
55+
adata = log_cp10k(adata)
56+
57+
graph = umap.UMAP(transform_mode="graph").fit_transform(adata.X)
58+
59+
adata.obsm["X_emb"] = _diffusion_map(graph, n_comps, t, n_retries=n_retries)
60+
adata.uns["method_code_version"] = check_version("openproblems")
61+
return adata

openproblems/tasks/dimensionality_reduction/metrics/density.py

Lines changed: 3 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -47,7 +47,7 @@ def _calculate_radii(
4747

4848
# directly taken from: https://github.com/lmcinnes/umap/blob/
4949
# 317ce81dc64aec9e279aa1374ac809d9ced236f6/umap/umap_.py#L1190-L1243
50-
(knn_indices, knn_dists, rp_forest,) = nearest_neighbors(
50+
knn_indices, knn_dists, _ = nearest_neighbors(
5151
X,
5252
n_neighbors,
5353
"euclidean",
@@ -57,7 +57,7 @@ def _calculate_radii(
5757
verbose=False,
5858
)
5959

60-
emb_graph, emb_sigmas, emb_rhos, emb_dists = fuzzy_simplicial_set(
60+
emb_graph, _, _, emb_dists = fuzzy_simplicial_set(
6161
X,
6262
n_neighbors,
6363
random_state,
@@ -100,21 +100,15 @@ def _calculate_radii(
100100
"density preservation",
101101
paper_reference="narayan2021assessing",
102102
maximize=True,
103-
image="openproblems-python-extras",
104103
)
105104
def density_preservation(adata: AnnData) -> float:
106105
from scipy.sparse import issparse
107106
from scipy.stats import pearsonr
108-
from umap import UMAP
109107

110108
emb = adata.obsm["X_emb"]
111-
if np.any(np.isnan(emb)):
112-
return 0.0
113109

114110
high_dim = adata.X.A if issparse(adata.X) else adata.X
115-
_, ro, _ = UMAP(
116-
n_neighbors=_K, random_state=_SEED, densmap=True, output_dens=True
117-
).fit_transform(high_dim)
111+
ro = _calculate_radii(high_dim, n_neighbors=_K, random_state=_SEED)
118112
# in principle, we could just call _calculate_radii(high_dim, ...)
119113
# this is made sure that the test pass (otherwise, there was .02 difference in corr)
120114
re = _calculate_radii(emb, n_neighbors=_K, random_state=_SEED)

openproblems/tasks/dimensionality_reduction/metrics/distance_correlation.py

Lines changed: 5 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
from ....tools.decorators import metric
22
from ....tools.normalize import log_cp10k
3+
from ..methods.diffusion_map import diffusion_map
34

45

56
def _distance_correlation(X, X_emb):
@@ -18,7 +19,7 @@ def _distance_correlation(X, X_emb):
1819
maximize=True,
1920
paper_reference="schober2018correlation",
2021
)
21-
def distance_correlation(adata, n_svd=200):
22+
def distance_correlation(adata, n_svd=500):
2223
"""Calculate the root mean squared error.
2324
2425
Computes (RMSE) between the full (or processed) data matrix and the
@@ -37,23 +38,13 @@ def distance_correlation(adata, n_svd=200):
3738
maximize=True,
3839
paper_reference="coifman2006diffusion",
3940
)
40-
def distance_correlation_spectral(adata, n_comps=200):
41+
def distance_correlation_spectral(adata, n_comps=1000):
4142
"""Calculate the spectral root mean squared error
4243
4344
Computes (RMSE) between high-dimensional Laplacian eigenmaps on the full (or
4445
processed) data matrix and the dimensionally-reduced matrix, invariant to scalar
4546
multiplication
4647
"""
47-
import numpy as np
48-
import umap
49-
import umap.spectral
50-
51-
adata = log_cp10k(adata)
52-
5348
n_comps = min(n_comps, min(adata.shape) - 2)
54-
55-
graph = umap.UMAP(transform_mode="graph").fit_transform(adata.X)
56-
X = umap.spectral.spectral_layout(
57-
adata.X, graph, n_comps, random_state=np.random.default_rng()
58-
)
59-
return _distance_correlation(X, adata.obsm["X_emb"])
49+
adata_true = diffusion_map(adata.copy(), n_comps=n_comps)
50+
return _distance_correlation(adata_true.obsm["X_emb"], adata.obsm["X_emb"])

pytest.ini

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,4 +11,5 @@ filterwarnings =
1111
ignore:is_categorical is deprecated and will be removed in a future version:FutureWarning
1212
ignore:The use of (converter|py2rpy|rpy2py) in module rpy2.robjects.conversion is deprecated.:DeprecationWarning
1313
ignore:`Model\.state_updates` will be removed in a future version\.:UserWarning
14+
ignore:Tensorflow not installed. ParametricUMAP will be unavailable:ImportWarning
1415
always:Container failed with AssertionError\. Retrying [0-9]* more time:RuntimeWarning

test/test_task_dimensionality_reduction.py

Lines changed: 35 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -26,8 +26,7 @@ def test_trustworthiness_sparse(): # pragma: nocover
2626
assert 0 <= m <= 1
2727

2828

29-
@utils.docker.docker_test(image=TASK.metrics.density_preservation.metadata["image"])
30-
def test_density_preservation_matches_densmap(): # pragma: nocover
29+
def test_density_preservation_matches_densmap():
3130
from openproblems.tasks.dimensionality_reduction.metrics.density import _K
3231
from openproblems.tasks.dimensionality_reduction.metrics.density import _SEED
3332
from scipy.stats import pearsonr
@@ -52,4 +51,37 @@ def test_density_preservation_matches_densmap(): # pragma: nocover
5251
adata.obsm["X_emb"] = emb
5352
actual = metric(adata)
5453

55-
np.testing.assert_allclose(expected, actual, rtol=1e-5)
54+
np.testing.assert_allclose(expected, actual, rtol=1e-3)
55+
56+
57+
def test_density_preservation_perfect():
58+
import numpy as np
59+
60+
task = openproblems.tasks.dimensionality_reduction
61+
metric = openproblems.tasks.dimensionality_reduction.metrics.density_preservation
62+
63+
adata = task.api.sample_dataset()
64+
adata = task.api.sample_method(adata)
65+
66+
adata.obsm["X_emb"] = adata.X.toarray()
67+
actual = metric(adata)
68+
69+
np.testing.assert_allclose(1, actual)
70+
71+
72+
def test_diffusion_map_no_convergence():
73+
import numpy as np
74+
import scipy.sparse.linalg
75+
76+
adata = (
77+
openproblems.tasks.dimensionality_reduction.datasets.olsson_2016_mouse_blood()
78+
)
79+
# no exception with retries
80+
adata = openproblems.tasks.dimensionality_reduction.methods.diffusion_map(adata)
81+
# exception with no retries
82+
np.testing.assert_raises(
83+
scipy.sparse.linalg.ArpackNoConvergence,
84+
openproblems.tasks.dimensionality_reduction.methods.diffusion_map,
85+
adata,
86+
n_retries=0,
87+
)

0 commit comments

Comments
 (0)