Fix DR baselines (openproblems-bio#816)

scottgigante-immunai · web-flow · commit 80b37e7a6aa2 · 2023-02-08T10:38:22.000-05:00
* fix DR baselines

* add density test

* fix dataset prep

* bugfix

* can't have >500 comps

* ignore missing parametricumap

* typo

* account for arpack convergence
diff --git a/openproblems/tasks/dimensionality_reduction/datasets/__init__.py b/openproblems/tasks/dimensionality_reduction/datasets/__init__.py
@@ -1,3 +1,4 @@
 from .mouse_blood_olsson_labelled import olsson_2016_mouse_blood
 from .mouse_hspc_nestorowa2016 import mouse_hspc_nestorowa2016
 from .tenx_5k_pbmc import tenx_5k_pbmc
+from .zebrafish import zebrafish_labs
diff --git a/openproblems/tasks/dimensionality_reduction/datasets/zebrafish.py b/openproblems/tasks/dimensionality_reduction/datasets/zebrafish.py
@@ -0,0 +1,18 @@
+from ....data.zebrafish import load_zebrafish
+from ....tools.decorators import dataset
+from ....tools.normalize import log_cp10k
+
+
+@dataset(
+    "Zebrafish",
+    data_url=load_zebrafish.metadata["data_url"],
+    data_reference=load_zebrafish.metadata["data_reference"],
+    dataset_summary="90k cells from zebrafish embryos throughout the first day of "
+    "development, with and without a knockout of chordin, an important developmental "
+    "gene. Dimensions: 26022 cells, 25258 genes. 24 cell types "
+    "(avg. 1084±1156 cells per cell type).",
+)
+def zebrafish_labs(test=False):
+    adata = load_zebrafish(test=test)
+    adata.uns["n_genes"] = adata.shape[1]
+    return log_cp10k(adata)
diff --git a/openproblems/tasks/dimensionality_reduction/methods/__init__.py b/openproblems/tasks/dimensionality_reduction/methods/__init__.py
@@ -1,7 +1,7 @@
 from .baseline import random_features
+from .baseline import spectral_features
 from .baseline import true_features
-from .baseline import true_features_log_cp10k
-from .baseline import true_features_log_cp10k_hvg
+from .diffusion_map import diffusion_map
 from .neuralee import neuralee_default
 from .neuralee import neuralee_logCP10k_1kHVG
 from .pca import pca_logCP10k
diff --git a/openproblems/tasks/dimensionality_reduction/methods/baseline.py b/openproblems/tasks/dimensionality_reduction/methods/baseline.py
@@ -1,7 +1,8 @@
 from ....tools.decorators import method
 from ....tools.normalize import log_cp10k
-from ....tools.normalize import log_cp10k_hvg
 from ....tools.utils import check_version
+from .diffusion_map import diffusion_map
+from typing import Optional
 
 import functools
 import numpy as np
@@ -29,19 +30,6 @@ def random_features(adata, test=False):
     method_name="True Features",
 )
 def true_features(adata, test=False):
-    adata.obsm["X_emb"] = adata.X
-    if test:
-        adata.obsm["X_emb"] = adata.obsm["X_emb"][:, :100]
-
-    adata.obsm["X_emb"] = adata.obsm["X_emb"].toarray()
-    adata.uns["method_code_version"] = check_version("openproblems")
-    return adata
-
-
-@_baseline_method(
-    method_name="True Features (logCP10k)",
-)
-def true_features_log_cp10k(adata, test=False):
     adata = log_cp10k(adata)
     adata.obsm["X_emb"] = adata.X
     if test:
@@ -53,14 +41,15 @@ def true_features_log_cp10k(adata, test=False):
 
 
 @_baseline_method(
-    method_name="True Features (logCP10k, 1kHVG)",
+    method_name="Spectral Features",
 )
-def true_features_log_cp10k_hvg(adata, test=False):
-    adata = log_cp10k_hvg(adata)
-    adata.obsm["X_emb"] = adata[:, adata.var["highly_variable"]].copy().X
+def spectral_features(adata, test=False, n_comps: Optional[int] = None):
+
     if test:
-        adata.obsm["X_emb"] = adata.obsm["X_emb"][:, :100]
+        n_comps = n_comps or 20
+    else:
+        n_comps = n_comps or 1000
 
-    adata.obsm["X_emb"] = adata.obsm["X_emb"].toarray()
-    adata.uns["method_code_version"] = check_version("openproblems")
-    return adata
+    n_comps = min(n_comps, min(adata.shape) - 2)
+
+    return diffusion_map(adata, n_comps=n_comps)
diff --git a/openproblems/tasks/dimensionality_reduction/methods/diffusion_map.py b/openproblems/tasks/dimensionality_reduction/methods/diffusion_map.py
@@ -0,0 +1,61 @@
+from ....tools.decorators import method
+from ....tools.normalize import log_cp10k
+from ....tools.utils import check_version
+
+
+def _diffusion_map(graph, n_comps, t, n_retries=1):
+    import numpy as np
+    import scipy.sparse.linalg
+
+    diag_data = np.asarray(graph.sum(axis=0))
+    identity = scipy.sparse.identity(graph.shape[0], dtype=np.float64)
+    diag = scipy.sparse.spdiags(
+        1.0 / np.sqrt(diag_data), 0, graph.shape[0], graph.shape[0]
+    )
+    laplacian = identity - diag * graph * diag
+    num_lanczos_vectors = max(2 * n_comps + 1, int(np.sqrt(graph.shape[0])))
+    try:
+        eigenvalues, eigenvectors = scipy.sparse.linalg.eigsh(
+            laplacian,
+            n_comps,
+            which="SM",
+            ncv=num_lanczos_vectors,
+            tol=1e-4,
+            v0=np.ones(laplacian.shape[0]),
+            maxiter=graph.shape[0] * 5,
+        )
+        return (eigenvalues**t) * eigenvectors
+    except scipy.sparse.linalg.ArpackNoConvergence:
+        if n_retries > 0:
+            # add some noise and try again
+            graph_rand = graph.copy().tocoo()
+            graph_rand.row = np.random.choice(
+                graph_rand.shape[0], len(graph_rand.row), replace=True
+            )
+            graph_rand.data *= 0.01
+            return _diffusion_map(
+                graph + graph_rand, n_comps, t, n_retries=n_retries - 1
+            )
+        else:
+            raise
+
+
+@method(
+    method_name="Diffusion maps",
+    paper_reference="coifman2006diffusion",
+    paper_name="Diffusion maps",
+    paper_year=2006,
+    code_url="https://github.com/openproblems-bio/openproblems",
+)
+def diffusion_map(
+    adata, n_comps: int = 2, t: int = 1, test: bool = False, n_retries: int = 1
+):
+    import umap
+
+    adata = log_cp10k(adata)
+
+    graph = umap.UMAP(transform_mode="graph").fit_transform(adata.X)
+
+    adata.obsm["X_emb"] = _diffusion_map(graph, n_comps, t, n_retries=n_retries)
+    adata.uns["method_code_version"] = check_version("openproblems")
+    return adata
diff --git a/openproblems/tasks/dimensionality_reduction/metrics/density.py b/openproblems/tasks/dimensionality_reduction/metrics/density.py
@@ -47,7 +47,7 @@ def _calculate_radii(
 
     # directly taken from: https://github.com/lmcinnes/umap/blob/
     # 317ce81dc64aec9e279aa1374ac809d9ced236f6/umap/umap_.py#L1190-L1243
-    (knn_indices, knn_dists, rp_forest,) = nearest_neighbors(
+    knn_indices, knn_dists, _ = nearest_neighbors(
         X,
         n_neighbors,
         "euclidean",
@@ -57,7 +57,7 @@ def _calculate_radii(
         verbose=False,
     )
 
-    emb_graph, emb_sigmas, emb_rhos, emb_dists = fuzzy_simplicial_set(
+    emb_graph, _, _, emb_dists = fuzzy_simplicial_set(
         X,
         n_neighbors,
         random_state,
@@ -100,21 +100,15 @@ def _calculate_radii(
     "density preservation",
     paper_reference="narayan2021assessing",
     maximize=True,
-    image="openproblems-python-extras",
 )
 def density_preservation(adata: AnnData) -> float:
     from scipy.sparse import issparse
     from scipy.stats import pearsonr
-    from umap import UMAP
 
     emb = adata.obsm["X_emb"]
-    if np.any(np.isnan(emb)):
-        return 0.0
 
     high_dim = adata.X.A if issparse(adata.X) else adata.X
-    _, ro, _ = UMAP(
-        n_neighbors=_K, random_state=_SEED, densmap=True, output_dens=True
-    ).fit_transform(high_dim)
+    ro = _calculate_radii(high_dim, n_neighbors=_K, random_state=_SEED)
     # in principle, we could just call _calculate_radii(high_dim, ...)
     # this is made sure that the test pass (otherwise, there was .02 difference in corr)
     re = _calculate_radii(emb, n_neighbors=_K, random_state=_SEED)
diff --git a/openproblems/tasks/dimensionality_reduction/metrics/distance_correlation.py b/openproblems/tasks/dimensionality_reduction/metrics/distance_correlation.py
@@ -1,5 +1,6 @@
 from ....tools.decorators import metric
 from ....tools.normalize import log_cp10k
+from ..methods.diffusion_map import diffusion_map
 
 
 def _distance_correlation(X, X_emb):
@@ -18,7 +19,7 @@ def _distance_correlation(X, X_emb):
     maximize=True,
     paper_reference="schober2018correlation",
 )
-def distance_correlation(adata, n_svd=200):
+def distance_correlation(adata, n_svd=500):
     """Calculate the root mean squared error.
 
     Computes (RMSE) between the full (or processed) data matrix and the
@@ -37,23 +38,13 @@ def distance_correlation(adata, n_svd=200):
     maximize=True,
     paper_reference="coifman2006diffusion",
 )
-def distance_correlation_spectral(adata, n_comps=200):
+def distance_correlation_spectral(adata, n_comps=1000):
     """Calculate the spectral root mean squared error
 
     Computes (RMSE) between high-dimensional Laplacian eigenmaps on the full (or
     processed) data matrix and the dimensionally-reduced matrix, invariant to scalar
     multiplication
     """
-    import numpy as np
-    import umap
-    import umap.spectral
-
-    adata = log_cp10k(adata)
-
     n_comps = min(n_comps, min(adata.shape) - 2)
-
-    graph = umap.UMAP(transform_mode="graph").fit_transform(adata.X)
-    X = umap.spectral.spectral_layout(
-        adata.X, graph, n_comps, random_state=np.random.default_rng()
-    )
-    return _distance_correlation(X, adata.obsm["X_emb"])
+    adata_true = diffusion_map(adata.copy(), n_comps=n_comps)
+    return _distance_correlation(adata_true.obsm["X_emb"], adata.obsm["X_emb"])
diff --git a/pytest.ini b/pytest.ini
@@ -11,4 +11,5 @@ filterwarnings =
     ignore:is_categorical is deprecated and will be removed in a future version:FutureWarning
     ignore:The use of (converter|py2rpy|rpy2py) in module rpy2.robjects.conversion is deprecated.:DeprecationWarning
     ignore:`Model\.state_updates` will be removed in a future version\.:UserWarning
+    ignore:Tensorflow not installed. ParametricUMAP will be unavailable:ImportWarning
     always:Container failed with AssertionError\. Retrying [0-9]* more time:RuntimeWarning
diff --git a/test/test_task_dimensionality_reduction.py b/test/test_task_dimensionality_reduction.py
@@ -26,8 +26,7 @@ def test_trustworthiness_sparse():  # pragma: nocover
     assert 0 <= m <= 1
 
 
-@utils.docker.docker_test(image=TASK.metrics.density_preservation.metadata["image"])
-def test_density_preservation_matches_densmap():  # pragma: nocover
+def test_density_preservation_matches_densmap():
     from openproblems.tasks.dimensionality_reduction.metrics.density import _K
     from openproblems.tasks.dimensionality_reduction.metrics.density import _SEED
     from scipy.stats import pearsonr
@@ -52,4 +51,37 @@ def test_density_preservation_matches_densmap():  # pragma: nocover
     adata.obsm["X_emb"] = emb
     actual = metric(adata)
 
-    np.testing.assert_allclose(expected, actual, rtol=1e-5)
+    np.testing.assert_allclose(expected, actual, rtol=1e-3)
+
+
+def test_density_preservation_perfect():
+    import numpy as np
+
+    task = openproblems.tasks.dimensionality_reduction
+    metric = openproblems.tasks.dimensionality_reduction.metrics.density_preservation
+
+    adata = task.api.sample_dataset()
+    adata = task.api.sample_method(adata)
+
+    adata.obsm["X_emb"] = adata.X.toarray()
+    actual = metric(adata)
+
+    np.testing.assert_allclose(1, actual)
+
+
+def test_diffusion_map_no_convergence():
+    import numpy as np
+    import scipy.sparse.linalg
+
+    adata = (
+        openproblems.tasks.dimensionality_reduction.datasets.olsson_2016_mouse_blood()
+    )
+    # no exception with retries
+    adata = openproblems.tasks.dimensionality_reduction.methods.diffusion_map(adata)
+    # exception with no retries
+    np.testing.assert_raises(
+        scipy.sparse.linalg.ArpackNoConvergence,
+        openproblems.tasks.dimensionality_reduction.methods.diffusion_map,
+        adata,
+        n_retries=0,
+    )