Skip to content

Commit 6e7fe21

Browse files
jpintarJura Pintarpre-commit-ci[bot]flying-sheep
authored
feat: add jaccard as an additional method in sc.pp.neighbors (#3831)
Co-authored-by: Jura Pintar <[email protected]> Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Philipp A. <[email protected]>
1 parent 800c0b0 commit 6e7fe21

File tree

5 files changed

+101
-13
lines changed

5 files changed

+101
-13
lines changed

docs/release-notes/3831.feat.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
Add `method='jaccard'` for generating connectivities in {func}`scanpy.pp.neighbors` {smaller}`J Pintar`

src/scanpy/neighbors/__init__.py

Lines changed: 17 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -86,9 +86,10 @@ def neighbors( # noqa: PLR0913
8686
8787
The neighbor search efficiency of this heavily relies on UMAP :cite:p:`McInnes2018`,
8888
which also provides a method for estimating connectivities of data points -
89-
the connectivity of the manifold (`method=='umap'`). If `method=='gauss'`,
90-
connectivities are computed according to :cite:t:`Coifman2005`, in the adaption of
91-
:cite:t:`Haghverdi2016`.
89+
the connectivity of the manifold (`method=='umap'`).
90+
If `method=='gauss'`, connectivities are computed according to :cite:t:`Coifman2005`,
91+
in the adaption of :cite:t:`Haghverdi2016`.
92+
If `method=='jaccard'`, connectivities are computed as in PhenoGraph :cite:p:`Levine2015`.
9293
9394
Parameters
9495
----------
@@ -112,8 +113,10 @@ def neighbors( # noqa: PLR0913
112113
Kernel to assign low weights to neighbors more distant than the
113114
`n_neighbors` nearest neighbor.
114115
method
115-
Use 'umap' :cite:p:`McInnes2018` or 'gauss' (Gauss kernel following :cite:t:`Coifman2005`
116-
with adaptive width :cite:t:`Haghverdi2016`) for computing connectivities.
116+
Use 'umap' :cite:p:`McInnes2018`,
117+
'gauss' (Gauss kernel following :cite:t:`Coifman2005` with adaptive width :cite:t:`Haghverdi2016`),
118+
or 'jaccard' (Jaccard kernel as in PhenoGraph, :cite:t:`Levine2015`)
119+
for computing connectivities.
117120
transformer
118121
Approximate kNN search implementation following the API of
119122
:class:`~sklearn.neighbors.KNeighborsTransformer`.
@@ -607,6 +610,12 @@ def compute_neighbors( # noqa: PLR0912
607610
self._connectivities = _connectivity.gauss(
608611
self._distances, self.n_neighbors, knn=self.knn
609612
)
613+
elif method == "jaccard":
614+
self._connectivities = _connectivity.jaccard(
615+
knn_indices,
616+
n_obs=self._adata.shape[0],
617+
n_neighbors=self.n_neighbors,
618+
)
610619
elif method is not None:
611620
msg = f"{method!r} should have been coerced in _handle_transform_args"
612621
raise AssertionError(msg)
@@ -629,7 +638,7 @@ def _handle_transformer(
629638
) -> tuple[_Method | None, KnnTransformerLike, bool]:
630639
"""Return effective `method` and transformer.
631640
632-
`method` will be coerced to `'gauss'` or `'umap'`.
641+
`method` will be coerced to `'gauss'`, `'umap'`, or `'jaccard'`.
633642
`transformer` is coerced from a str or instance to an instance class.
634643
635644
If `transformer` is `None` and there are few data points,
@@ -648,7 +657,7 @@ def _handle_transformer(
648657
transformer is None and (use_dense_distances or self._adata.n_obs < 4096)
649658
)
650659

651-
# Coerce `method` to 'gauss' or 'umap'
660+
# Coerce `method` to 'gauss', 'umap', or 'jaccard'
652661
if method == "rapids":
653662
if transformer is not None:
654663
msg = "Can’t specify both `method = 'rapids'` and `transformer`."
@@ -662,7 +671,7 @@ def _handle_transformer(
662671
raise ValueError(msg)
663672

664673
# Validate `knn`
665-
conn_method = method if method in {"gauss", None} else "umap"
674+
conn_method = method if method in {"gauss", "jaccard", None} else "umap"
666675
if not knn and not (conn_method == "gauss" and transformer is None):
667676
# “knn=False” seems to be only intended for method “gauss”
668677
msg = f"`method = {method!r} only with `knn = True`."

src/scanpy/neighbors/_connectivity.py

Lines changed: 49 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010
from ._common import (
1111
_get_indices_distances_from_dense_matrix,
1212
_get_indices_distances_from_sparse_matrix,
13+
_get_sparse_matrix_from_indices_distances,
1314
)
1415

1516

@@ -135,3 +136,51 @@ def umap(
135136
)
136137

137138
return connectivities.tocsr()
139+
140+
141+
def jaccard(
142+
knn_indices: NDArray[np.int32 | np.int64],
143+
*,
144+
n_obs: int,
145+
n_neighbors: int,
146+
) -> CSRBase:
147+
"""Derive Jaccard connectivities between data points from kNN indices.
148+
149+
Re-implements the weighting method from Phenograph, :cite:p:`Levine2015`.
150+
151+
Parameters
152+
----------
153+
knn_indices
154+
The input matrix of nearest neighbor indices for each cell.
155+
n_obs
156+
Number of cells in the data-set.
157+
n_neighbors
158+
The number of nearest neighbors to consider.
159+
160+
"""
161+
# Construct unweighted kNN adjacency matrix (self excluded, as in PhenoGraph)
162+
adjacency = _get_sparse_matrix_from_indices_distances(
163+
knn_indices, np.ones_like(knn_indices), keep_self=False
164+
)
165+
166+
# Compute |N(i) ∩ N(j)|
167+
i_idx = np.repeat(np.arange(n_obs), n_neighbors - 1)
168+
j_idx = knn_indices[:, 1:].ravel()
169+
rows_i = adjacency[i_idx, :]
170+
rows_j = adjacency[j_idx, :]
171+
shared = np.asarray(rows_i.multiply(rows_j).sum(axis=1)).ravel()
172+
173+
# Jaccard index
174+
jaccard = shared / (2 * (n_neighbors - 1) - shared)
175+
176+
# Build connectivity matrix, filtering out zeros
177+
mask = jaccard != 0
178+
connectivities = sparse.csr_matrix( # noqa: TID251
179+
(jaccard[mask], (i_idx[mask], j_idx[mask])),
180+
shape=(n_obs, n_obs),
181+
)
182+
183+
# Symmetrize by averaging (as default in PhenoGraph)
184+
connectivities = (connectivities + connectivities.T) / 2
185+
186+
return connectivities

src/scanpy/neighbors/_types.py

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -20,8 +20,7 @@
2020
"_MetricSparseCapable",
2121
]
2222

23-
24-
type _Method = Literal["umap", "gauss"]
23+
type _Method = Literal["umap", "gauss", "jaccard"]
2524
type _KnownTransformer = Literal["pynndescent", "sklearn", "rapids"]
2625

2726
type _MetricFn = Callable[[np.ndarray, np.ndarray], float]

tests/test_neighbors.py

Lines changed: 33 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -122,6 +122,29 @@
122122
]
123123

124124

125+
# jaccard kernel – only knn results
126+
connectivities_jaccard = [
127+
[0.0, 0.3333333333333333, 0.0, 0.3333333333333333],
128+
[0.3333333333333333, 0.0, 0.16666666666666666, 0.3333333333333333],
129+
[0.0, 0.16666666666666666, 0.0, 0.16666666666666666],
130+
[0.3333333333333333, 0.3333333333333333, 0.16666666666666666, 0.0],
131+
]
132+
133+
transitions_sym_jaccard = [
134+
[0.0, 0.4225771273642583, 0.0, 0.4225771273642583],
135+
[0.4225771273642583, 0.0, 0.4225771273642583, 0.2857142857142857],
136+
[0.0, 0.4225771273642583, 0.0, 0.4225771273642583],
137+
[0.4225771273642583, 0.2857142857142857, 0.4225771273642583, 0.0],
138+
]
139+
140+
transitions_jaccard = [
141+
[0.0, 0.5, 0.0, 0.5],
142+
[0.35714285714285715, 0.0, 0.35714285714285715, 0.2857142857142857],
143+
[0.0, 0.5, 0.0, 0.5],
144+
[0.35714285714285715, 0.2857142857142857, 0.35714285714285715, 0.0],
145+
]
146+
147+
125148
def get_neighbors() -> Neighbors:
126149
return Neighbors(AnnData(np.array(X)))
127150

@@ -131,11 +154,11 @@ def neigh() -> Neighbors:
131154
return get_neighbors()
132155

133156

134-
@pytest.mark.parametrize("method", ["umap", "gauss"])
157+
@pytest.mark.parametrize("method", ["umap", "gauss", "jaccard"])
135158
def test_distances_euclidean(
136-
mocker: MockerFixture, neigh: Neighbors, method: Literal["umap", "gauss"]
159+
mocker: MockerFixture, neigh: Neighbors, method: Literal["umap", "gauss", "jaccard"]
137160
):
138-
"""Umap and gauss behave the same for distances.
161+
"""Umap, gauss, and jaccard behave the same for distances.
139162
140163
They call pynndescent for large data.
141164
"""
@@ -193,6 +216,13 @@ def test_distances_all(neigh: Neighbors, transformer, knn):
193216
transitions_sym_gauss_knn,
194217
id="gauss",
195218
),
219+
pytest.param(
220+
"jaccard",
221+
connectivities_jaccard,
222+
transitions_jaccard,
223+
transitions_sym_jaccard,
224+
id="jaccard",
225+
),
196226
],
197227
)
198228
def test_connectivities_euclidean(neigh: Neighbors, method, conn, trans, trans_sym):

0 commit comments

Comments
 (0)