Skip to content

Commit 5c48e06

Browse files
committed
Add session parameter to CrossRef and S2 to mock HTTP requests
1 parent 73389b7 commit 5c48e06

26 files changed

+176
-84
lines changed

litstudy/sources/crossref.py

Lines changed: 19 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -143,9 +143,11 @@ def load(doi):
143143
CROSSREF_URL = "https://api.crossref.org/works/"
144144

145145

146-
def fetch_crossref(doi: str, timeout=0.5) -> Optional[Document]:
146+
def fetch_crossref(doi: str, *, timeout=0.5, session=None) -> Optional[Document]:
147147
"""Fetch the metadata for the given DOI from CrossRef.
148148
149+
:param timeout: The timeout between each HTTP request in seconds.
150+
:param session: The `requests.Session` to use for HTTP requests.
149151
:returns: The `Document` or `None` if the DOI was not available.
150152
"""
151153

@@ -160,7 +162,7 @@ def request():
160162
url = CROSSREF_URL + quote_plus(doi)
161163

162164
try:
163-
response = requests.get(url)
165+
response = session.get(url)
164166
except Exception as e:
165167
logging.warn(f"failed to retrieve {doi}: {e}")
166168
return None
@@ -188,25 +190,27 @@ def request():
188190
return CrossRefDocument(data) if data else None
189191

190192

191-
def refine_crossref(docs: DocumentSet, timeout=0.5) -> Tuple[DocumentSet, DocumentSet]:
193+
def refine_crossref(
194+
docs: DocumentSet, *, timeout=0.5, session=None
195+
) -> Tuple[DocumentSet, DocumentSet]:
192196
"""Attempts to fetch metadata from CrossRef for each document in the given
193197
set. Returns a tuple of two sets: the documents retrieved from CrossRef
194198
and the remaining documents (i.e., without DOI or not found).
195199
196-
:param timeout: Timeout in seconds between each request to throttle
197-
server communication.
200+
:param timeout: Timeout in seconds between each request to throttle server communication.
201+
:param session: The `requests.Session` to use for HTTP requests.
198202
"""
199203

200204
def callback(doc):
201205
if isinstance(doc, CrossRefDocument):
202206
return doc
203207

204-
return fetch_crossref(doc.id.doi, timeout)
208+
return fetch_crossref(doc.id.doi, timeout=timeout, session=session)
205209

206210
return docs._refine_docs(callback)
207211

208212

209-
def _fetch_dois(params: dict, timeout: float, limit: int):
213+
def _fetch_dois(params: dict, timeout: float, limit: int, session):
210214
dois = []
211215

212216
params = dict(params)
@@ -217,11 +221,7 @@ def _fetch_dois(params: dict, timeout: float, limit: int):
217221
query_string = urlencode(params)
218222
url = CROSSREF_URL + "?" + query_string
219223

220-
try:
221-
response = requests.get(url).json()
222-
except Exception as e:
223-
logging.warn(f"failed to retrieve {url}: {e}")
224-
return None
224+
response = session.get(url).json()
225225

226226
# Status should be "ok"
227227
if response["status"] != "ok":
@@ -251,7 +251,7 @@ def _fetch_dois(params: dict, timeout: float, limit: int):
251251

252252

253253
def search_crossref(
254-
query: str, limit: int = None, timeout: float = 0.5, options: dict = dict()
254+
query: str, *, limit: int = None, timeout: float = 0.5, options: dict = dict(), session=None
255255
) -> DocumentSet:
256256
"""Submit the query to the CrossRef API.
257257
@@ -263,10 +263,14 @@ def search_crossref(
263263
endpoint of CrossRef (see `CrossRef API`
264264
<https://api.crossref.org>`_). Options are `sort` and
265265
`filter`.
266+
:param session: The `requests.Session` to use for HTTP requests.
266267
"""
267268
if not query:
268269
return DocumentSet()
269270

271+
if session is None:
272+
session = requests.Session()
273+
270274
params = dict()
271275
params["query"] = query
272276
params["select"] = "DOI"
@@ -280,13 +284,13 @@ def search_crossref(
280284

281285
with shelve.open(CACHE_FILE) as cache:
282286
if cache_key not in cache:
283-
dois = _fetch_dois(params, timeout, limit)
287+
dois = _fetch_dois(params, timeout, limit, session)
284288
cache[cache_key] = dois
285289
else:
286290
dois = cache[cache_key]
287291

288292
docs = []
289293
for doi in progress_bar(dois):
290-
docs.append(fetch_crossref(doi))
294+
docs.append(fetch_crossref(doi, session=session, timeout=timeout))
291295

292296
return DocumentSet(docs)

litstudy/sources/semanticscholar.py

Lines changed: 42 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -96,48 +96,47 @@ def load(id):
9696
DEFAULT_TIMEOUT = 3.05 # 100 requests per 5 minutes
9797

9898

99-
def request_query(query, offset, limit, cache, timeout=DEFAULT_TIMEOUT):
100-
cache_key = f"results={query};{offset}"
101-
if cache_key in cache:
102-
return cache[cache_key]
103-
104-
url = S2_QUERY_URL
105-
params = dict(offset=offset, query=query, limit=limit)
106-
reply = requests.get(url, params=params)
99+
def request_query(query, offset, limit, cache, session, timeout=DEFAULT_TIMEOUT):
100+
params = urlencode(dict(query=query, offset=offset, limit=limit))
101+
url = f"{S2_QUERY_URL}?{params}"
102+
103+
if url in cache:
104+
return cache[url]
105+
106+
reply = session.get(url)
107107
response = reply.json()
108108

109109
if "data" not in response:
110110
msg = response.get("error") or response.get("message") or "unknown"
111111
raise Exception(f"error while fetching {reply.url}: {msg}")
112112

113-
cache[cache_key] = response
113+
cache[url] = response
114114
return response
115115

116116

117-
def request_paper(key, cache, timeout=DEFAULT_TIMEOUT):
118-
cache_key = urlencode(dict(paper=key))
119-
if cache_key in cache:
120-
return cache[cache_key]
121-
117+
def request_paper(key, cache, session, timeout=DEFAULT_TIMEOUT):
122118
url = S2_PAPER_URL + quote_plus(key)
123119

120+
if url in cache:
121+
return cache[url]
122+
124123
try:
125124
sleep(timeout)
126-
data = requests.get(url).json()
125+
data = session.get(url).json()
127126
except Exception as e:
128127
logging.warning(f"failed to retrieve {key}: {e}")
129128
return None
130129

131-
if "paperId" in data:
132-
cache[cache_key] = data
133-
return data
134-
else:
130+
if "paperId" not in data:
135131
msg = data.get("error") or data.get("message") or "unknown error"
136132
logging.warning(f"failed to retrieve {key}: {msg}")
137133
return None
138134

135+
cache[url] = data
136+
return data
139137

140-
def fetch_semanticscholar(key: set) -> Optional[Document]:
138+
139+
def fetch_semanticscholar(key: set, *, session=None) -> Optional[Document]:
141140
"""Fetch SemanticScholar metadata for the given key. The key can be
142141
one of the following (see `API reference
143142
<https://www.semanticscholar.org/product/api>`_):
@@ -150,63 +149,76 @@ def fetch_semanticscholar(key: set) -> Optional[Document]:
150149
* PubMed ID (example format: `PMID:19872477`)
151150
* Corpus ID (example format: `CorpusID:37220927`)
152151
152+
:param session: The `requests.Session` to use for HTTP requests.
153153
:returns: The `Document` if it was found and `None` otherwise.
154154
"""
155155

156156
if key is None:
157157
return None
158158

159+
if session is None:
160+
session = requests.Session()
161+
159162
with shelve.open(CACHE_FILE) as cache:
160163
if isinstance(key, DocumentIdentifier):
161164
data = None
162165
if data is None and key.s2id:
163-
data = request_paper(key.s2id, cache)
166+
data = request_paper(key.s2id, cache, session)
164167

165168
if data is None and key.doi:
166-
data = request_paper(key.doi, cache)
169+
data = request_paper(key.doi, cache, session)
167170

168171
if data is None and key.pubmed:
169-
data = request_paper(f"PMID:{key.pubmed}", cache)
172+
data = request_paper(f"PMID:{key.pubmed}", cache, session)
170173

171174
if data is None and key.arxivid:
172-
data = request_paper(f"arXiv:{key.arxivid}", cache)
175+
data = request_paper(f"arXiv:{key.arxivid}", cache, session)
173176
else:
174-
data = request_paper(key, cache)
177+
data = request_paper(key, cache, session)
175178

176179
if data is None:
177180
return None
178181

179182
return ScholarDocument(data)
180183

181184

182-
def refine_semanticscholar(docs: DocumentSet) -> Tuple[DocumentSet, DocumentSet]:
185+
def refine_semanticscholar(docs: DocumentSet, *, session=None) -> Tuple[DocumentSet, DocumentSet]:
183186
"""Attempt to fetch SemanticScholar metadata for each document in the
184187
given set based on their DOIs. Returns a tuple containing two sets: the
185188
documents available on SemanticScholar and the remaining documents that
186189
were not found or do not have a DOI.
190+
191+
:param session: The `requests.Session` to use for HTTP requests.
192+
:returns: The documents available on SemanticScholar and the remaining documents.
187193
"""
188194

189195
def callback(doc):
190196
if isinstance(doc, ScholarDocument):
191197
return doc
192198

193-
return fetch_semanticscholar(doc.id)
199+
return fetch_semanticscholar(doc.id, session=session)
194200

195201
return docs._refine_docs(callback)
196202

197203

198-
def search_semanticscholar(query: str, *, limit: int = None, batch_size: int = 100) -> DocumentSet:
204+
def search_semanticscholar(
205+
query: str, *, limit: int = None, batch_size: int = 100, session=None
206+
) -> DocumentSet:
199207
"""Submit the given query to SemanticScholar API and return the results
200208
as a `DocumentSet`.
201209
202210
:param query: The search query to submit.
203211
:param limit: The maximum number of results to return.
204212
:param batch_size: The number of results to retrieve per request. Must be at most 100.
213+
:param session: The `requests.Session` to use for HTTP requests.
205214
"""
206215

207216
if not query:
208217
raise Exception("no query specified in `search_semanticscholar`")
209218

219+
if session is None:
220+
session = requests.Session()
221+
210222
docs = []
211223

212224
with shelve.open(CACHE_FILE) as cache:
@@ -215,7 +227,7 @@ def search_semanticscholar(query: str, *, limit: int = None, batch_size: int = 1
215227
while True:
216228
offset = len(paper_ids)
217229

218-
response = request_query(query, offset, batch_size, cache)
230+
response = request_query(query, offset, batch_size, cache, session)
219231
if not response:
220232
break
221233

@@ -235,7 +247,7 @@ def search_semanticscholar(query: str, *, limit: int = None, batch_size: int = 1
235247
break
236248

237249
for paper_id in progress_bar(paper_ids):
238-
doc = request_paper(paper_id, cache)
250+
doc = request_paper(paper_id, cache, session)
239251

240252
if doc:
241253
docs.append(ScholarDocument(doc))

tests/common.py

Lines changed: 58 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,58 @@
1+
import json
2+
import sys
3+
import re
4+
import os
5+
import requests
6+
import pickle
7+
import hashlib
8+
9+
10+
class MockResponse:
    """Minimal stand-in for a ``requests.Response`` backed by recorded data.

    ``data`` is a mapping that holds at least the keys ``"status_code"``
    and ``"content"`` (the raw response body).
    """

    def __init__(self, data):
        # Keep the raw recorded payload; the accessors below read it lazily.
        self.data = data

    @property
    def status_code(self):
        """HTTP status code of the recorded response."""
        return self.data["status_code"]

    @property
    def content(self):
        """Raw body of the recorded response."""
        return self.data["content"]

    def json(self):
        """Decode the recorded body as JSON and return the result."""
        return json.loads(self.data["content"])
24+
25+
26+
class MockSession:
    """Drop-in replacement for ``requests.Session`` that replays recorded responses.

    Responses are stored on disk as pickle files, keyed by the SHA-1 hash of
    the requested URL. When a URL has no recording, a real HTTP request is
    performed and recorded only if ``allow_requests`` is enabled; otherwise a
    ``KeyError`` is raised so tests fail loudly instead of silently hitting
    the network.
    """

    def __init__(self, directory=None, allow_requests=None):
        """
        :param directory: Directory that holds the recorded responses.
            Defaults to the ``requests/`` directory next to this file.
        :param allow_requests: Whether unrecorded URLs may be fetched over
            the network. Defaults to the ``LITSTUDY_ALLOW_REQUESTS``
            environment variable; ``""``, ``"0"``, ``"false"`` and ``"no"``
            (case-insensitive) count as disabled.
        """
        if directory is None:
            directory = os.path.join(
                os.path.dirname(os.path.realpath(__file__)), "requests"
            )

        if allow_requests is None:
            # Bug fix: the previous `bool(os.environ.get(...))` treated
            # "0"/"false" as *enabled*, since any non-empty string is truthy.
            # Parse the common falsy spellings explicitly instead.
            value = os.environ.get("LITSTUDY_ALLOW_REQUESTS", "")
            allow_requests = value.strip().lower() not in ("", "0", "false", "no")

        self.directory = directory
        self.allow_requests = allow_requests

    def _clean_url(self, url):
        # Hash the URL so it maps to a safe, fixed-length file name.
        return hashlib.sha1(url.encode("utf8")).hexdigest()

    def get(self, url):
        """Return the recorded ``MockResponse`` for ``url``.

        On a cache miss the URL is fetched and recorded when
        ``allow_requests`` is set; otherwise a ``KeyError`` is raised.
        """
        filename = os.path.join(self.directory, self._clean_url(url) + ".pickle")

        if not os.path.exists(filename):
            if not self.allow_requests:
                raise KeyError(f"URL not registered with MockSession: {url}")

            response = requests.get(url)
            data = dict(
                url=url,
                status_code=response.status_code,
                content=response.content,
            )

            # Create the recording directory on first use so a fresh checkout
            # can record new responses without manual setup.
            os.makedirs(self.directory, exist_ok=True)
            with open(filename, "wb") as f:
                f.write(pickle.dumps(data))

        with open(filename, "rb") as f:
            return MockResponse(pickle.loads(f.read()))
13.9 KB
Binary file not shown.
347 KB
Binary file not shown.
893 Bytes
Binary file not shown.
Binary file not shown.
254 KB
Binary file not shown.
11.9 KB
Binary file not shown.
12.8 KB
Binary file not shown.

0 commit comments

Comments
 (0)