Skip to content

Commit 5c48e06

Browse files
committed
Add session parameter to CrossRef and S2 to mock HTTP requests
1 parent 73389b7 commit 5c48e06

26 files changed

+176
-84
lines changed

litstudy/sources/crossref.py

Lines changed: 19 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -143,9 +143,11 @@ def load(doi):
143143
CROSSREF_URL = "https://api.crossref.org/works/"
144144

145145

146-
def fetch_crossref(doi: str, timeout=0.5) -> Optional[Document]:
146+
def fetch_crossref(doi: str, *, timeout=0.5, session=None) -> Optional[Document]:
147147
"""Fetch the metadata for the given DOI from CrossRef.
148148
149+
:param timeout: The timeout between each HTTP request in seconds.
150+
:param session: The `requests.Session` to use for HTTP requests.
149151
:returns: The `Document` or `None` if the DOI was not available.
150152
"""
151153

@@ -160,7 +162,7 @@ def request():
160162
url = CROSSREF_URL + quote_plus(doi)
161163

162164
try:
163-
response = requests.get(url)
165+
response = session.get(url)
164166
except Exception as e:
165167
logging.warn(f"failed to retrieve {doi}: {e}")
166168
return None
@@ -188,25 +190,27 @@ def request():
188190
return CrossRefDocument(data) if data else None
189191

190192

191-
def refine_crossref(docs: DocumentSet, timeout=0.5) -> Tuple[DocumentSet, DocumentSet]:
193+
def refine_crossref(
194+
docs: DocumentSet, *, timeout=0.5, session=None
195+
) -> Tuple[DocumentSet, DocumentSet]:
192196
"""Attempts to fetch metadata from CrossRef for each document in the given
193197
set. Returns a tuple of two sets: the documents retrieved from CrossRef
194198
and the remaining documents (i.e., without DOI or not found).
195199
196-
:param timeout: Timeout in seconds between each request to throttle
197-
server communication.
200+
:param timeout: Timeout in seconds between each request to throttle server communication.
201+
:param session: The `requests.Session` to use for HTTP requests.
198202
"""
199203

200204
def callback(doc):
201205
if isinstance(doc, CrossRefDocument):
202206
return doc
203207

204-
return fetch_crossref(doc.id.doi, timeout)
208+
return fetch_crossref(doc.id.doi, timeout=timeout, session=session)
205209

206210
return docs._refine_docs(callback)
207211

208212

209-
def _fetch_dois(params: dict, timeout: float, limit: int):
213+
def _fetch_dois(params: dict, timeout: float, limit: int, session):
210214
dois = []
211215

212216
params = dict(params)
@@ -217,11 +221,7 @@ def _fetch_dois(params: dict, timeout: float, limit: int):
217221
query_string = urlencode(params)
218222
url = CROSSREF_URL + "?" + query_string
219223

220-
try:
221-
response = requests.get(url).json()
222-
except Exception as e:
223-
logging.warn(f"failed to retrieve {url}: {e}")
224-
return None
224+
response = session.get(url).json()
225225

226226
# Status should be "ok"
227227
if response["status"] != "ok":
@@ -251,7 +251,7 @@ def _fetch_dois(params: dict, timeout: float, limit: int):
251251

252252

253253
def search_crossref(
254-
query: str, limit: int = None, timeout: float = 0.5, options: dict = dict()
254+
query: str, *, limit: int = None, timeout: float = 0.5, options: dict = dict(), session=None
255255
) -> DocumentSet:
256256
"""Submit the query to the CrossRef API.
257257
@@ -263,10 +263,14 @@ def search_crossref(
263263
endpoint of CrossRef (see `CrossRef API`
264264
<https://api.crossref.org>`_). Options are `sort` and
265265
`filter`.
266+
:param session: The `requests.Session` to use for HTTP requests.
266267
"""
267268
if not query:
268269
return DocumentSet()
269270

271+
if session is None:
272+
session = requests.Session()
273+
270274
params = dict()
271275
params["query"] = query
272276
params["select"] = "DOI"
@@ -280,13 +284,13 @@ def search_crossref(
280284

281285
with shelve.open(CACHE_FILE) as cache:
282286
if cache_key not in cache:
283-
dois = _fetch_dois(params, timeout, limit)
287+
dois = _fetch_dois(params, timeout, limit, session)
284288
cache[cache_key] = dois
285289
else:
286290
dois = cache[cache_key]
287291

288292
docs = []
289293
for doi in progress_bar(dois):
290-
docs.append(fetch_crossref(doi))
294+
docs.append(fetch_crossref(doi, session=session, timeout=timeout))
291295

292296
return DocumentSet(docs)

litstudy/sources/semanticscholar.py

Lines changed: 42 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -96,48 +96,47 @@ def load(id):
9696
DEFAULT_TIMEOUT = 3.05 # 100 requests per 5 minutes
9797

9898

99-
def request_query(query, offset, limit, cache, timeout=DEFAULT_TIMEOUT):
100-
cache_key = f"results={query};{offset}"
101-
if cache_key in cache:
102-
return cache[cache_key]
103-
104-
url = S2_QUERY_URL
105-
params = dict(offset=offset, query=query, limit=limit)
106-
reply = requests.get(url, params=params)
99+
def request_query(query, offset, limit, cache, session, timeout=DEFAULT_TIMEOUT):
100+
params = urlencode(dict(query=query, offset=offset, limit=limit))
101+
url = f"{S2_QUERY_URL}?{params}"
102+
103+
if url in cache:
104+
return cache[url]
105+
106+
reply = session.get(url)
107107
response = reply.json()
108108

109109
if "data" not in response:
110110
msg = response.get("error") or response.get("message") or "unknown"
111111
raise Exception(f"error while fetching {reply.url}: {msg}")
112112

113-
cache[cache_key] = response
113+
cache[url] = response
114114
return response
115115

116116

117-
def request_paper(key, cache, timeout=DEFAULT_TIMEOUT):
118-
cache_key = urlencode(dict(paper=key))
119-
if cache_key in cache:
120-
return cache[cache_key]
121-
117+
def request_paper(key, cache, session, timeout=DEFAULT_TIMEOUT):
122118
url = S2_PAPER_URL + quote_plus(key)
123119

120+
if url in cache:
121+
return cache[url]
122+
124123
try:
125124
sleep(timeout)
126-
data = requests.get(url).json()
125+
data = session.get(url).json()
127126
except Exception as e:
128127
logging.warning(f"failed to retrieve {key}: {e}")
129128
return None
130129

131-
if "paperId" in data:
132-
cache[cache_key] = data
133-
return data
134-
else:
130+
if "paperId" not in data:
135131
msg = data.get("error") or data.get("message") or "unknown error"
136132
logging.warning(f"failed to retrieve {key}: {msg}")
137133
return None
138134

135+
cache[url] = data
136+
return data
139137

140-
def fetch_semanticscholar(key: set) -> Optional[Document]:
138+
139+
def fetch_semanticscholar(key: set, *, session=None) -> Optional[Document]:
141140
"""Fetch SemanticScholar metadata for the given key. The key can be
142141
one of the following (see `API reference
143142
<https://www.semanticscholar.org/product/api>`_):
@@ -150,63 +149,76 @@ def fetch_semanticscholar(key: set) -> Optional[Document]:
150149
* PubMed ID (example format: `PMID:19872477`)
151150
* Corpus ID (example format: `CorpusID:37220927`)
152151
152+
:param session: The `requests.Session` to use for HTTP requests.
153153
:returns: The `Document` if it was found and `None` otherwise.
154154
"""
155155

156156
if key is None:
157157
return None
158158

159+
if session is None:
160+
session = requests.Session()
161+
159162
with shelve.open(CACHE_FILE) as cache:
160163
if isinstance(key, DocumentIdentifier):
161164
data = None
162165
if data is None and key.s2id:
163-
data = request_paper(key.s2id, cache)
166+
data = request_paper(key.s2id, cache, session)
164167

165168
if data is None and key.doi:
166-
data = request_paper(key.doi, cache)
169+
data = request_paper(key.doi, cache, session)
167170

168171
if data is None and key.pubmed:
169-
data = request_paper(f"PMID:{key.pubmed}", cache)
172+
data = request_paper(f"PMID:{key.pubmed}", cache, session)
170173

171174
if data is None and key.arxivid:
172-
data = request_paper(f"arXiv:{key.arxivid}", cache)
175+
data = request_paper(f"arXiv:{key.arxivid}", cache, session)
173176
else:
174-
data = request_paper(key, cache)
177+
data = request_paper(key, cache, session)
175178

176179
if data is None:
177180
return None
178181

179182
return ScholarDocument(data)
180183

181184

182-
def refine_semanticscholar(docs: DocumentSet) -> Tuple[DocumentSet, DocumentSet]:
185+
def refine_semanticscholar(docs: DocumentSet, *, session=None) -> Tuple[DocumentSet, DocumentSet]:
183186
"""Attempt to fetch SemanticScholar metadata for each document in the
184187
given set based on their DOIs. Returns a tuple containing two sets: the
185188
documents available on SemanticScholar and the remaining documents that
186189
were not found or do not have a DOI.
190+
191+
:param session: The `requests.Session` to use for HTTP requests.
192+
:returns: The documents available on SemanticScholar and the remaining documents.
187193
"""
188194

189195
def callback(doc):
190196
if isinstance(doc, ScholarDocument):
191197
return doc
192198

193-
return fetch_semanticscholar(doc.id)
199+
return fetch_semanticscholar(doc.id, session=session)
194200

195201
return docs._refine_docs(callback)
196202

197203

198-
def search_semanticscholar(query: str, *, limit: int = None, batch_size: int = 100) -> DocumentSet:
204+
def search_semanticscholar(
205+
query: str, *, limit: int = None, batch_size: int = 100, session=None
206+
) -> DocumentSet:
199207
"""Submit the given query to SemanticScholar API and return the results
200208
as a `DocumentSet`.
201209
202210
:param query: The search query to submit.
203211
:param limit: The maximum number of results to return.
204212
:param batch_size: The number of results to retrieve per request. Must be at most 100.
213+
:param session: The `requests.Session` to use for HTTP requests.
205214
"""
206215

207216
if not query:
208217
raise Exception("no query specified in `search_semanticscholar`")
209218

219+
if session is None:
220+
session = requests.Session()
221+
210222
docs = []
211223

212224
with shelve.open(CACHE_FILE) as cache:
@@ -215,7 +227,7 @@ def search_semanticscholar(query: str, *, limit: int = None, batch_size: int = 1
215227
while True:
216228
offset = len(paper_ids)
217229

218-
response = request_query(query, offset, batch_size, cache)
230+
response = request_query(query, offset, batch_size, cache, session)
219231
if not response:
220232
break
221233

@@ -235,7 +247,7 @@ def search_semanticscholar(query: str, *, limit: int = None, batch_size: int = 1
235247
break
236248

237249
for paper_id in progress_bar(paper_ids):
238-
doc = request_paper(paper_id, cache)
250+
doc = request_paper(paper_id, cache, session)
239251

240252
if doc:
241253
docs.append(ScholarDocument(doc))

tests/common.py

Lines changed: 58 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,58 @@
1+
import json
2+
import sys
3+
import re
4+
import os
5+
import requests
6+
import pickle
7+
import hashlib
8+
9+
10+
class MockResponse:
    """Minimal stand-in for a ``requests.Response`` backed by recorded data.

    ``data`` is a mapping that holds at least the keys ``"status_code"``
    and ``"content"`` (the raw response body).
    """

    def __init__(self, data):
        # Keep the raw recorded payload; the accessors below read it lazily.
        self.data = data

    @property
    def status_code(self):
        """HTTP status code of the recorded response."""
        return self.data["status_code"]

    @property
    def content(self):
        """Raw body of the recorded response."""
        return self.data["content"]

    def json(self):
        """Decode the recorded body as JSON and return the result."""
        return json.loads(self.data["content"])
24+
25+
26+
class MockSession:
    """Drop-in replacement for ``requests.Session`` that replays recorded responses.

    Responses are stored on disk as pickle files, keyed by the SHA-1 hash of
    the requested URL. When a URL has no recording, a real HTTP request is
    performed and recorded only if ``allow_requests`` is enabled; otherwise a
    ``KeyError`` is raised so tests fail loudly instead of silently hitting
    the network.
    """

    def __init__(self, directory=None, allow_requests=None):
        """
        :param directory: Directory that holds the recorded responses.
            Defaults to the ``requests/`` directory next to this file.
        :param allow_requests: Whether unrecorded URLs may be fetched over
            the network. Defaults to the ``LITSTUDY_ALLOW_REQUESTS``
            environment variable; ``""``, ``"0"``, ``"false"`` and ``"no"``
            (case-insensitive) count as disabled.
        """
        if directory is None:
            directory = os.path.join(
                os.path.dirname(os.path.realpath(__file__)), "requests"
            )

        if allow_requests is None:
            # Bug fix: the previous `bool(os.environ.get(...))` treated
            # "0"/"false" as *enabled*, since any non-empty string is truthy.
            # Parse the common falsy spellings explicitly instead.
            value = os.environ.get("LITSTUDY_ALLOW_REQUESTS", "")
            allow_requests = value.strip().lower() not in ("", "0", "false", "no")

        self.directory = directory
        self.allow_requests = allow_requests

    def _clean_url(self, url):
        # Hash the URL so it maps to a safe, fixed-length file name.
        return hashlib.sha1(url.encode("utf8")).hexdigest()

    def get(self, url):
        """Return the recorded ``MockResponse`` for ``url``.

        On a cache miss the URL is fetched and recorded when
        ``allow_requests`` is set; otherwise a ``KeyError`` is raised.
        """
        filename = os.path.join(self.directory, self._clean_url(url) + ".pickle")

        if not os.path.exists(filename):
            if not self.allow_requests:
                raise KeyError(f"URL not registered with MockSession: {url}")

            response = requests.get(url)
            data = dict(
                url=url,
                status_code=response.status_code,
                content=response.content,
            )

            # Create the recording directory on first use so a fresh checkout
            # can record new responses without manual setup.
            os.makedirs(self.directory, exist_ok=True)
            with open(filename, "wb") as f:
                f.write(pickle.dumps(data))

        with open(filename, "rb") as f:
            return MockResponse(pickle.loads(f.read()))
13.9 KB
Binary file not shown.
347 KB
Binary file not shown.
893 Bytes
Binary file not shown.
Binary file not shown.
254 KB
Binary file not shown.
11.9 KB
Binary file not shown.
12.8 KB
Binary file not shown.

0 commit comments

Comments
 (0)