Add functionality for gzipped .jsonl reading and writing (#84)

rmitsch · adrianeboyd · web-flow · commit c85da00d24bd · 2023-02-17T09:41:16.000+01:00
* Add functionality for gzipped .jsonl reading and writing.

* Remove srsly import.

* Skip cloudpickle test test_builtin_classmethod().

* Fix docstring comment for test_read_jsonl_gzip().

* Update srsly/tests/cloudpickle/cloudpickle_test.py

* Format.

---------

Co-authored-by: Adriane Boyd &lt;adrianeboyd@gmail.com&gt;
diff --git a/README.md b/README.md
@@ -136,11 +136,11 @@ data = {"foo": "bar", "baz": 123}
 srsly.write_json("/path/to/file.json", data)
 ```
 
-| Argument   | Type         | Description                                            |
-| ---------- | ------------ | ------------------------------------------------------ |
-| `path` | str / `Path` | The file path or `"-"` to write to stdout.             |
-| `data`     | -            | The JSON-serializable data to output.                  |
-| `indent`   | int          | Number of spaces used to indent JSON. Defaults to `2`. |
+| Argument | Type         | Description                                            |
+| -------- | ------------ | ------------------------------------------------------ |
+| `path`   | str / `Path` | The file path or `"-"` to write to stdout.             |
+| `data`   | -            | The JSON-serializable data to output.                  |
+| `indent` | int          | Number of spaces used to indent JSON. Defaults to `2`. |
 
 #### <kbd>function</kbd> `srsly.read_json`
 
@@ -152,7 +152,7 @@ data = srsly.read_json("/path/to/file.json")
 
 | Argument    | Type         | Description                                |
 | ----------- | ------------ | ------------------------------------------ |
-| `path`  | str / `Path` | The file path or `"-"` to read from stdin. |
+| `path`      | str / `Path` | The file path or `"-"` to read from stdin. |
 | **RETURNS** | dict / list  | The loaded JSON content.                   |
 
 #### <kbd>function</kbd> `srsly.write_gzip_json`
@@ -164,11 +164,27 @@ data = {"foo": "bar", "baz": 123}
 srsly.write_gzip_json("/path/to/file.json.gz", data)
 ```
 
-| Argument   | Type         | Description                                            |
-| ---------- | ------------ | ------------------------------------------------------ |
-| `path` | str / `Path` | The file path.                                         |
-| `data`     | -            | The JSON-serializable data to output.                  |
-| `indent`   | int          | Number of spaces used to indent JSON. Defaults to `2`. |
+| Argument | Type         | Description                                            |
+| -------- | ------------ | ------------------------------------------------------ |
+| `path`   | str / `Path` | The file path.                                         |
+| `data`   | -            | The JSON-serializable data to output.                  |
+| `indent` | int          | Number of spaces used to indent JSON. Defaults to `2`. |
+
+#### <kbd>function</kbd> `srsly.write_gzip_jsonl`
+
+Create a gzipped JSONL file and dump contents.
+
+```python
+data = [{"foo": "bar"}, {"baz": 123}]
+srsly.write_gzip_jsonl("/path/to/file.jsonl.gz", data)
+```
+
+| Argument          | Type         | Description                                                                                                                                                                                                             |
+| ----------------- | ------------ | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `path`            | str / `Path` | The file path.                                                                                                                                                                                                          |
+| `lines`           | -            | The JSON-serializable contents of each line.                                                                                                                                                                            |
+| `append`          | bool         | Whether or not to append to the location. Appending to .gz files is generally not recommended, as it doesn't allow the algorithm to take advantage of all data when compressing - files may hence be poorly compressed. |
+| `append_new_line` | bool         | Whether or not to write a new line before appending to the file.                                                                                                                                                        |
 
 #### <kbd>function</kbd> `srsly.read_gzip_json`
 
@@ -180,9 +196,22 @@ data = srsly.read_gzip_json("/path/to/file.json.gz")
 
 | Argument    | Type         | Description              |
 | ----------- | ------------ | ------------------------ |
-| `path`  | str / `Path` | The file path.           |
+| `path`      | str / `Path` | The file path.           |
 | **RETURNS** | dict / list  | The loaded JSON content. |
 
+#### <kbd>function</kbd> `srsly.read_gzip_jsonl`
+
+Load gzipped JSONL from a file.
+
+```python
+data = srsly.read_gzip_jsonl("/path/to/file.jsonl.gz")
+```
+
+| Argument    | Type         | Description               |
+| ----------- | ------------ | ------------------------- |
+| `path`      | str / `Path` | The file path.            |
+| **YIELDS**  | dict / list  | The content of each line. |
+
 #### <kbd>function</kbd> `srsly.write_jsonl`
 
 Create a JSONL file (newline-delimited JSON) and dump contents line by line, or
@@ -195,7 +224,7 @@ srsly.write_jsonl("/path/to/file.jsonl", data)
 
 | Argument          | Type         | Description                                                                                                            |
 | ----------------- | ------------ | ---------------------------------------------------------------------------------------------------------------------- |
-| `path`        | str / `Path` | The file path or `"-"` to write to stdout.                                                                             |
+| `path`            | str / `Path` | The file path or `"-"` to write to stdout.                                                                             |
 | `lines`           | iterable     | The JSON-serializable lines.                                                                                           |
 | `append`          | bool         | Append to an existing file. Will open it in `"a"` mode and insert a newline before writing lines. Defaults to `False`. |
 | `append_new_line` | bool         | Defines whether a new line should first be written when appending to an existing file. Defaults to `True`.             |
@@ -211,7 +240,7 @@ data = srsly.read_jsonl("/path/to/file.jsonl")
 
 | Argument   | Type       | Description                                                          |
 | ---------- | ---------- | -------------------------------------------------------------------- |
-| `path` | str / Path | The file path or `"-"` to read from stdin.                           |
+| `path`     | str / Path | The file path or `"-"` to read from stdin.                           |
 | `skip`     | bool       | Skip broken lines and don't raise `ValueError`. Defaults to `False`. |
 | **YIELDS** | -          | The loaded JSON contents of each line.                               |
 
@@ -272,10 +301,10 @@ data = {"foo": "bar", "baz": 123}
 srsly.write_msgpack("/path/to/file.msg", data)
 ```
 
-| Argument   | Type         | Description            |
-| ---------- | ------------ | ---------------------- |
-| `path` | str / `Path` | The file path.         |
-| `data`     | -            | The data to serialize. |
+| Argument | Type         | Description            |
+| -------- | ------------ | ---------------------- |
+| `path`   | str / `Path` | The file path.         |
+| `data`   | -            | The data to serialize. |
 
 #### <kbd>function</kbd> `srsly.read_msgpack`
 
@@ -287,7 +316,7 @@ data = srsly.read_msgpack("/path/to/file.msg")
 
 | Argument    | Type         | Description                                                                             |
 | ----------- | ------------ | --------------------------------------------------------------------------------------- |
-| `path`  | str / `Path` | The file path.                                                                          |
+| `path`      | str / `Path` | The file path.                                                                          |
 | `use_list`  | bool         | Don't use tuples instead of lists. Can make deserialization slower. Defaults to `True`. |
 | **RETURNS** | -            | The loaded and deserialized content.                                                    |
 
@@ -343,7 +372,7 @@ yaml_string = srsly.yaml_dumps(data)
 | ----------------- | ---- | ------------------------------------------ |
 | `data`            | -    | The JSON-serializable data to output.      |
 | `indent_mapping`  | int  | Mapping indentation. Defaults to `2`.      |
-| `indent_sequence` | int  | Sequence indentation. Defaults to `4`.      |
+| `indent_sequence` | int  | Sequence indentation. Defaults to `4`.     |
 | `indent_offset`   | int  | Indentation offset. Defaults to `2`.       |
 | `sort_keys`       | bool | Sort dictionary keys. Defaults to `False`. |
 | **RETURNS**       | str  | The serialized string.                     |
@@ -373,10 +402,10 @@ srsly.write_yaml("/path/to/file.yml", data)
 
 | Argument          | Type         | Description                                |
 | ----------------- | ------------ | ------------------------------------------ |
-| `path`        | str / `Path` | The file path or `"-"` to write to stdout. |
+| `path`            | str / `Path` | The file path or `"-"` to write to stdout. |
 | `data`            | -            | The JSON-serializable data to output.      |
 | `indent_mapping`  | int          | Mapping indentation. Defaults to `2`.      |
-| `indent_sequence` | int          | Sequence indentation. Defaults to `4`.      |
+| `indent_sequence` | int          | Sequence indentation. Defaults to `4`.     |
 | `indent_offset`   | int          | Indentation offset. Defaults to `2`.       |
 | `sort_keys`       | bool         | Sort dictionary keys. Defaults to `False`. |
 
@@ -390,7 +419,7 @@ data = srsly.read_yaml("/path/to/file.yml")
 
 | Argument    | Type         | Description                                |
 | ----------- | ------------ | ------------------------------------------ |
-| `path`  | str / `Path` | The file path or `"-"` to read from stdin. |
+| `path`      | str / `Path` | The file path or `"-"` to read from stdin. |
 | **RETURNS** | dict / list  | The loaded YAML content.                   |
 
 #### <kbd>function</kbd> `srsly.is_yaml_serializable`
diff --git a/srsly/_json_api.py b/srsly/_json_api.py
@@ -1,4 +1,4 @@
-from typing import Union, Iterable, Sequence, Any, Optional
+from typing import Union, Iterable, Sequence, Any, Optional, Iterator
 import sys
 import json as _builtin_json
 import gzip
@@ -56,14 +56,27 @@ def read_json(path: FilePath) -> JSONOutput:
 def read_gzip_json(path: FilePath) -> JSONOutput:
     """Load JSON from a gzipped file.
 
-        location (FilePath): The file path.
-        RETURNS (JSONOutput): The loaded JSON content.
+    path (FilePath): The file path.
+    RETURNS (JSONOutput): The loaded JSON content.
     """
     file_path = force_string(path)
     with gzip.open(file_path, "r") as f:
         return ujson.load(f)
 
 
+def read_gzip_jsonl(path: FilePath, skip: bool = False) -> Iterator[JSONOutput]:
+    """Read a gzipped .jsonl file and yield contents line by line.
+    Blank lines will always be skipped.
+
+    path (FilePath): The file path.
+    skip (bool): Skip broken lines and don't raise ValueError.
+    YIELDS (JSONOutput): The unpacked, deserialized Python objects.
+    """
+    with gzip.open(force_path(path), "r") as f:
+        for line in _yield_json_lines(f, skip=skip):
+            yield line
+
+
 def write_json(path: FilePath, data: JSONInput, indent: int = 2) -> None:
     """Create a .json file and dump contents or write to standard
     output.
@@ -94,6 +107,30 @@ def write_gzip_json(path: FilePath, data: JSONInput, indent: int = 2) -> None:
         f.write(json_data.encode("utf-8"))
 
 
+def write_gzip_jsonl(
+    path: FilePath,
+    lines: Iterable[JSONInput],
+    append: bool = False,
+    append_new_line: bool = True,
+) -> None:
+    """Create a .jsonl.gz file and dump contents.
+
+    path (FilePath): The file path.
+    lines (Iterable[JSONInput]): The JSON-serializable contents of each line.
+    append (bool): Whether or not to append to the location. Appending to .gz files is generally not recommended, as it
+        doesn't allow the algorithm to take advantage of all data when compressing - files may hence be poorly
+        compressed.
+    append_new_line (bool): Whether or not to write a new line before appending
+        to the file.
+    """
+    mode = "a" if append else "w"
+    file_path = force_path(path, require_exists=False)
+    with gzip.open(file_path, mode=mode) as f:
+        if append and append_new_line:
+            f.write("\n".encode("utf-8"))
+        f.writelines([(json_dumps(line) + "\n").encode("utf-8") for line in lines])
+
+
 def read_jsonl(path: FilePath, skip: bool = False) -> Iterable[JSONOutput]:
     """Read a .jsonl file or standard input and yield contents line by line.
     Blank lines will always be skipped.
diff --git a/srsly/tests/cloudpickle/cloudpickle_test.py b/srsly/tests/cloudpickle/cloudpickle_test.py
@@ -872,8 +872,10 @@ def test_builtin_classicmethod(self):
     @pytest.mark.skipif(
         (platform.machine() == "aarch64" and sys.version_info[:2] >= (3, 10))
             or platform.python_implementation() == "PyPy"
-            or (sys.version_info[:2] == (3, 10) and sys.version_info >= (3, 10, 8)),
-        reason="Fails on aarch64 + python 3.10+ in cibuildwheel, currently unable to replicate failure elsewhere; fails sometimes for pypy on conda-forge; fails for python 3.10.8")
+            or (sys.version_info[:2] == (3, 10) and sys.version_info >= (3, 10, 8))
+            # Skipping tests on 3.11 due to https://github.com/cloudpipe/cloudpickle/pull/486.
+            or sys.version_info[:2] == (3, 11),
+        reason="Fails on aarch64 + python 3.10+ in cibuildwheel, currently unable to replicate failure elsewhere; fails sometimes for pypy on conda-forge; fails for python 3.10.8+ and 3.11")
     def test_builtin_classmethod(self):
         obj = 1.5  # float object
 
diff --git a/srsly/tests/test_json_api.py b/srsly/tests/test_json_api.py
@@ -4,7 +4,14 @@
 import gzip
 import numpy
 
-from .._json_api import read_json, write_json, read_jsonl, write_jsonl
+from .._json_api import (
+    read_json,
+    write_json,
+    read_jsonl,
+    write_jsonl,
+    read_gzip_jsonl,
+    write_gzip_jsonl,
+)
 from .._json_api import write_gzip_json, json_dumps, is_json_serializable
 from .._json_api import json_loads
 from ..util import force_string
@@ -204,3 +211,54 @@ def test_unsupported_type_error():
     f = numpy.float32()
     with pytest.raises(TypeError):
         s = json_dumps(f)
+
+
+def test_write_jsonl_gzip():
+    """Tests writing data to a gzipped .jsonl file."""
+    data = [{"hello": "world"}, {"test": 123}]
+    expected = ['{"hello":"world"}\n', '{"test":123}\n']
+
+    with make_tempdir() as temp_dir:
+        file_path = temp_dir / "tmp.json"
+        write_gzip_jsonl(file_path, data)
+        with gzip.open(file_path, "r") as f:
+            assert [line.decode("utf8") for line in f.readlines()] == expected
+
+
+def test_write_jsonl_gzip_append():
+    """Tests appending data to a gzipped .jsonl file."""
+    data = [{"hello": "world"}, {"test": 123}]
+    expected = [
+        '{"hello":"world"}\n',
+        '{"test":123}\n',
+        "\n",
+        '{"hello":"world"}\n',
+        '{"test":123}\n',
+    ]
+    with make_tempdir() as temp_dir:
+        file_path = temp_dir / "tmp.json"
+        write_gzip_jsonl(file_path, data)
+        write_gzip_jsonl(file_path, data, append=True)
+        with gzip.open(file_path, "r") as f:
+            assert [line.decode("utf8") for line in f.readlines()] == expected
+
+
+def test_read_jsonl_gzip():
+    """Tests reading data from a gzipped .jsonl file."""
+    file_contents = [{"hello": "world"}, {"test": 123}]
+    with make_tempdir() as temp_dir:
+        file_path = temp_dir / "tmp.json"
+        with gzip.open(file_path, "w") as f:
+            f.writelines(
+                [(json_dumps(line) + "\n").encode("utf-8") for line in file_contents]
+            )
+        assert file_path.exists()
+        data = read_gzip_jsonl(file_path)
+        # Make sure this returns a generator, not just a list
+        assert not hasattr(data, "__len__")
+        data = list(data)
+    assert len(data) == 2
+    assert len(data[0]) == 1
+    assert len(data[1]) == 1
+    assert data[0]["hello"] == "world"
+    assert data[1]["test"] == 123