Skip to content

Commit c85da00

Browse files
rmitschadrianeboyd
andauthored
Add functionality for gzipped .jsonl reading and writing (#84)
* Add functionality for gzipped .jsonl reading and writing. * Remove srsly import. * Skip cloudpickle test test_builtin_classmethod(). * Fix docstring comment for test_read_jsonl_gzip(). * Update srsly/tests/cloudpickle/cloudpickle_test.py * Format. --------- Co-authored-by: Adriane Boyd <[email protected]>
1 parent 3cc7377 commit c85da00

File tree

4 files changed

+155
-29
lines changed

4 files changed

+155
-29
lines changed

README.md

Lines changed: 52 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -136,11 +136,11 @@ data = {"foo": "bar", "baz": 123}
136136
srsly.write_json("/path/to/file.json", data)
137137
```
138138

139-
| Argument | Type | Description |
140-
| ---------- | ------------ | ------------------------------------------------------ |
141-
| `path` | str / `Path` | The file path or `"-"` to write to stdout. |
142-
| `data` | - | The JSON-serializable data to output. |
143-
| `indent` | int | Number of spaces used to indent JSON. Defaults to `2`. |
139+
| Argument | Type | Description |
140+
| -------- | ------------ | ------------------------------------------------------ |
141+
| `path` | str / `Path` | The file path or `"-"` to write to stdout. |
142+
| `data` | - | The JSON-serializable data to output. |
143+
| `indent` | int | Number of spaces used to indent JSON. Defaults to `2`. |
144144

145145
#### <kbd>function</kbd> `srsly.read_json`
146146

@@ -152,7 +152,7 @@ data = srsly.read_json("/path/to/file.json")
152152

153153
| Argument | Type | Description |
154154
| ----------- | ------------ | ------------------------------------------ |
155-
| `path` | str / `Path` | The file path or `"-"` to read from stdin. |
155+
| `path` | str / `Path` | The file path or `"-"` to read from stdin. |
156156
| **RETURNS** | dict / list | The loaded JSON content. |
157157

158158
#### <kbd>function</kbd> `srsly.write_gzip_json`
@@ -164,11 +164,27 @@ data = {"foo": "bar", "baz": 123}
164164
srsly.write_gzip_json("/path/to/file.json.gz", data)
165165
```
166166

167-
| Argument | Type | Description |
168-
| ---------- | ------------ | ------------------------------------------------------ |
169-
| `path` | str / `Path` | The file path. |
170-
| `data` | - | The JSON-serializable data to output. |
171-
| `indent` | int | Number of spaces used to indent JSON. Defaults to `2`. |
167+
| Argument | Type | Description |
168+
| -------- | ------------ | ------------------------------------------------------ |
169+
| `path` | str / `Path` | The file path. |
170+
| `data` | - | The JSON-serializable data to output. |
171+
| `indent` | int | Number of spaces used to indent JSON. Defaults to `2`. |
172+
173+
#### <kbd>function</kbd> `srsly.write_gzip_jsonl`
174+
175+
Create a gzipped JSONL file and dump contents.
176+
177+
```python
178+
data = [{"foo": "bar"}, {"baz": 123}]
179+
srsly.write_gzip_json("/path/to/file.jsonl.gz", data)
180+
```
181+
182+
| Argument | Type | Description |
183+
| ----------------- | ------------ | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
184+
| `path` | str / `Path` | The file path. |
185+
| `lines` | - | The JSON-serializable contents of each line. |
186+
| `append` | bool | Whether or not to append to the location. Appending to .gz files is generally not recommended, as it doesn't allow the algorithm to take advantage of all data when compressing - files may hence be poorly compressed. |
187+
| `append_new_line` | bool | Whether or not to write a new line before appending to the file. |
172188

173189
#### <kbd>function</kbd> `srsly.read_gzip_json`
174190

@@ -180,9 +196,22 @@ data = srsly.read_gzip_json("/path/to/file.json.gz")
180196

181197
| Argument | Type | Description |
182198
| ----------- | ------------ | ------------------------ |
183-
| `path` | str / `Path` | The file path. |
199+
| `path` | str / `Path` | The file path. |
184200
| **RETURNS** | dict / list | The loaded JSON content. |
185201

202+
#### <kbd>function</kbd> `srsly.read_gzip_jsonl`
203+
204+
Load gzipped JSONL from a file.
205+
206+
```python
207+
data = srsly.read_gzip_jsonl("/path/to/file.jsonl.gz")
208+
```
209+
210+
| Argument | Type | Description |
211+
| ----------- | ------------ | ------------------------- |
212+
| `path` | str / `Path` | The file path. |
213+
| **RETURNS** | dict / list | The loaded JSONL content. |
214+
186215
#### <kbd>function</kbd> `srsly.write_jsonl`
187216

188217
Create a JSONL file (newline-delimited JSON) and dump contents line by line, or
@@ -195,7 +224,7 @@ srsly.write_jsonl("/path/to/file.jsonl", data)
195224

196225
| Argument | Type | Description |
197226
| ----------------- | ------------ | ---------------------------------------------------------------------------------------------------------------------- |
198-
| `path` | str / `Path` | The file path or `"-"` to write to stdout. |
227+
| `path` | str / `Path` | The file path or `"-"` to write to stdout. |
199228
| `lines` | iterable | The JSON-serializable lines. |
200229
| `append` | bool | Append to an existing file. Will open it in `"a"` mode and insert a newline before writing lines. Defaults to `False`. |
201230
| `append_new_line` | bool | Defines whether a new line should first be written when appending to an existing file. Defaults to `True`. |
@@ -211,7 +240,7 @@ data = srsly.read_jsonl("/path/to/file.jsonl")
211240

212241
| Argument | Type | Description |
213242
| ---------- | ---------- | -------------------------------------------------------------------- |
214-
| `path` | str / Path | The file path or `"-"` to read from stdin. |
243+
| `path` | str / Path | The file path or `"-"` to read from stdin. |
215244
| `skip` | bool | Skip broken lines and don't raise `ValueError`. Defaults to `False`. |
216245
| **YIELDS** | - | The loaded JSON contents of each line. |
217246

@@ -272,10 +301,10 @@ data = {"foo": "bar", "baz": 123}
272301
srsly.write_msgpack("/path/to/file.msg", data)
273302
```
274303

275-
| Argument | Type | Description |
276-
| ---------- | ------------ | ---------------------- |
277-
| `path` | str / `Path` | The file path. |
278-
| `data` | - | The data to serialize. |
304+
| Argument | Type | Description |
305+
| -------- | ------------ | ---------------------- |
306+
| `path` | str / `Path` | The file path. |
307+
| `data` | - | The data to serialize. |
279308

280309
#### <kbd>function</kbd> `srsly.read_msgpack`
281310

@@ -287,7 +316,7 @@ data = srsly.read_msgpack("/path/to/file.msg")
287316

288317
| Argument | Type | Description |
289318
| ----------- | ------------ | --------------------------------------------------------------------------------------- |
290-
| `path` | str / `Path` | The file path. |
319+
| `path` | str / `Path` | The file path. |
291320
| `use_list` | bool | Don't use tuples instead of lists. Can make deserialization slower. Defaults to `True`. |
292321
| **RETURNS** | - | The loaded and deserialized content. |
293322

@@ -343,7 +372,7 @@ yaml_string = srsly.yaml_dumps(data)
343372
| ----------------- | ---- | ------------------------------------------ |
344373
| `data` | - | The JSON-serializable data to output. |
345374
| `indent_mapping` | int | Mapping indentation. Defaults to `2`. |
346-
| `indent_sequence` | int | Sequence indentation. Defaults to `4`. |
375+
| `indent_sequence` | int | Sequence indentation. Defaults to `4`. |
347376
| `indent_offset` | int | Indentation offset. Defaults to `2`. |
348377
| `sort_keys` | bool | Sort dictionary keys. Defaults to `False`. |
349378
| **RETURNS** | str | The serialized string. |
@@ -373,10 +402,10 @@ srsly.write_yaml("/path/to/file.yml", data)
373402

374403
| Argument | Type | Description |
375404
| ----------------- | ------------ | ------------------------------------------ |
376-
| `path` | str / `Path` | The file path or `"-"` to write to stdout. |
405+
| `path` | str / `Path` | The file path or `"-"` to write to stdout. |
377406
| `data` | - | The JSON-serializable data to output. |
378407
| `indent_mapping` | int | Mapping indentation. Defaults to `2`. |
379-
| `indent_sequence` | int | Sequence indentation. Defaults to `4`. |
408+
| `indent_sequence` | int | Sequence indentation. Defaults to `4`. |
380409
| `indent_offset` | int | Indentation offset. Defaults to `2`. |
381410
| `sort_keys` | bool | Sort dictionary keys. Defaults to `False`. |
382411

@@ -390,7 +419,7 @@ data = srsly.read_yaml("/path/to/file.yml")
390419

391420
| Argument | Type | Description |
392421
| ----------- | ------------ | ------------------------------------------ |
393-
| `path` | str / `Path` | The file path or `"-"` to read from stdin. |
422+
| `path` | str / `Path` | The file path or `"-"` to read from stdin. |
394423
| **RETURNS** | dict / list | The loaded YAML content. |
395424

396425
#### <kbd>function</kbd> `srsly.is_yaml_serializable`

srsly/_json_api.py

Lines changed: 40 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
from typing import Union, Iterable, Sequence, Any, Optional
1+
from typing import Union, Iterable, Sequence, Any, Optional, Iterator
22
import sys
33
import json as _builtin_json
44
import gzip
@@ -56,14 +56,27 @@ def read_json(path: FilePath) -> JSONOutput:
5656
def read_gzip_json(path: FilePath) -> JSONOutput:
5757
"""Load JSON from a gzipped file.
5858
59-
location (FilePath): The file path.
60-
RETURNS (JSONOutput): The loaded JSON content.
59+
location (FilePath): The file path.
60+
RETURNS (JSONOutput): The loaded JSON content.
6161
"""
6262
file_path = force_string(path)
6363
with gzip.open(file_path, "r") as f:
6464
return ujson.load(f)
6565

6666

67+
def read_gzip_jsonl(path: FilePath, skip: bool = False) -> Iterator[JSONOutput]:
68+
"""Read a gzipped .jsonl file and yield contents line by line.
69+
Blank lines will always be skipped.
70+
71+
path (FilePath): The file path.
72+
skip (bool): Skip broken lines and don't raise ValueError.
73+
YIELDS (JSONOutput): The unpacked, deserialized Python objects.
74+
"""
75+
with gzip.open(force_path(path), "r") as f:
76+
for line in _yield_json_lines(f, skip=skip):
77+
yield line
78+
79+
6780
def write_json(path: FilePath, data: JSONInput, indent: int = 2) -> None:
6881
"""Create a .json file and dump contents or write to standard
6982
output.
@@ -94,6 +107,30 @@ def write_gzip_json(path: FilePath, data: JSONInput, indent: int = 2) -> None:
94107
f.write(json_data.encode("utf-8"))
95108

96109

110+
def write_gzip_jsonl(
111+
path: FilePath,
112+
lines: Iterable[JSONInput],
113+
append: bool = False,
114+
append_new_line: bool = True,
115+
) -> None:
116+
"""Create a .jsonl.gz file and dump contents.
117+
118+
location (FilePath): The file path.
119+
lines (Sequence[JSONInput]): The JSON-serializable contents of each line.
120+
append (bool): Whether or not to append to the location. Appending to .gz files is generally not recommended, as it
121+
doesn't allow the algorithm to take advantage of all data when compressing - files may hence be poorly
122+
compressed.
123+
append_new_line (bool): Whether or not to write a new line before appending
124+
to the file.
125+
"""
126+
mode = "a" if append else "w"
127+
file_path = force_path(path, require_exists=False)
128+
with gzip.open(file_path, mode=mode) as f:
129+
if append and append_new_line:
130+
f.write("\n".encode("utf-8"))
131+
f.writelines([(json_dumps(line) + "\n").encode("utf-8") for line in lines])
132+
133+
97134
def read_jsonl(path: FilePath, skip: bool = False) -> Iterable[JSONOutput]:
98135
"""Read a .jsonl file or standard input and yield contents line by line.
99136
Blank lines will always be skipped.

srsly/tests/cloudpickle/cloudpickle_test.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -872,8 +872,10 @@ def test_builtin_classicmethod(self):
872872
@pytest.mark.skipif(
873873
(platform.machine() == "aarch64" and sys.version_info[:2] >= (3, 10))
874874
or platform.python_implementation() == "PyPy"
875-
or (sys.version_info[:2] == (3, 10) and sys.version_info >= (3, 10, 8)),
876-
reason="Fails on aarch64 + python 3.10+ in cibuildwheel, currently unable to replicate failure elsewhere; fails sometimes for pypy on conda-forge; fails for python 3.10.8")
875+
or (sys.version_info[:2] == (3, 10) and sys.version_info >= (3, 10, 8))
876+
# Skipping tests on 3.11 due to https://github.com/cloudpipe/cloudpickle/pull/486.
877+
or sys.version_info[:2] == (3, 11),
878+
reason="Fails on aarch64 + python 3.10+ in cibuildwheel, currently unable to replicate failure elsewhere; fails sometimes for pypy on conda-forge; fails for python 3.10.8+ and 3.11")
877879
def test_builtin_classmethod(self):
878880
obj = 1.5 # float object
879881

srsly/tests/test_json_api.py

Lines changed: 59 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,14 @@
44
import gzip
55
import numpy
66

7-
from .._json_api import read_json, write_json, read_jsonl, write_jsonl
7+
from .._json_api import (
8+
read_json,
9+
write_json,
10+
read_jsonl,
11+
write_jsonl,
12+
read_gzip_jsonl,
13+
write_gzip_jsonl,
14+
)
815
from .._json_api import write_gzip_json, json_dumps, is_json_serializable
916
from .._json_api import json_loads
1017
from ..util import force_string
@@ -204,3 +211,54 @@ def test_unsupported_type_error():
204211
f = numpy.float32()
205212
with pytest.raises(TypeError):
206213
s = json_dumps(f)
214+
215+
216+
def test_write_jsonl_gzip():
217+
"""Tests writing data to a gzipped .jsonl file."""
218+
data = [{"hello": "world"}, {"test": 123}]
219+
expected = ['{"hello":"world"}\n', '{"test":123}\n']
220+
221+
with make_tempdir() as temp_dir:
222+
file_path = temp_dir / "tmp.json"
223+
write_gzip_jsonl(file_path, data)
224+
with gzip.open(file_path, "r") as f:
225+
assert [line.decode("utf8") for line in f.readlines()] == expected
226+
227+
228+
def test_write_jsonl_gzip_append():
229+
"""Tests appending data to a gzipped .jsonl file."""
230+
data = [{"hello": "world"}, {"test": 123}]
231+
expected = [
232+
'{"hello":"world"}\n',
233+
'{"test":123}\n',
234+
"\n",
235+
'{"hello":"world"}\n',
236+
'{"test":123}\n',
237+
]
238+
with make_tempdir() as temp_dir:
239+
file_path = temp_dir / "tmp.json"
240+
write_gzip_jsonl(file_path, data)
241+
write_gzip_jsonl(file_path, data, append=True)
242+
with gzip.open(file_path, "r") as f:
243+
assert [line.decode("utf8") for line in f.readlines()] == expected
244+
245+
246+
def test_read_jsonl_gzip():
247+
"""Tests reading data from a gzipped .jsonl file."""
248+
file_contents = [{"hello": "world"}, {"test": 123}]
249+
with make_tempdir() as temp_dir:
250+
file_path = temp_dir / "tmp.json"
251+
with gzip.open(file_path, "w") as f:
252+
f.writelines(
253+
[(json_dumps(line) + "\n").encode("utf-8") for line in file_contents]
254+
)
255+
assert file_path.exists()
256+
data = read_gzip_jsonl(file_path)
257+
# Make sure this returns a generator, not just a list
258+
assert not hasattr(data, "__len__")
259+
data = list(data)
260+
assert len(data) == 2
261+
assert len(data[0]) == 1
262+
assert len(data[1]) == 1
263+
assert data[0]["hello"] == "world"
264+
assert data[1]["test"] == 123

0 commit comments

Comments
 (0)