Skip to content
Draft
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
80 changes: 79 additions & 1 deletion srsly/_json_api.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from typing import Union, Iterable, Sequence, Any, Optional, Iterator
from typing import Any, Iterable, Dict, List, Optional, Iterator, Union, Type, cast
import sys
import json as _builtin_json
import gzip
Expand Down Expand Up @@ -39,6 +39,32 @@ def json_loads(data: Union[str, bytes]) -> JSONOutput:
return ujson.loads(data)


def json_loads_dict(data: Union[str, bytes]) -> Dict[str, Any]:
    """Deserialize unicode or bytes to a Python dict.

    data (str / bytes): The data to deserialize.
    RAISES: ValueError if the loaded data is not a dict.
    RETURNS (Dict[str, Any]): The deserialized Python dict.
    """
    parsed = json_loads(data)
    # Guard clause: anything other than a JSON object is rejected outright.
    if isinstance(parsed, dict):
        return parsed
    raise ValueError("JSON data could not be parsed to a dict.")


def json_loads_list(data: Union[str, bytes]) -> List[Dict[str, Any]]:
    """Deserialize unicode or bytes to a Python list of dicts.

    Only the outer container is validated: a ValueError is raised if the
    parsed value is not a list, but the individual items are NOT checked
    to be dicts (this mirrors read_json_list without validate_inner).

    data (str / bytes): The data to deserialize.
    RAISES: ValueError if the loaded data is not a list.
    RETURNS (List[Dict[str, Any]]): The deserialized Python list.
    """
    loaded = json_loads(data)
    if not isinstance(loaded, list):
        raise ValueError("JSON data could not be parsed to a list of dicts.")
    # NOTE(review): inner items are not validated, so the declared element
    # type is an assumption by the caller; cast() records that explicitly
    # instead of letting the checker silently accept List[Any].
    return cast(List[Dict[str, Any]], loaded)


def read_json(path: FilePath) -> JSONOutput:
"""Load JSON from file or standard input.

Expand All @@ -53,6 +79,42 @@ def read_json(path: FilePath) -> JSONOutput:
return ujson.load(f)


def read_json_dict(path: FilePath) -> Dict[str, Any]:
    """Load JSON from file or standard input, expecting a JSON object.

    path (FilePath): The file path. "-" for reading from stdin.
    RAISES: ValueError if the loaded data is not a dict.
    RETURNS (Dict[str, Any]): The loaded JSON object.
    """
    data = read_json(path)
    if not isinstance(data, dict):
        # The input parses as JSON; it is the top-level *value* that has the
        # wrong type, so the message avoids calling the JSON itself invalid
        # (per review feedback).
        raise ValueError("Invalid value, JSON data could not be parsed to a dict.")
    return data


def read_json_list(
    path: FilePath, validate_inner: bool = False, skip_invalid: bool = False
) -> List[Dict[str, Any]]:
    """Load JSON from file or standard input, expecting a list of objects.

    path (FilePath): The file path. "-" for reading from stdin.
    validate_inner (bool): If True, check that every item in the list is a dict.
    skip_invalid (bool): With validate_inner, drop non-dict items instead of
        raising. Has no effect when validate_inner is False.
    RAISES: ValueError if the data is not a list, or (when validate_inner is
        True and skip_invalid is False) if any item is not a dict.
    RETURNS (List[Dict[str, Any]]): The loaded list of JSON objects.
    """
    data = read_json(path)
    if not isinstance(data, list):
        raise ValueError("Invalid JSON, data could not be parsed to a list of dicts.")
    if not validate_inner:
        return data
    output: List[Dict[str, Any]] = []
    for i, obj in enumerate(data):
        if not isinstance(obj, dict):
            if skip_invalid:
                continue
            raise ValueError(f"Invalid JSON Object at index: {i + 1}. Value is not a valid dict.")
        output.append(obj)
    # BUG FIX: previously returned `data` here, which discarded the filtering
    # performed above and made validate_inner/skip_invalid a no-op on the
    # returned value.
    return output


def read_gzip_json(path: FilePath) -> JSONOutput:
"""Load JSON from a gzipped file.

Expand Down Expand Up @@ -149,6 +211,22 @@ def read_jsonl(path: FilePath, skip: bool = False) -> Iterable[JSONOutput]:
yield line


def read_jsonl_dicts(path: FilePath, skip: bool = False) -> Iterable[Dict[str, Any]]:
    """Read a .jsonl file or standard input and yield contents line by line.
    Blank lines will always be skipped. Validates the contents of each line
    is a dict.

    path (FilePath): The file path. "-" for reading from stdin.
    skip (bool): Skip broken lines and don't raise ValueError.
    YIELDS (Dict[str, Any]): The loaded JSON dict of each line.
    RAISES: ValueError if a line is valid JSON but not a dict (unless skip).
    """
    for i, line in enumerate(read_jsonl(path, skip=skip)):
        if not isinstance(line, dict):
            if skip:
                # skip=True suppresses non-dict values here, mirroring how
                # read_jsonl(skip=True) suppresses unparseable lines.
                continue
            # NOTE(review): i counts the values yielded by read_jsonl (blank
            # and skipped lines excluded), so the reported number may not
            # match the physical line in the file — confirm this is the
            # intended error-reporting behavior.
            raise ValueError(f"Invalid JSON Object on line: {i + 1}. Line is not a valid dict.")
        yield line


def write_jsonl(
path: FilePath,
lines: Iterable[JSONInput],
Expand Down
42 changes: 42 additions & 0 deletions srsly/tests/test_json_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,13 @@
import gzip
import numpy

from typing import Any, Dict, List, Union

from .._json_api import (
read_json,
read_json_dict,
read_json_list,
read_jsonl_dicts,
write_json,
read_jsonl,
write_jsonl,
Expand Down Expand Up @@ -262,3 +267,40 @@ def test_read_jsonl_gzip():
assert len(data[1]) == 1
assert data[0]["hello"] == "world"
assert data[1]["test"] == 123


# Cases for a read_json_dict test; currently empty.
# TODO(review): populate with dict / non-dict fixtures, or remove if unused.
READ_JSON_DICT_TEST_CASES = {

}


# Maps a test id to (raw file contents, expected parse result or expected error).
# A ValueError instance as the expected value means consuming the generator
# should raise; a list means those dicts should be yielded in order.
READ_JSONL_DICT_TEST_CASES = {
    "invalid_str": ('"test"', ValueError()),
    "invalid_num": ('-32', ValueError()),
    "invalid_json_list": ('[{"hello": "world"}\n{"test": 123}]', ValueError()),
    "valid_dicts": ('{"hello": "world"}\n{"test": 123}', [{"hello": "world"}, {"test": 123}]),
}

@pytest.mark.parametrize(
    "file_contents, expected",
    READ_JSONL_DICT_TEST_CASES.values(),
    ids=READ_JSONL_DICT_TEST_CASES.keys(),
)
def test_read_jsonl_dicts(file_contents: str, expected: Union[List[Dict[str, Any]], ValueError]):
    """read_jsonl_dicts yields lazily and validates each line is a dict."""
    with make_tempdir({"tmp.json": file_contents}) as temp_dir:
        file_path = temp_dir / "tmp.json"
        assert file_path.exists()
        stream = read_jsonl_dicts(file_path)
        # Generators have no __len__ -- verify we got a lazy stream, not a list.
        assert not hasattr(stream, "__len__")
        try:
            # Validation only fires when the generator is actually consumed.
            items = list(stream)
        except ValueError:
            assert isinstance(expected, ValueError)
        else:
            assert isinstance(expected, list)
            # List equality covers both the length and per-item comparisons.
            assert items == expected