Skip to content

Commit d94895e

Browse files
ariostas, ianna, henryiii
authored
chore(ci): bump minimal pyarrow version being tested (#3584)
* fix: arrow can handle length-zero arrays now

  Co-authored-by: Ianna Osborne <[email protected]>

* Bump minimal pyarrow version to 14.0.0

  Co-authored-by: Henry Schreiner <[email protected]>

* Try pyarrow 15

* Back to pyarrow 14, but install pandas

---------

Co-authored-by: Ianna Osborne <[email protected]>
Co-authored-by: Henry Schreiner <[email protected]>
1 parent d003b1f commit d94895e

File tree

3 files changed

+48
-85
lines changed

3 files changed

+48
-85
lines changed

requirements-test-minimal.txt

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
fsspec>=2022.11.0;sys_platform != "win32"
22
numpy==1.19.3
3-
pyarrow==7.0.0
3+
pandas==1.1.3
4+
pyarrow==14.0.0
45
pytest>=6
56
pytest-cov
67
pytest-xdist

src/awkward/operations/str/__init__.py

Lines changed: 37 additions & 74 deletions
Original file line numberDiff line numberDiff line change
@@ -236,95 +236,58 @@ def _get_split_action(
236236
bytestring_to_string=False,
237237
**kwargs,
238238
):
239-
from awkward._backends.typetracer import TypeTracerBackend
240-
from awkward.forms import ListOffsetForm, NumpyForm
241-
242-
typetracer = TypeTracerBackend.instance()
243-
244-
# FIXME: this workaround for typetracer is required because
245-
# split_XXX does not support length-zero arrays
246-
# c.f. https://github.com/apache/arrow/issues/37437
247239
def action(layout, **_):
248-
if layout.backend is typetracer:
249-
if layout.is_list and layout.parameter("__array__") == "string":
250-
return (
251-
ListOffsetForm(
252-
"i32",
253-
ListOffsetForm(
254-
layout.form.offsets,
255-
NumpyForm("uint8", parameters={"__array__": "char"}),
240+
if layout.is_list and layout.parameter("__array__") == "string":
241+
return _drop_option_preserving_form(
242+
_apply_through_arrow(
243+
utf8_function,
244+
layout,
245+
*args,
246+
generate_bitmasks=generate_bitmasks,
247+
**kwargs,
248+
)
249+
)
250+
251+
elif layout.is_list and layout.parameter("__array__") == "bytestring":
252+
if bytestring_to_string:
253+
out = _drop_option_preserving_form(
254+
_apply_through_arrow(
255+
ascii_function,
256+
layout.copy(
257+
content=layout.content.copy(
258+
parameters={"__array__": "char"}
259+
),
256260
parameters={"__array__": "string"},
257261
),
262+
*args,
263+
generate_bitmasks=generate_bitmasks,
264+
**kwargs,
258265
)
259-
.length_zero_array()
260-
.to_typetracer(forget_length=True)
261266
)
267+
assert out.is_list
262268

263-
elif layout.is_list and layout.parameter("__array__") == "bytestring":
264-
return (
265-
ListOffsetForm(
266-
"i32",
267-
ListOffsetForm(
268-
layout.form.offsets,
269-
NumpyForm("uint8", parameters={"__array__": "byte"}),
270-
parameters={"__array__": "bytestring"},
269+
assert (
270+
out.content.is_list
271+
and out.content.parameter("__array__") == "string"
272+
)
273+
return out.copy(
274+
content=out.content.copy(
275+
content=out.content.content.copy(
276+
parameters={"__array__": "byte"}
271277
),
272-
)
273-
.length_zero_array()
274-
.to_typetracer(forget_length=True)
278+
parameters={"__array__": "bytestring"},
279+
),
275280
)
276-
else:
277-
if layout.is_list and layout.parameter("__array__") == "string":
281+
282+
else:
278283
return _drop_option_preserving_form(
279284
_apply_through_arrow(
280-
utf8_function,
285+
ascii_function,
281286
layout,
282287
*args,
283288
generate_bitmasks=generate_bitmasks,
284289
**kwargs,
285290
)
286291
)
287292

288-
elif layout.is_list and layout.parameter("__array__") == "bytestring":
289-
if bytestring_to_string:
290-
out = _drop_option_preserving_form(
291-
_apply_through_arrow(
292-
ascii_function,
293-
layout.copy(
294-
content=layout.content.copy(
295-
parameters={"__array__": "char"}
296-
),
297-
parameters={"__array__": "string"},
298-
),
299-
*args,
300-
generate_bitmasks=generate_bitmasks,
301-
**kwargs,
302-
)
303-
)
304-
assert out.is_list
305-
306-
assert (
307-
out.content.is_list
308-
and out.content.parameter("__array__") == "string"
309-
)
310-
return out.copy(
311-
content=out.content.copy(
312-
content=out.content.content.copy(
313-
parameters={"__array__": "byte"}
314-
),
315-
parameters={"__array__": "bytestring"},
316-
),
317-
)
318-
319-
else:
320-
return _drop_option_preserving_form(
321-
_apply_through_arrow(
322-
ascii_function,
323-
layout,
324-
*args,
325-
generate_bitmasks=generate_bitmasks,
326-
**kwargs,
327-
)
328-
)
329-
330293
return action

tests/test_2616_use_pyarrow_for_strings.py

Lines changed: 9 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -881,16 +881,15 @@ def test_slice():
881881
== ak.str.slice(ak.to_backend(bytestring, "typetracer"), 1, 3).layout.form
882882
)
883883

884-
# ArrowInvalid: Negative buffer resize: -40 (looks like an Arrow bug)
885-
# assert ak.str.slice(string, 1).tolist() == [
886-
# ["αβγ"[1:], ""[1:]],
887-
# [],
888-
# ["→δε←"[1:], "ζz zζ"[1:], "abc"[1:]],
889-
# ]
890-
# assert (
891-
# ak.str.slice(string, 1).layout.form
892-
# == ak.str.slice(ak.to_backend(string, "typetracer"), 1).layout.form
893-
# )
884+
assert ak.str.slice(string, 1).tolist() == [
885+
["αβγ"[1:], ""[1:]],
886+
[],
887+
["→δε←"[1:], "ζz zζ"[1:], "abc"[1:]],
888+
]
889+
assert (
890+
ak.str.slice(string, 1).layout.form
891+
== ak.str.slice(ak.to_backend(string, "typetracer"), 1).layout.form
892+
)
894893
assert ak.str.slice(bytestring, 1).tolist() == [
895894
["αβγ".encode()[1:], b""[1:]],
896895
[],

0 commit comments

Comments
 (0)