
Commit a6f5377

Author: Tavian Barnes
Merge pull request #18 from microsoft/tokenization-slice
token: Don't trim Tokenization text when slicing
2 parents 684ec3f + 7c401b6 commit a6f5377
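
What changed: slicing a Tokenization no longer trims the underlying text or rebases token offsets. The slice shares the same text, and substring() / textBounds() / originalBounds() (Python: substring() / text_bounds() / original_bounds()) gain no-argument forms that default to the span from the first to the last token. A rough sketch of the new slicing behavior, reusing the doctest added to python/bistring/_token.py below; the `sliced.text == tokens.text` line and its True result are my own assumption about the new semantics, not part of the diff:

    >>> from bistring import Tokenization
    >>> tokens = Tokenization.infer(
    ...     "The quick, brown fox",
    ...     ["The", "quick", "brown", "fox"],
    ... )
    >>> sliced = tokens[1:-1]
    >>> sliced.text == tokens.text   # the text is no longer trimmed
    True
    >>> sliced[0]                    # token offsets are unchanged
    Token(bistr('quick'), start=4, end=9)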

4 files changed: 71 additions, 99 deletions

4 files changed

+71
-99
lines changed

js/src/token.ts (10 additions, 30 deletions)

@@ -143,54 +143,34 @@ export class Tokenization {
      * The requested slice as a new `Tokenization`.
      */
     slice(start?: number, end?: number): Tokenization {
-        if (start === undefined) {
-            return new Tokenization(this.text, this.tokens);
-        }
-        if (end === undefined) {
-            end = this.length;
-        }
-        if (start < 0) {
-            start += this.length;
-        }
-        if (end < 0) {
-            end += this.length;
-        }
-        if (end < start) {
-            end = start;
-        }
-
-        const substring = this.substring(start, end);
-        const tokens = this.tokens.slice(start, end);
-        if (tokens.length > 0) {
-            const delta = tokens[0].start;
-            for (const i in tokens) {
-                const token = tokens[i];
-                tokens[i] = new Token(token.text, token.start - delta, token.end - delta);
-            }
-        }
-
-        return new Tokenization(substring, tokens);
+        return new Tokenization(this.text, this.tokens.slice(start, end));
     }

     /**
      * Map a span of tokens to the corresponding substring.
      */
-    substring(start: number, end: number): BiString {
+    substring(start?: number, end?: number): BiString {
         const [first, last] = this.textBounds(start, end);
         return this.text.substring(first, last);
     }

     /**
      * Map a span of tokens to the bounds of the corresponding text.
      */
-    textBounds(start: number, end: number): Bounds {
+    textBounds(start?: number, end?: number): Bounds {
+        if (start === undefined) {
+            start = 0;
+        }
+        if (end === undefined) {
+            end = this.length;
+        }
         return this.alignment.originalBounds(start, end);
     }

     /**
      * Map a span of tokens to the bounds of the corresponding original text.
      */
-    originalBounds(start: number, end: number): Bounds {
+    originalBounds(start?: number, end?: number): Bounds {
         return this.text.alignment.originalBounds(this.textBounds(start, end));
     }
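
Callers that previously relied on the trimmed text of a slice now ask for substring() with no arguments instead, as the updated tests below show. A minimal sketch of that idiom against the Python bistr API, which this TypeScript change mirrors; the example string and token offsets are my own, chosen for illustration:

    >>> from bistring import bistr, Token, Tokenization
    >>> text = bistr("The quick brown fox")
    >>> tokens = Tokenization(text, [
    ...     Token.slice(text, 0, 3),    # "The"
    ...     Token.slice(text, 4, 9),    # "quick"
    ...     Token.slice(text, 10, 15),  # "brown"
    ...     Token.slice(text, 16, 19),  # "fox"
    ... ])
    >>> tokens[0:2].substring()   # previously spelled tokens[0:2].text
    bistr('The quick')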

js/tests/token.test.ts (16 additions, 27 deletions)

@@ -8,39 +8,28 @@ import BiString, { Token, Tokenization, RegExpTokenizer, SplittingTokenizer } fr
 test("Tokenization", () => {
     let text = new BiString(" The quick, brown fox jumps over the lazy dog ");
     text = text.replace(",", "");
+    text = text.replace(/^ +| +$/g, "");

     let tokens = new Tokenization(text, [
-        Token.slice(text, 2, 5),
-        Token.slice(text, 6, 11),
-        Token.slice(text, 12, 17),
-        Token.slice(text, 18, 21),
-        Token.slice(text, 22, 27),
-        Token.slice(text, 28, 32),
-        Token.slice(text, 33, 36),
-        Token.slice(text, 37, 41),
-        Token.slice(text, 42, 45),
+        Token.slice(text, 0, 3),
+        Token.slice(text, 4, 9),
+        Token.slice(text, 10, 15),
+        Token.slice(text, 16, 19),
+        Token.slice(text, 20, 25),
+        Token.slice(text, 26, 30),
+        Token.slice(text, 31, 34),
+        Token.slice(text, 35, 39),
+        Token.slice(text, 40, 43),
     ]);
     expect(tokens.text.equals(text)).toBe(true);
-    expect(tokens.textBounds(1, 3)).toEqual([6, 17]);
+    expect(tokens.textBounds(1, 3)).toEqual([4, 15]);
     expect(tokens.originalBounds(1, 3)).toEqual([6, 18]);
     expect(tokens.boundsForText(0, 13)).toEqual([0, 3]);
     expect(tokens.boundsForOriginal(0, 13)).toEqual([0, 2]);
-    expect(tokens.sliceByText(36, 47).text.equals(new BiString("lazy dog"))).toBe(true);
-    expect(tokens.sliceByOriginal(36, 48).text.equals(new BiString("the lazy dog"))).toBe(true);
-    expect(tokens.snapTextBounds(1, 13)).toEqual([2, 17]);
+    expect(tokens.sliceByText(34, 43).substring().equals(new BiString("lazy dog"))).toBe(true);
+    expect(tokens.sliceByOriginal(36, 48).substring().equals(new BiString("the lazy dog"))).toBe(true);
+    expect(tokens.snapTextBounds(2, 13)).toEqual([0, 15]);
     expect(tokens.snapOriginalBounds(36, 47)).toEqual([34, 46]);
-
-    tokens = tokens.slice(1, -1);
-    expect(tokens.text.original).toBe("quick, brown fox jumps over the lazy");
-    expect(tokens.text.modified).toBe("quick brown fox jumps over the lazy");
-    expect(tokens.textBounds(1, 3)).toEqual([6, 15]);
-    expect(tokens.originalBounds(1, 3)).toEqual([7, 16]);
-    expect(tokens.boundsForText(8, 14)).toEqual([1, 3]);
-    expect(tokens.boundsForOriginal(9, 15)).toEqual([1, 3]);
-    expect(tokens.sliceByText(8, 14).text.equals(new BiString("brown fox"))).toBe(true);
-    expect(tokens.sliceByOriginal(9, 15).text.equals(new BiString("brown fox"))).toBe(true);
-    expect(tokens.snapTextBounds(8, 14)).toEqual([6, 15]);
-    expect(tokens.snapOriginalBounds(9, 15)).toEqual([7, 16]);
 });

 test("Tokenization.infer", () => {

@@ -60,7 +49,7 @@ test("RegExpTokenizer", () => {
     expect(tokens.text).toBe(text);
     expect(tokens.length).toBe(9);
     expect(tokens.textBounds(0, 2)).toEqual([1, 10]);
-    expect(tokens.slice(0, 2).text.equals(text.slice(1, 10))).toBe(true);
+    expect(tokens.slice(0, 2).substring().equals(text.slice(1, 10))).toBe(true);
     expect(tokens.sliceByText(5, 10).length).toBe(1);
     expect(tokens.sliceByText(5, 11).length).toBe(1);
     expect(tokens.sliceByText(3, 13).length).toBe(3);

@@ -75,7 +64,7 @@ test("SplittingTokenizer", () => {
     expect(tokens.text).toBe(text);
     expect(tokens.length).toBe(9);
     expect(tokens.textBounds(0, 2)).toEqual([1, 11]);
-    expect(tokens.slice(0, 2).text.equals(text.slice(1, 11))).toBe(true);
+    expect(tokens.slice(0, 2).substring().equals(text.slice(1, 11))).toBe(true);
     expect(tokens.sliceByText(5, 10).length).toBe(1);
     expect(tokens.sliceByText(5, 11).length).toBe(1);
     expect(tokens.sliceByText(3, 13).length).toBe(3);

python/bistring/_token.py (28 additions, 14 deletions)

@@ -178,17 +178,25 @@ def __getitem__(self, index: int) -> Token: ...
     def __getitem__(self, index: slice) -> Tokenization: ...

     def __getitem__(self, index: Index) -> Union[Token, Tokenization]:
+        r"""
+        Indexing a `Tokenization` returns the nth token:
+
+            >>> tokens = Tokenization.infer(
+            ...     "The quick, brown fox",
+            ...     ["The", "quick", "brown", "fox"],
+            ... )
+            >>> tokens[0]
+            Token(bistr('The'), start=0, end=3)
+
+        Slicing a `Tokenization` returns a new one with the requested slice of tokens:
+
+            >>> tokens = tokens[1:-1]
+            >>> tokens[0]
+            Token(bistr('quick'), start=4, end=9)
+        """
+
         if isinstance(index, slice):
-            start, stop, stride = index.indices(len(self))
-            if stride != 1:
-                raise ValueError('Non-unit strides not supported')
-
-            text = self.substring(start, stop)
-            tokens = self._tokens[index]
-            if tokens:
-                delta = tokens[0].start
-                tokens = [Token(t.text, t.start - delta, t.end - delta) for t in tokens]
-            return Tokenization(text, tokens)
+            return Tokenization(self.text, self._tokens[index])
         else:
             return self._tokens[index]

@@ -201,19 +209,25 @@ def __repr__(self) -> str:

     def substring(self, *args: AnyBounds) -> bistr:
         """
-        Map a span of tokens to the corresponding substring.
+        Map a span of tokens to the corresponding substring. With no arguments, returns the substring from the first
+        to the last token.
         """
-        return self.text[self.alignment.original_slice(*args)]
+        i, j = self.text_bounds(*args)
+        return self.text[i:j]

     def text_bounds(self, *args: AnyBounds) -> Bounds:
         """
-        Map a span of tokens to the bounds of the corresponding text.
+        Map a span of tokens to the bounds of the corresponding text. With no arguments, returns the bounds from the
+        first to the last token.
         """
+        if len(args) == 0:
+            args = (0, len(self))
         return self.alignment.original_bounds(*args)

     def original_bounds(self, *args: AnyBounds) -> Bounds:
         """
-        Map a span of tokens to the bounds of the corresponding original text.
+        Map a span of tokens to the bounds of the corresponding original text. With no arguments, returns the bounds
+        from the first to the last token.
         """
         return self.text.alignment.original_bounds(self.text_bounds(*args))
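
With no arguments, text_bounds() now defaults to the full token span (0, len(self)); substring() returns the text between those bounds, and original_bounds() pushes them through the text's own alignment. A small sketch under the same setup as the new docstring; the numeric results are hand-derived assumptions, and the two bounds coincide only because this input has an identity alignment:

    >>> tokens = Tokenization.infer(
    ...     "The quick, brown fox",
    ...     ["The", "quick", "brown", "fox"],
    ... )
    >>> middle = tokens[1:-1]
    >>> middle.text_bounds()       # same as middle.text_bounds(0, len(middle))
    (4, 16)
    >>> middle.substring()
    bistr('quick, brown')
    >>> middle.original_bounds()   # mapped through middle.text.alignment
    (4, 16)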

python/tests/test_token.py (17 additions, 28 deletions)

@@ -8,40 +8,29 @@
 def test_tokenization():
     text = bistr(' The quick, brown fox jumps over the lazy dog ')
     text = text.replace(',', '')
+    text = text.sub(r'^ +| +$', '')

     tokens = Tokenization(text, [
-        Token.slice(text, 2, 5),
-        Token.slice(text, 6, 11),
-        Token.slice(text, 12, 17),
-        Token.slice(text, 18, 21),
-        Token.slice(text, 22, 27),
-        Token.slice(text, 28, 32),
-        Token.slice(text, 33, 36),
-        Token.slice(text, 37, 41),
-        Token.slice(text, 42, 45),
+        Token.slice(text, 0, 3),
+        Token.slice(text, 4, 9),
+        Token.slice(text, 10, 15),
+        Token.slice(text, 16, 19),
+        Token.slice(text, 20, 25),
+        Token.slice(text, 26, 30),
+        Token.slice(text, 31, 34),
+        Token.slice(text, 35, 39),
+        Token.slice(text, 40, 43),
     ])
     assert tokens.text == text
-    assert tokens.text_bounds(1, 3) == (6, 17)
+    assert tokens.text_bounds(1, 3) == (4, 15)
     assert tokens.original_bounds(1, 3) == (6, 18)
     assert tokens.bounds_for_text(0, 13) == (0, 3)
     assert tokens.bounds_for_original(0, 13) == (0, 2)
-    assert tokens.slice_by_text(36, 47).text == bistr('lazy dog')
-    assert tokens.slice_by_original(36, 48).text == bistr('the lazy dog')
-    assert tokens.snap_text_bounds(1, 13) == (2, 17)
+    assert tokens.slice_by_text(34, 43).substring() == bistr('lazy dog')
+    assert tokens.slice_by_original(36, 48).substring() == bistr('the lazy dog')
+    assert tokens.snap_text_bounds(2, 13) == (0, 15)
     assert tokens.snap_original_bounds(36, 47) == (34, 46)

-    tokens = tokens[1:-1]
-    assert tokens.text.original == 'quick, brown fox jumps over the lazy'
-    assert tokens.text.modified == 'quick brown fox jumps over the lazy'
-    assert tokens.text_bounds(1, 3) == (6, 15)
-    assert tokens.original_bounds(1, 3) == (7, 16)
-    assert tokens.bounds_for_text(8, 14) == (1, 3)
-    assert tokens.bounds_for_original(9, 15) == (1, 3)
-    assert tokens.slice_by_text(8, 14).text == bistr('brown fox')
-    assert tokens.slice_by_original(9, 15).text == bistr('brown fox')
-    assert tokens.snap_text_bounds(8, 14) == (6, 15)
-    assert tokens.snap_original_bounds(9, 15) == (7, 16)
-

 def test_infer():
     text = 'the quick, brown fox'

@@ -65,7 +54,7 @@ def test_regex_tokenizer():
     assert tokens.text == text
     assert len(tokens) == 9
     assert tokens.text_bounds(0, 2) == (1, 10)
-    assert tokens[0:2].text == text[1:10]
+    assert tokens[0:2].substring() == text[1:10]
     assert len(tokens.slice_by_text(5, 10)) == 1
     assert len(tokens.slice_by_text(5, 11)) == 1
     assert len(tokens.slice_by_text(3, 13)) == 3

@@ -85,7 +74,7 @@ def test_splitting_tokenizer():
     assert tokens.text == text
     assert len(tokens) == 9
     assert tokens.text_bounds(0, 2) == (1, 11)
-    assert tokens[0:2].text == text[1:11]
+    assert tokens[0:2].substring() == text[1:11]
     assert len(tokens.slice_by_text(5, 10)) == 1
     assert len(tokens.slice_by_text(5, 11)) == 1
     assert len(tokens.slice_by_text(3, 13)) == 3

@@ -116,7 +105,7 @@ def test_word_tokenizer():
     assert tokens.text == text
     assert len(tokens) == 9
     assert tokens.text_bounds(0, 2) == (1, 10)
-    assert tokens[0:2].text == text[1:10]
+    assert tokens[0:2].substring() == text[1:10]
     assert len(tokens.slice_by_text(5, 10)) == 1
     assert len(tokens.slice_by_text(5, 11)) == 1
     assert len(tokens.slice_by_text(3, 13)) == 3
