
Commit a6f5377

Author: Tavian Barnes
Merge pull request #18 from microsoft/tokenization-slice
token: Don't trim Tokenization text when slicing
2 parents 684ec3f + 7c401b6 commit a6f5377
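
What changed: slicing a Tokenization no longer trims the underlying text or rebases token offsets. The slice shares the same text, and substring() / textBounds() / originalBounds() (Python: substring() / text_bounds() / original_bounds()) gain no-argument forms that default to the span from the first to the last token. A rough sketch of the new slicing behavior, reusing the doctest added to python/bistring/_token.py below; the `sliced.text == tokens.text` line and its True result are my own assumption about the new semantics, not part of the diff:

    >>> from bistring import Tokenization
    >>> tokens = Tokenization.infer(
    ...     "The quick, brown fox",
    ...     ["The", "quick", "brown", "fox"],
    ... )
    >>> sliced = tokens[1:-1]
    >>> sliced.text == tokens.text   # the text is no longer trimmed
    True
    >>> sliced[0]                    # token offsets are unchanged
    Token(bistr('quick'), start=4, end=9)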

4 files changed: 71 additions, 99 deletions

4 files changed

+71
-99
lines changed

js/src/token.ts (10 additions, 30 deletions)

@@ -143,54 +143,34 @@ export class Tokenization {
      * The requested slice as a new `Tokenization`.
      */
     slice(start?: number, end?: number): Tokenization {
-        if (start === undefined) {
-            return new Tokenization(this.text, this.tokens);
-        }
-        if (end === undefined) {
-            end = this.length;
-        }
-        if (start < 0) {
-            start += this.length;
-        }
-        if (end < 0) {
-            end += this.length;
-        }
-        if (end < start) {
-            end = start;
-        }
-
-        const substring = this.substring(start, end);
-        const tokens = this.tokens.slice(start, end);
-        if (tokens.length > 0) {
-            const delta = tokens[0].start;
-            for (const i in tokens) {
-                const token = tokens[i];
-                tokens[i] = new Token(token.text, token.start - delta, token.end - delta);
-            }
-        }
-
-        return new Tokenization(substring, tokens);
+        return new Tokenization(this.text, this.tokens.slice(start, end));
     }

     /**
      * Map a span of tokens to the corresponding substring.
      */
-    substring(start: number, end: number): BiString {
+    substring(start?: number, end?: number): BiString {
         const [first, last] = this.textBounds(start, end);
         return this.text.substring(first, last);
     }

     /**
      * Map a span of tokens to the bounds of the corresponding text.
      */
-    textBounds(start: number, end: number): Bounds {
+    textBounds(start?: number, end?: number): Bounds {
+        if (start === undefined) {
+            start = 0;
+        }
+        if (end === undefined) {
+            end = this.length;
+        }
         return this.alignment.originalBounds(start, end);
     }

     /**
      * Map a span of tokens to the bounds of the corresponding original text.
      */
-    originalBounds(start: number, end: number): Bounds {
+    originalBounds(start?: number, end?: number): Bounds {
         return this.text.alignment.originalBounds(this.textBounds(start, end));
     }
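
Callers that previously relied on the trimmed text of a slice now ask for substring() with no arguments instead, as the updated tests below show. A minimal sketch of that idiom against the Python bistr API, which this TypeScript change mirrors; the example string and token offsets are my own, chosen for illustration:

    >>> from bistring import bistr, Token, Tokenization
    >>> text = bistr("The quick brown fox")
    >>> tokens = Tokenization(text, [
    ...     Token.slice(text, 0, 3),    # "The"
    ...     Token.slice(text, 4, 9),    # "quick"
    ...     Token.slice(text, 10, 15),  # "brown"
    ...     Token.slice(text, 16, 19),  # "fox"
    ... ])
    >>> tokens[0:2].substring()   # previously spelled tokens[0:2].text
    bistr('The quick')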

js/tests/token.test.ts (16 additions, 27 deletions)

@@ -8,39 +8,28 @@ import BiString, { Token, Tokenization, RegExpTokenizer, SplittingTokenizer } fr
 test("Tokenization", () => {
     let text = new BiString(" The quick, brown fox jumps over the lazy dog ");
     text = text.replace(",", "");
+    text = text.replace(/^ +| +$/g, "");

     let tokens = new Tokenization(text, [
-        Token.slice(text, 2, 5),
-        Token.slice(text, 6, 11),
-        Token.slice(text, 12, 17),
-        Token.slice(text, 18, 21),
-        Token.slice(text, 22, 27),
-        Token.slice(text, 28, 32),
-        Token.slice(text, 33, 36),
-        Token.slice(text, 37, 41),
-        Token.slice(text, 42, 45),
+        Token.slice(text, 0, 3),
+        Token.slice(text, 4, 9),
+        Token.slice(text, 10, 15),
+        Token.slice(text, 16, 19),
+        Token.slice(text, 20, 25),
+        Token.slice(text, 26, 30),
+        Token.slice(text, 31, 34),
+        Token.slice(text, 35, 39),
+        Token.slice(text, 40, 43),
     ]);
     expect(tokens.text.equals(text)).toBe(true);
-    expect(tokens.textBounds(1, 3)).toEqual([6, 17]);
+    expect(tokens.textBounds(1, 3)).toEqual([4, 15]);
     expect(tokens.originalBounds(1, 3)).toEqual([6, 18]);
     expect(tokens.boundsForText(0, 13)).toEqual([0, 3]);
     expect(tokens.boundsForOriginal(0, 13)).toEqual([0, 2]);
-    expect(tokens.sliceByText(36, 47).text.equals(new BiString("lazy dog"))).toBe(true);
-    expect(tokens.sliceByOriginal(36, 48).text.equals(new BiString("the lazy dog"))).toBe(true);
-    expect(tokens.snapTextBounds(1, 13)).toEqual([2, 17]);
+    expect(tokens.sliceByText(34, 43).substring().equals(new BiString("lazy dog"))).toBe(true);
+    expect(tokens.sliceByOriginal(36, 48).substring().equals(new BiString("the lazy dog"))).toBe(true);
+    expect(tokens.snapTextBounds(2, 13)).toEqual([0, 15]);
     expect(tokens.snapOriginalBounds(36, 47)).toEqual([34, 46]);
-
-    tokens = tokens.slice(1, -1);
-    expect(tokens.text.original).toBe("quick, brown fox jumps over the lazy");
-    expect(tokens.text.modified).toBe("quick brown fox jumps over the lazy");
-    expect(tokens.textBounds(1, 3)).toEqual([6, 15]);
-    expect(tokens.originalBounds(1, 3)).toEqual([7, 16]);
-    expect(tokens.boundsForText(8, 14)).toEqual([1, 3]);
-    expect(tokens.boundsForOriginal(9, 15)).toEqual([1, 3]);
-    expect(tokens.sliceByText(8, 14).text.equals(new BiString("brown fox"))).toBe(true);
-    expect(tokens.sliceByOriginal(9, 15).text.equals(new BiString("brown fox"))).toBe(true);
-    expect(tokens.snapTextBounds(8, 14)).toEqual([6, 15]);
-    expect(tokens.snapOriginalBounds(9, 15)).toEqual([7, 16]);
 });

 test("Tokenization.infer", () => {

@@ -60,7 +49,7 @@ test("RegExpTokenizer", () => {
     expect(tokens.text).toBe(text);
     expect(tokens.length).toBe(9);
     expect(tokens.textBounds(0, 2)).toEqual([1, 10]);
-    expect(tokens.slice(0, 2).text.equals(text.slice(1, 10))).toBe(true);
+    expect(tokens.slice(0, 2).substring().equals(text.slice(1, 10))).toBe(true);
     expect(tokens.sliceByText(5, 10).length).toBe(1);
     expect(tokens.sliceByText(5, 11).length).toBe(1);
     expect(tokens.sliceByText(3, 13).length).toBe(3);

@@ -75,7 +64,7 @@ test("SplittingTokenizer", () => {
     expect(tokens.text).toBe(text);
     expect(tokens.length).toBe(9);
     expect(tokens.textBounds(0, 2)).toEqual([1, 11]);
-    expect(tokens.slice(0, 2).text.equals(text.slice(1, 11))).toBe(true);
+    expect(tokens.slice(0, 2).substring().equals(text.slice(1, 11))).toBe(true);
     expect(tokens.sliceByText(5, 10).length).toBe(1);
     expect(tokens.sliceByText(5, 11).length).toBe(1);
     expect(tokens.sliceByText(3, 13).length).toBe(3);

python/bistring/_token.py (28 additions, 14 deletions)

@@ -178,17 +178,25 @@ def __getitem__(self, index: int) -> Token: ...
     def __getitem__(self, index: slice) -> Tokenization: ...

     def __getitem__(self, index: Index) -> Union[Token, Tokenization]:
+        r"""
+        Indexing a `Tokenization` returns the nth token:
+
+            >>> tokens = Tokenization.infer(
+            ...     "The quick, brown fox",
+            ...     ["The", "quick", "brown", "fox"],
+            ... )
+            >>> tokens[0]
+            Token(bistr('The'), start=0, end=3)
+
+        Slicing a `Tokenization` returns a new one with the requested slice of tokens:
+
+            >>> tokens = tokens[1:-1]
+            >>> tokens[0]
+            Token(bistr('quick'), start=4, end=9)
+        """
+
         if isinstance(index, slice):
-            start, stop, stride = index.indices(len(self))
-            if stride != 1:
-                raise ValueError('Non-unit strides not supported')
-
-            text = self.substring(start, stop)
-            tokens = self._tokens[index]
-            if tokens:
-                delta = tokens[0].start
-                tokens = [Token(t.text, t.start - delta, t.end - delta) for t in tokens]
-            return Tokenization(text, tokens)
+            return Tokenization(self.text, self._tokens[index])
         else:
             return self._tokens[index]

@@ -201,19 +209,25 @@ def __repr__(self) -> str:

     def substring(self, *args: AnyBounds) -> bistr:
         """
-        Map a span of tokens to the corresponding substring.
+        Map a span of tokens to the corresponding substring. With no arguments, returns the substring from the first
+        to the last token.
         """
-        return self.text[self.alignment.original_slice(*args)]
+        i, j = self.text_bounds(*args)
+        return self.text[i:j]

     def text_bounds(self, *args: AnyBounds) -> Bounds:
         """
-        Map a span of tokens to the bounds of the corresponding text.
+        Map a span of tokens to the bounds of the corresponding text. With no arguments, returns the bounds from the
+        first to the last token.
         """
+        if len(args) == 0:
+            args = (0, len(self))
         return self.alignment.original_bounds(*args)

     def original_bounds(self, *args: AnyBounds) -> Bounds:
         """
-        Map a span of tokens to the bounds of the corresponding original text.
+        Map a span of tokens to the bounds of the corresponding original text. With no arguments, returns the bounds
+        from the first to the last token.
         """
         return self.text.alignment.original_bounds(self.text_bounds(*args))
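
With no arguments, text_bounds() now defaults to the full token span (0, len(self)); substring() returns the text between those bounds, and original_bounds() pushes them through the text's own alignment. A small sketch under the same setup as the new docstring; the numeric results are hand-derived assumptions, and the two bounds coincide only because this input has an identity alignment:

    >>> tokens = Tokenization.infer(
    ...     "The quick, brown fox",
    ...     ["The", "quick", "brown", "fox"],
    ... )
    >>> middle = tokens[1:-1]
    >>> middle.text_bounds()       # same as middle.text_bounds(0, len(middle))
    (4, 16)
    >>> middle.substring()
    bistr('quick, brown')
    >>> middle.original_bounds()   # mapped through middle.text.alignment
    (4, 16)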

python/tests/test_token.py (17 additions, 28 deletions)

@@ -8,40 +8,29 @@
 def test_tokenization():
     text = bistr(' The quick, brown fox jumps over the lazy dog ')
     text = text.replace(',', '')
+    text = text.sub(r'^ +| +$', '')

     tokens = Tokenization(text, [
-        Token.slice(text, 2, 5),
-        Token.slice(text, 6, 11),
-        Token.slice(text, 12, 17),
-        Token.slice(text, 18, 21),
-        Token.slice(text, 22, 27),
-        Token.slice(text, 28, 32),
-        Token.slice(text, 33, 36),
-        Token.slice(text, 37, 41),
-        Token.slice(text, 42, 45),
+        Token.slice(text, 0, 3),
+        Token.slice(text, 4, 9),
+        Token.slice(text, 10, 15),
+        Token.slice(text, 16, 19),
+        Token.slice(text, 20, 25),
+        Token.slice(text, 26, 30),
+        Token.slice(text, 31, 34),
+        Token.slice(text, 35, 39),
+        Token.slice(text, 40, 43),
     ])
     assert tokens.text == text
-    assert tokens.text_bounds(1, 3) == (6, 17)
+    assert tokens.text_bounds(1, 3) == (4, 15)
     assert tokens.original_bounds(1, 3) == (6, 18)
     assert tokens.bounds_for_text(0, 13) == (0, 3)
     assert tokens.bounds_for_original(0, 13) == (0, 2)
-    assert tokens.slice_by_text(36, 47).text == bistr('lazy dog')
-    assert tokens.slice_by_original(36, 48).text == bistr('the lazy dog')
-    assert tokens.snap_text_bounds(1, 13) == (2, 17)
+    assert tokens.slice_by_text(34, 43).substring() == bistr('lazy dog')
+    assert tokens.slice_by_original(36, 48).substring() == bistr('the lazy dog')
+    assert tokens.snap_text_bounds(2, 13) == (0, 15)
     assert tokens.snap_original_bounds(36, 47) == (34, 46)

-    tokens = tokens[1:-1]
-    assert tokens.text.original == 'quick, brown fox jumps over the lazy'
-    assert tokens.text.modified == 'quick brown fox jumps over the lazy'
-    assert tokens.text_bounds(1, 3) == (6, 15)
-    assert tokens.original_bounds(1, 3) == (7, 16)
-    assert tokens.bounds_for_text(8, 14) == (1, 3)
-    assert tokens.bounds_for_original(9, 15) == (1, 3)
-    assert tokens.slice_by_text(8, 14).text == bistr('brown fox')
-    assert tokens.slice_by_original(9, 15).text == bistr('brown fox')
-    assert tokens.snap_text_bounds(8, 14) == (6, 15)
-    assert tokens.snap_original_bounds(9, 15) == (7, 16)
-

 def test_infer():
     text = 'the quick, brown fox'

@@ -65,7 +54,7 @@ def test_regex_tokenizer():
     assert tokens.text == text
     assert len(tokens) == 9
     assert tokens.text_bounds(0, 2) == (1, 10)
-    assert tokens[0:2].text == text[1:10]
+    assert tokens[0:2].substring() == text[1:10]
     assert len(tokens.slice_by_text(5, 10)) == 1
     assert len(tokens.slice_by_text(5, 11)) == 1
     assert len(tokens.slice_by_text(3, 13)) == 3

@@ -85,7 +74,7 @@ def test_splitting_tokenizer():
     assert tokens.text == text
     assert len(tokens) == 9
     assert tokens.text_bounds(0, 2) == (1, 11)
-    assert tokens[0:2].text == text[1:11]
+    assert tokens[0:2].substring() == text[1:11]
     assert len(tokens.slice_by_text(5, 10)) == 1
     assert len(tokens.slice_by_text(5, 11)) == 1
     assert len(tokens.slice_by_text(3, 13)) == 3

@@ -116,7 +105,7 @@ def test_word_tokenizer():
     assert tokens.text == text
     assert len(tokens) == 9
     assert tokens.text_bounds(0, 2) == (1, 10)
-    assert tokens[0:2].text == text[1:10]
+    assert tokens[0:2].substring() == text[1:10]
     assert len(tokens.slice_by_text(5, 10)) == 1
     assert len(tokens.slice_by_text(5, 11)) == 1
     assert len(tokens.slice_by_text(3, 13)) == 3
