diff --git a/initialization.go b/initialization.go index ff995d0..24b40d3 100644 --- a/initialization.go +++ b/initialization.go @@ -16,12 +16,13 @@ package regex import ( _ "embed" + "github.com/tetratelabs/wazero" ) // Embedded data that will be loaded into our WASM runtime var ( //go:embed icu/wasm/icu.wasm - icuWasm []byte // This is generated using the "build.sh" script in the "icu" folder + icuWasm []byte // This is generated using the "build.sh" script in the "icu" folder icuConfig = wazero.NewModuleConfig() ) diff --git a/pool.go b/pool.go index eadee47..b85a977 100644 --- a/pool.go +++ b/pool.go @@ -16,12 +16,13 @@ package regex import ( "context" - "github.com/tetratelabs/wazero" - "github.com/tetratelabs/wazero/api" - "github.com/tetratelabs/wazero/imports/wasi_snapshot_preview1" "reflect" "runtime" "sync" + + "github.com/tetratelabs/wazero" + "github.com/tetratelabs/wazero/api" + "github.com/tetratelabs/wazero/imports/wasi_snapshot_preview1" ) // modulePool is the pool that is used internally by the project. diff --git a/regex.go b/regex.go index 1145df9..77b8219 100644 --- a/regex.go +++ b/regex.go @@ -33,12 +33,19 @@ type Regex interface { // SetMatchString sets the string that we will either be matching against, or executing the replacements on. This // must be called after SetRegexString, but before any other calls. SetMatchString(ctx context.Context, matchStr string) error + // IndexOf returns the index of the previously-set regex matching the previously-set match string. Must call + // SetRegexString and SetMatchString before this function. `endIndex` determines whether the returned index is at + // the beginning or end of the match. `start` and `occurrence` start at 1, not 0. Returns 0 if the index was not found. + IndexOf(ctx context.Context, start int, occurrence int, endIndex bool) (int, error) // Matches returns whether the previously-set regex matches the previously-set match string. Must call // SetRegexString and SetMatchString before this function. Matches(ctx context.Context, start int, occurrence int) (bool, error) // Replace returns a new string with the replacement string occupying the matched portions of the match string, // based on the regex. Position starts at 1, not 0. Must call SetRegexString and SetMatchString before this function. Replace(ctx context.Context, replacementStr string, position int, occurrence int) (string, error) + // Substring returns the match of the previously-set match string, using the previously-set regex. Must call + // SetRegexString and SetMatchString before this function. `start` and `occurrence` start at 1, not 0. + Substring(ctx context.Context, start int, occurrence int) (string, bool, error) // StringBufferSize returns the size of the string buffers, in bytes. If the string buffer is not being used, then // this returns zero. StringBufferSize() uint32 @@ -280,6 +287,56 @@ func (pr *privateRegex) SetMatchString(ctx context.Context, matchStr string) (er return nil } +// IndexOf implements the interface Regex. +func (pr *privateRegex) IndexOf(ctx context.Context, start int, occurrence int, endIndex bool) (int, error) { + // Check for the regex pointer first + if pr.regexPtr == 0 { + return 0, ErrRegexNotYetSet.New() + } + + // Check that the match string has been set + if pr.matchStrUPtr == 0 { + return 0, ErrMatchNotYetSet.New() + } + + // Look for a match + var errorCode UErrorCode + ok, err := pr.uregex_find(ctx, pr.regexPtr, start-1, &errorCode) + if err != nil { + return 0, err + } + for i := 1; i < occurrence && ok; i++ { + ok, err = pr.uregex_findNext(ctx, pr.regexPtr, &errorCode) + if err != nil { + return 0, err + } + } + if !ok { + return 0, nil + } + + // Get the index of the match + var index int + if endIndex { + index32, err := pr.uregex_end(ctx, pr.regexPtr, 0, &errorCode) + if err != nil { + return 0, err + } + index = int(index32) + } else { + index32, err := pr.uregex_start(ctx, pr.regexPtr, 0, &errorCode) + if err != nil { + return 0, err + } + index = int(index32) + } + if errorCode > 0 { + return 0, fmt.Errorf("unexpected UErrorCode from uregex_find/uregex_findNext: %d", errorCode) + } + + return index + 1, nil +} + // Matches implements the interface Regex. func (pr *privateRegex) Matches(ctx context.Context, start int, occurrence int) (ok bool, err error) { // Check for the regex pointer first @@ -354,6 +411,54 @@ func (pr *privateRegex) Replace(ctx context.Context, replacementStr string, star return fromUTF16(returnStrBytes), nil } +// Substring implements the interface Regex. +func (pr *privateRegex) Substring(ctx context.Context, start int, occurrence int) (string, bool, error) { + // Check for the regex pointer first + if pr.regexPtr == 0 { + return "", false, ErrRegexNotYetSet.New() + } + + // Check that the match string has been set + if pr.matchStrUPtr == 0 { + return "", false, ErrMatchNotYetSet.New() + } + + // Look for a match + var errorCode UErrorCode + ok, err := pr.uregex_find(ctx, pr.regexPtr, start-1, &errorCode) + if err != nil { + return "", false, err + } + for i := 1; i < occurrence && ok; i++ { + ok, err = pr.uregex_findNext(ctx, pr.regexPtr, &errorCode) + if err != nil { + return "", false, err + } + } + if !ok { + return "", false, nil + } + + // Get the bounds of the match + idxStart, err := pr.uregex_start(ctx, pr.regexPtr, 0, &errorCode) + if err != nil { + return "", false, err + } + idxEnd, err := pr.uregex_end(ctx, pr.regexPtr, 0, &errorCode) + if err != nil { + return "", false, err + } + if errorCode > 0 { + return "", false, fmt.Errorf("unexpected UErrorCode from uregex_find/uregex_findNext: %d", errorCode) + } + + returnStrBytes, ok := pr.mod.Memory().Read(uint32(pr.matchStrUPtr)+uint32(idxStart*2), uint32((idxEnd-idxStart)*2)) + if !ok { + return "", false, fmt.Errorf("somehow failed when retrieving the substring") + } + return fromUTF16(returnStrBytes), true, nil +} + // StringBufferSize implements the interface Regex. func (pr *privateRegex) StringBufferSize() uint32 { return pr.bufferSize diff --git a/regex_test.go b/regex_test.go index e62c4f5..60ce1f1 100644 --- a/regex_test.go +++ b/regex_test.go @@ -97,3 +97,93 @@ func TestRegexReplace(t *testing.T) { require.Equal(t, "X X X", replacedStr) require.NoError(t, regex.Close()) } + +func TestRegexIndexOf(t *testing.T) { + ctx := context.Background() + regex := CreateRegex(1024) + require.NoError(t, regex.SetRegexString(ctx, `[a-j]+`, RegexFlags_None)) + err := regex.SetMatchString(ctx, "abc def ghi") + require.NoError(t, err) + idx, err := regex.IndexOf(ctx, 1, 1, false) + require.NoError(t, err) + require.Equal(t, 1, idx) + idx, err = regex.IndexOf(ctx, 4, 1, false) + require.NoError(t, err) + require.Equal(t, 5, idx) + idx, err = regex.IndexOf(ctx, 8, 1, false) + require.NoError(t, err) + require.Equal(t, 9, idx) + idx, err = regex.IndexOf(ctx, 1, 2, false) + require.NoError(t, err) + require.Equal(t, 5, idx) + idx, err = regex.IndexOf(ctx, 1, 3, false) + require.NoError(t, err) + require.Equal(t, 9, idx) + idx, err = regex.IndexOf(ctx, 1, 4, false) + require.NoError(t, err) + require.Equal(t, 0, idx) + idx, err = regex.IndexOf(ctx, 1, 1, true) + require.NoError(t, err) + require.Equal(t, 4, idx) + idx, err = regex.IndexOf(ctx, 4, 1, true) + require.NoError(t, err) + require.Equal(t, 8, idx) + idx, err = regex.IndexOf(ctx, 8, 1, true) + require.NoError(t, err) + require.Equal(t, 12, idx) + idx, err = regex.IndexOf(ctx, 1, 2, true) + require.NoError(t, err) + require.Equal(t, 8, idx) + idx, err = regex.IndexOf(ctx, 1, 3, true) + require.NoError(t, err) + require.Equal(t, 12, idx) + idx, err = regex.IndexOf(ctx, 1, 4, true) + require.NoError(t, err) + require.Equal(t, 0, idx) + require.NoError(t, regex.SetMatchString(ctx, "klmno fghij abcde")) + idx, err = regex.IndexOf(ctx, 1, 1, false) + require.NoError(t, err) + require.Equal(t, 7, idx) + idx, err = regex.IndexOf(ctx, 1, 1, true) + require.NoError(t, err) + require.Equal(t, 12, idx) + require.NoError(t, regex.Close()) +} + +func TestRegexSubstring(t *testing.T) { + ctx := context.Background() + regex := CreateRegex(1024) + require.NoError(t, regex.SetRegexString(ctx, `[a-z]+`, RegexFlags_None)) + err := regex.SetMatchString(ctx, "abc def ghi") + require.NoError(t, err) + substr, ok, err := regex.Substring(ctx, 1, 1) + require.NoError(t, err) + require.True(t, ok) + require.Equal(t, "abc", substr) + substr, ok, err = regex.Substring(ctx, 4, 1) + require.NoError(t, err) + require.True(t, ok) + require.Equal(t, "def", substr) + substr, ok, err = regex.Substring(ctx, 8, 1) + require.NoError(t, err) + require.True(t, ok) + require.Equal(t, "ghi", substr) + substr, ok, err = regex.Substring(ctx, 1, 2) + require.NoError(t, err) + require.True(t, ok) + require.Equal(t, "def", substr) + substr, ok, err = regex.Substring(ctx, 1, 3) + require.NoError(t, err) + require.True(t, ok) + require.Equal(t, "ghi", substr) + substr, ok, err = regex.Substring(ctx, 1, 4) + require.NoError(t, err) + require.False(t, ok) + require.Equal(t, "", substr) + require.NoError(t, regex.SetMatchString(ctx, "ghx dey abz")) + substr, ok, err = regex.Substring(ctx, 1, 1) + require.NoError(t, err) + require.True(t, ok) + require.Equal(t, "ghx", substr) + require.NoError(t, regex.Close()) +}