Skip to content

Commit 5262030

Browse files
committed
support for regex filter transforms via the ~/regex/ syntax
1 parent 49a824f commit 5262030

File tree

17 files changed

+388
-169
lines changed

17 files changed

+388
-169
lines changed

tessellate-main/src/main/antora/modules/reference/pages/transforms.adoc

Lines changed: 23 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,8 @@ Data files, or objects, have paths and names.
1111
Field values can be parsed from the paths and embedded in the tuple stream as fields.
1212
This is common when data has been partitioned into files where common values (like month and/or day) can be embedded in the path name to help select relevant files (push down predicates are applied to path values by many query engines).
1313

14-
Declared fields in a pipeline have the following format: `<field_name>|<field_type>`, where `<field_name>` is a string, or an ordinal (number representing the position).
14+
Declared fields in a pipeline have the following format:
15+
`<field_name>|<field_type>`, where `<field_name>` is a string, or an ordinal (number representing the position).
1516
The `<field_name>` may be quoted with single quotes (`'`).
1617

1718
`<field_type>` is optional, depending on the use. `<field_type>` further may be formatted as `<type>|<metadata>`.
@@ -20,8 +21,27 @@ The actual supported types and associated metadata are described in xref:types.a
2021

2122
== Transforms
2223

23-
Transforms manipulate the tuple stream.
24-
They are applied to every tuple in the tuple stream.
24+
Transforms manipulate the tuple stream either by removing (filtering) a given tuple from the stream or by changing the values in any given tuple in the stream.
25+
26+
=== Filters
27+
28+
A tuple is retained in the stream only if the given predicate expression returns true for every one of the filter's arguments.
29+
30+
==== Expressions
31+
32+
If the expression is a regular expression, it will be matched against each argument individually, after that argument has been coerced into a String.
33+
If any value (after coercion) is `null`, an empty string ("") will be passed to the regular expression matcher.
34+
35+
All values must match to retain the tuple.
36+
37+
One argument:: `from_field1 ~/expression/`
38+
Many arguments:: `from_field1 + from_field2 + from_fieldN ~/expression/`
39+
40+
Note there is no operator after the `~/expression/`; this indicates the statement is a filter.
41+
42+
Note a `/` can be escaped with `//` in any expression.
43+
44+
=== Operators
2545

2646
Insert literal:: Insert a literal value into a field.
2747
Coerce field:: Transform a field, in every tuple.
@@ -30,8 +50,6 @@ Rename field:: Rename a field, optionally coercing its type.
3050
Discard field:: Remove a field.
3151
Apply function:: Apply intrinsic functions against one or more fields.
3252

33-
=== Operators
34-
3553
There are three transform operators:
3654

3755
`pass:[=>]`:: Assign a literal value to a new field.

tessellate-main/src/main/java/io/clusterless/tessellate/factory/TapFactories.java

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -99,6 +99,11 @@ public static SourceFactory findSourceFactory(List<URI> uris, Format format, Com
9999

100100
public static SinkFactory findSinkFactory(Sink sinkModel) {
101101
List<URI> inputUris = sinkModel.uris();
102+
103+
if (inputUris.isEmpty()) {
104+
return null;
105+
}
106+
102107
Format format = sinkModel.schema().format();
103108
Compression compression = sinkModel.schema().compression();
104109
SinkFactory sinkFactory = findSinkFactory(inputUris, format, compression);

tessellate-main/src/main/java/io/clusterless/tessellate/parser/StatementParser.java

Lines changed: 26 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -80,6 +80,25 @@ private static Parser<Void> op(String op) {
8080
(Map::entry)
8181
);
8282

83+
// Accepts any Java regex, where '/' is escaped as '//'
84+
private static final Parser<RegExp> REGEX = Parsers.sequence(
85+
Scanners.string("~/"),
86+
Parsers.or(
87+
Scanners.string("//"), // the retn() method doesn't work here
88+
Scanners.notChar('/')
89+
)
90+
.many().source(),
91+
Scanners.isChar('/'),
92+
(unused, regex, unused2) -> new RegExp(regex.replace("//", "/"))
93+
);
94+
95+
private static final Parser<FilterStatement> REGEX_FILTER =
96+
Parsers.sequence(
97+
FieldParser.FIELD_LIST.followedBy(Scanners.many(IS_WHITESPACE)),
98+
REGEX.followedBy(EOF),
99+
FilterStatement::new
100+
);
101+
83102
private static final Parser<Map<String, String>> PARAMS =
84103
PARAM_ENTRY.sepBy(PARAM_DELIM).map(l -> l.stream().collect(Collectors.toMap(Map.Entry::getKey, Map.Entry::getValue)));
85104

@@ -148,18 +167,18 @@ private static Parser<Void> op(String op) {
148167
UnaryOperation::new
149168
);
150169

151-
public static Parser<Assignment> LITERAL_ASSIGNMENT =
170+
public static Parser<AssignmentStatement> LITERAL_ASSIGNMENT =
152171
Parsers.or(
153172
Parsers.sequence(
154173
LITERAL_VALUE,
155174
ASSIGNMENT.followedBy(Scanners.many(IS_WHITESPACE)),
156175
FieldParser.fullFieldDeclaration.followedBy(EOF),
157-
Assignment::new
176+
AssignmentStatement::new
158177
),
159178
Parsers.sequence(
160179
ASSIGNMENT.followedBy(Scanners.many(IS_WHITESPACE)),
161180
FieldParser.fullFieldDeclaration.followedBy(EOF),
162-
Assignment::new
181+
AssignmentStatement::new
163182
)
164183
);
165184

@@ -192,22 +211,23 @@ private static Parser<Void> op(String op) {
192211
(type, unused) -> type
193212
);
194213

195-
public static Parser<Join> JOIN_STATEMENT =
214+
public static Parser<JoinStatement> JOIN_STATEMENT =
196215
Parsers.or(
197216
Parsers.sequence(
198217
FieldParser.RELATION_LIST.followedBy(Scanners.many(IS_WHITESPACE)),
199218
JOIN.followedBy(Scanners.many(IS_WHITESPACE)),
200219
Parsers.or(RETAIN, DISCARD).followedBy(Scanners.many(IS_WHITESPACE)),
201220
FieldParser.FIELD_LIST.followedBy(EOF),
202-
Join::new
221+
JoinStatement::new
203222
),
204223
Parsers.sequence(
205224
FieldParser.RELATION_LIST.followedBy(Scanners.many(IS_WHITESPACE)),
206225
JOIN.followedBy(EOF),
207-
Join::new
226+
JoinStatement::new
208227
));
209228

210229
public static Parser<Statement> STATEMENTS = Parsers.or(
230+
REGEX_FILTER,
211231
LITERAL_ASSIGNMENT,
212232
TRANSFORM_COERCE,
213233
TRANSFORM_DISCARD,

tessellate-main/src/main/java/io/clusterless/tessellate/parser/ast/Assignment.java renamed to tessellate-main/src/main/java/io/clusterless/tessellate/parser/ast/AssignmentStatement.java

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -11,18 +11,18 @@
1111
import com.google.common.base.Joiner;
1212
import io.clusterless.tessellate.parser.Printer;
1313

14-
public class Assignment implements Statement {
14+
public class AssignmentStatement implements Statement {
1515
String literal;
1616
Op op;
1717
Field result;
1818

19-
public Assignment(String literal, Op op, Field result) {
19+
public AssignmentStatement(String literal, Op op, Field result) {
2020
this.literal = literal;
2121
this.op = op;
2222
this.result = result;
2323
}
2424

25-
public Assignment(Op op, Field result) {
25+
public AssignmentStatement(Op op, Field result) {
2626
this.literal = null;
2727
this.op = op;
2828
this.result = result;
@@ -42,8 +42,8 @@ public Field result() {
4242

4343
@Override
4444
public String toString() {
45-
return Joiner.on("")
46-
.useForNull("")
45+
return Joiner.on(" ")
46+
.skipNulls()
4747
.join(
4848
Printer.literal(literal()),
4949
op(),
Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,47 @@
1+
/*
2+
* Copyright (c) 2023-2025 Chris K Wensel <[email protected]>. All Rights Reserved.
3+
*
4+
* This Source Code Form is subject to the terms of the Mozilla Public
5+
* License, v. 2.0. If a copy of the MPL was not distributed with this
6+
* file, You can obtain one at http://mozilla.org/MPL/2.0/.
7+
*/
8+
9+
package io.clusterless.tessellate.parser.ast;
10+
11+
import com.google.common.base.Joiner;
12+
import io.clusterless.tessellate.parser.Printer;
13+
14+
import java.util.List;
15+
16+
public class FilterStatement implements Statement {
17+
List<Field> arguments;
18+
Exp exp;
19+
20+
public FilterStatement(List<Field> arguments, Exp exp) {
21+
this.arguments = arguments;
22+
this.exp = exp;
23+
}
24+
25+
public List<Field> arguments() {
26+
return arguments;
27+
}
28+
29+
public <T extends Exp> T exp() {
30+
return (T) exp;
31+
}
32+
33+
@Override
34+
public Op op() {
35+
return null;
36+
}
37+
38+
@Override
39+
public String toString() {
40+
return Joiner.on(" ")
41+
.skipNulls()
42+
.join(
43+
Printer.fields(arguments),
44+
exp()
45+
);
46+
}
47+
}

tessellate-main/src/main/java/io/clusterless/tessellate/parser/ast/Join.java renamed to tessellate-main/src/main/java/io/clusterless/tessellate/parser/ast/JoinStatement.java

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -10,20 +10,20 @@
1010

1111
import java.util.List;
1212

13-
public class Join implements Statement {
13+
public class JoinStatement implements Statement {
1414
private final List<Rel> relations;
1515
private final JoinType joinType;
1616
private final Op op;
1717
private final List<Field> results;
1818

19-
public Join(List<Rel> relations, JoinType joinType) {
19+
public JoinStatement(List<Rel> relations, JoinType joinType) {
2020
this.relations = relations;
2121
this.joinType = joinType;
2222
this.op = new Op();
2323
this.results = List.of();
2424
}
2525

26-
public Join(List<Rel> relations, JoinType joinType, Op op, List<Field> results) {
26+
public JoinStatement(List<Rel> relations, JoinType joinType, Op op, List<Field> results) {
2727
this.relations = relations;
2828
this.joinType = joinType;
2929
this.op = op;

tessellate-main/src/main/java/io/clusterless/tessellate/parser/ast/Operation.java

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@
1717
public class Operation implements Statement {
1818
List<Field> arguments = Collections.emptyList();
1919
Exp exp;
20-
Op op = new Op();
20+
Op op;
2121
List<Field> results = Collections.emptyList();
2222

2323
public Operation(List<Field> arguments, Exp exp, Op op, List<Field> results) {
@@ -66,13 +66,13 @@ public List<Field> results() {
6666

6767
@Override
6868
public String toString() {
69-
return Joiner.on("")
70-
.useForNull("")
69+
return Joiner.on(" ")
70+
.skipNulls()
7171
.join(
72-
Printer.fields(arguments),
72+
arguments.isEmpty() ? null : Printer.fields(arguments),
7373
exp(),
7474
op(),
75-
Printer.fields(results)
75+
results.isEmpty() ? null : Printer.fields(results)
7676
);
7777
}
7878
}
Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,26 @@
1+
/*
2+
* Copyright (c) 2023-2025 Chris K Wensel <[email protected]>. All Rights Reserved.
3+
*
4+
* This Source Code Form is subject to the terms of the Mozilla Public
5+
* License, v. 2.0. If a copy of the MPL was not distributed with this
6+
* file, You can obtain one at http://mozilla.org/MPL/2.0/.
7+
*/
8+
9+
package io.clusterless.tessellate.parser.ast;
10+
11+
public class RegExp implements Exp {
12+
String pattern;
13+
14+
public RegExp(String pattern) {
15+
this.pattern = pattern;
16+
}
17+
18+
public String pattern() {
19+
return pattern;
20+
}
21+
22+
@Override
23+
public String toString() {
24+
return "~/" + pattern + "/";
25+
}
26+
}

tessellate-main/src/main/java/io/clusterless/tessellate/parser/ast/Statement.java

Lines changed: 13 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,19 @@
1111
public interface Statement {
1212
Op op();
1313

14+
default String opString() {
15+
if (op() == null) {
16+
return "";
17+
}
18+
19+
return op().op();
20+
}
21+
1422
default boolean isJoin() {
15-
return this instanceof Join;
23+
return this instanceof JoinStatement;
24+
}
25+
26+
default boolean isFilter() {
27+
return this instanceof FilterStatement;
1628
}
1729
}

tessellate-main/src/main/java/io/clusterless/tessellate/pipeline/Pipeline.java

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,7 @@
2929
import io.clusterless.tessellate.model.*;
3030
import io.clusterless.tessellate.options.PipelineOptions;
3131
import io.clusterless.tessellate.options.PrintOptions;
32-
import io.clusterless.tessellate.parser.ast.Join;
32+
import io.clusterless.tessellate.parser.ast.JoinStatement;
3333
import io.clusterless.tessellate.parser.ast.Rel;
3434
import io.clusterless.tessellate.parser.ast.Statement;
3535
import io.clusterless.tessellate.printer.SchemaPrinter;
@@ -288,14 +288,14 @@ private Source findPrimarySource() {
288288
return source;
289289
}
290290

291-
List<Join> joins = pipelineDef.transform().statements(Join.class);
291+
List<JoinStatement> joins = pipelineDef.transform().statements(JoinStatement.class);
292292

293293
if (joins.isEmpty()) {
294294
throw new IllegalStateException("no source defined");
295295
}
296296

297297
List<String> names = joins.stream()
298-
.map(Join::rhsRelations)
298+
.map(JoinStatement::rhsRelations)
299299
.flatMap(List::stream)
300300
.map(Rel::name)
301301
.distinct()
@@ -309,10 +309,10 @@ private Source findPrimarySource() {
309309
}
310310

311311
private Map<String, Source> findSecondarySources() {
312-
List<Join> joins = pipelineDef.transform().statements(Join.class);
312+
List<JoinStatement> joins = pipelineDef.transform().statements(JoinStatement.class);
313313

314314
List<String> names = joins.stream()
315-
.map(Join::lhsRelations)
315+
.map(JoinStatement::lhsRelations)
316316
.flatMap(List::stream)
317317
.map(Rel::name)
318318
.distinct()

0 commit comments

Comments
 (0)