Skip to content

Commit 2152cd4

Browse files
committed
add new trimToNull intrinsic function
this will convert any empty strings or strings with only whitespace to a null value
1 parent a1d10c5 commit 2152cd4

File tree

5 files changed

+145
-0
lines changed

5 files changed

+145
-0
lines changed

tessellate-main/src/main/antora/modules/reference/pages/transforms.adoc

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -145,3 +145,7 @@ Def:::
145145
`sourcePath`:: Add the URI of the data currently being processed.
146146
Def:::
147147
- `^sourcePath{} -> to_field` - Assign the current URI to `to_field`
148+
149+
`trimToNull`:: Convert each argument to `null` if the string representation of its value is an empty string or contains only whitespace. All other values are passed through unchanged.
150+
Def:::
151+
- `from_field1 + from_field2 ^trimToNull{} ->` - Convert any empty or whitespace-only values to null while retaining the field names
Lines changed: 62 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,62 @@
1+
/*
2+
* Copyright (c) 2023-2025 Chris K Wensel <[email protected]>. All Rights Reserved.
3+
*
4+
* This Source Code Form is subject to the terms of the Mozilla Public
5+
* License, v. 2.0. If a copy of the MPL was not distributed with this
6+
* file, You can obtain one at http://mozilla.org/MPL/2.0/.
7+
*/
8+
9+
package io.clusterless.tessellate.operation;
10+
11+
import cascading.flow.FlowProcess;
12+
import cascading.operation.BaseOperation;
13+
import cascading.operation.Function;
14+
import cascading.operation.FunctionCall;
15+
import cascading.operation.OperationCall;
16+
import cascading.tuple.Fields;
17+
import cascading.tuple.Tuple;
18+
import cascading.tuple.TupleEntry;
19+
20+
public class TrimToNullFunction extends BaseOperation<Object[]> implements Function<Object[]> {
21+
22+
public TrimToNullFunction(Fields fieldDeclaration) {
23+
super(fieldDeclaration);
24+
}
25+
26+
@Override
27+
public void prepare(FlowProcess flowProcess, OperationCall<Object[]> operationCall) {
28+
Object[] context = new Object[]{
29+
new TupleEntry(fieldDeclaration, Tuple.size(fieldDeclaration.size())),
30+
new Object[fieldDeclaration.size()]
31+
};
32+
operationCall.setContext(context);
33+
}
34+
35+
@Override
36+
public void operate(FlowProcess flowProcess, FunctionCall<Object[]> functionCall) {
37+
Object[] context = functionCall.getContext();
38+
TupleEntry result = (TupleEntry) context[0];
39+
Object[] values = (Object[]) context[1];
40+
41+
TupleEntry arguments = functionCall.getArguments();
42+
for (int i = 0; i < arguments.size(); i++) {
43+
String value = arguments.getString(i);
44+
45+
if (value == null) {
46+
values[i] = null;
47+
} else {
48+
String intermediate = value.trim();
49+
50+
if (intermediate.isEmpty()) {
51+
values[i] = null;
52+
} else {
53+
values[i] = arguments.getObject(i);
54+
}
55+
}
56+
}
57+
58+
result.setCanonicalValues(values);
59+
60+
functionCall.getOutputCollector().add(result);
61+
}
62+
}

tessellate-main/src/main/java/io/clusterless/tessellate/pipeline/Intrinsics.java

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@ private static void add(IntrinsicBuilder intrinsicBuilder) {
2323

2424
static {
2525
add(new TsidIntrinsic());
26+
add(new TrimToNullIntrinsic());
2627
add(new SourcePathIntrinsic());
2728
add(new FixedWidthIntrinsic());
2829
add(new EnsureIntrinsic());
Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,42 @@
1+
/*
2+
* Copyright (c) 2023-2025 Chris K Wensel <[email protected]>. All Rights Reserved.
3+
*
4+
* This Source Code Form is subject to the terms of the Mozilla Public
5+
* License, v. 2.0. If a copy of the MPL was not distributed with this
6+
* file, You can obtain one at http://mozilla.org/MPL/2.0/.
7+
*/
8+
9+
package io.clusterless.tessellate.pipeline.intrinsic;
10+
11+
import cascading.tuple.Fields;
12+
import io.clusterless.tessellate.operation.TrimToNullFunction;
13+
import io.clusterless.tessellate.parser.ast.Operation;
14+
15+
public class TrimToNullIntrinsic extends IntrinsicBuilder {
16+
17+
public TrimToNullIntrinsic() {
18+
super("trimToNull");
19+
}
20+
21+
@Override
22+
public Result create(Fields currentFields, Operation operation) {
23+
Fields fromFields = fieldsParser().asFields(operation.arguments());
24+
25+
// doesn't make sense to make an empty json object
26+
if (fromFields.isNone()) {
27+
fromFields = Fields.ALL;
28+
}
29+
30+
Fields toFields = fieldsParser().asFields(operation.results());
31+
32+
if (toFields.isNone() && fromFields.isAll()) {
33+
toFields = currentFields;
34+
} else if (toFields.isNone()) {
35+
toFields = currentFields.select(fromFields);
36+
}
37+
38+
TrimToNullFunction function = new TrimToNullFunction(toFields);
39+
40+
return new Result(fromFields, function, toFields);
41+
}
42+
}
Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,36 @@
1+
/*
2+
* Copyright (c) 2023-2025 Chris K Wensel <[email protected]>. All Rights Reserved.
3+
*
4+
* This Source Code Form is subject to the terms of the Mozilla Public
5+
* License, v. 2.0. If a copy of the MPL was not distributed with this
6+
* file, You can obtain one at http://mozilla.org/MPL/2.0/.
7+
*/
8+
9+
package io.clusterless.tessellate.operation;
10+
11+
import cascading.CascadingTesting;
12+
import cascading.tuple.Fields;
13+
import cascading.tuple.Tuple;
14+
import cascading.tuple.TupleEntry;
15+
import cascading.tuple.TupleListCollector;
16+
import org.junit.jupiter.api.Assertions;
17+
import org.junit.jupiter.api.Test;
18+
19+
public class TrimToNullTest {
20+
@Test
21+
void test() {
22+
verify(new Tuple(null, null, null, null, null), new Tuple(null, null, null, null, null));
23+
verify(new Tuple("a", "", "\n\t", null, null), new Tuple("a", null, null, null, null));
24+
}
25+
26+
private static void verify(Tuple arguments, Tuple results) {
27+
TrimToNullFunction function = new TrimToNullFunction(Fields.size(results.size()));
28+
29+
try (TupleListCollector tuples = CascadingTesting.invokeFunction(function, arguments, Fields.RESULTS)) {
30+
TupleEntry next = tuples.entryIterator().next();
31+
32+
Assertions.assertEquals(results.size(), next.size());
33+
Assertions.assertEquals(results, next.getTuple());
34+
}
35+
}
36+
}

0 commit comments

Comments
 (0)