Skip to content

Commit 5f5181e

Browse files
authored
test(ai-builder): Integrate structural workflow comparison (#22209)
1 parent 1a7089d commit 5f5181e

22 files changed

+4243
-14
lines changed

packages/@n8n/ai-workflow-builder.ee/evaluations/chains/test-case-generator.ts

Lines changed: 14 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,20 @@
11
import type { BaseChatModel } from '@langchain/core/language_models/chat_models';
22
import { SystemMessage } from '@langchain/core/messages';
33
import { ChatPromptTemplate, HumanMessagePromptTemplate } from '@langchain/core/prompts';
4-
import { OperationalError } from 'n8n-workflow';
4+
import { readFileSync } from 'fs';
5+
import { jsonParse, OperationalError } from 'n8n-workflow';
6+
import { join } from 'path';
57
import { z } from 'zod';
68

9+
import type { SimpleWorkflow } from '../../src/types/workflow';
710
import type { TestCase } from '../types/evaluation';
811

12+
/**
 * Reads a reference workflow from the `reference-workflows` directory located
 * one level above this file's directory and parses it as JSON.
 *
 * The file is read synchronously. The parsed value is only typed as
 * `SimpleWorkflow`; no shape validation is performed here.
 *
 * @param filename - File name inside `reference-workflows`, e.g. `email-summary.json`.
 * @returns The parsed file contents.
 */
function loadReferenceWorkflow(filename: string): SimpleWorkflow {
	const workflowFile = join(__dirname, '..', 'reference-workflows', filename);
	const rawJson = readFileSync(workflowFile, 'utf-8');
	return jsonParse<SimpleWorkflow>(rawJson);
}
17+
918
const testCasesSchema = z.object({
1019
testCases: z.array(
1120
z.object({
@@ -105,6 +114,7 @@ export const basicTestCases: TestCase[] = [
105114
name: 'Summarize emails with AI',
106115
prompt:
107116
'Create an automation that runs on Monday mornings. It reads my Gmail inbox from the weekend, analyzes them with `gpt-4.1-mini` to find action items and priorities, and emails me a structured email using Gmail.',
117+
referenceWorkflow: loadReferenceWorkflow('email-summary.json'),
108118
},
109119
{
110120
id: 'ai-news-digest',
@@ -117,6 +127,7 @@ export const basicTestCases: TestCase[] = [
117127
name: 'Daily weather report',
118128
prompt:
119129
'Create an automation that checks the weather for my location every morning at 5 a.m using OpenWeather. Send me a short weather report by email using Gmail. Use OpenAI `gpt-4.1-mini` to write a short, fun formatted email body by adding personality when describing the weather and how the day might feel. Include all details relevant to decide on my plans and clothes for the day.',
130+
referenceWorkflow: loadReferenceWorkflow('daily-weather-report.json'),
120131
},
121132
{
122133
id: 'invoice-pipeline',
@@ -147,11 +158,13 @@ export const basicTestCases: TestCase[] = [
147158
name: 'Process large Google Sheets data',
148159
prompt:
149160
'Create a workflow that reads all rows from a Google Sheets document with thousands of customer records. For each row, call an external API to get additional customer data, process the response, and update the row with the enriched information. Handle rate limiting and errors gracefully.',
161+
referenceWorkflow: loadReferenceWorkflow('google-sheets-processing.json'),
150162
},
151163
{
152164
id: 'extract-from-file',
153165
name: 'Extract data from uploaded files',
154166
prompt:
155167
'Build a workflow that accepts file uploads through an n8n form. When users upload PDF documents, CSV files, or Excel spreadsheets, automatically extract the text content and data from these files. Transform the extracted data into a structured format and save it to a database or send it via email as a summary.',
168+
referenceWorkflow: loadReferenceWorkflow('extract-from-file.json'),
156169
},
157170
];

packages/@n8n/ai-workflow-builder.ee/evaluations/core/test-runner.ts

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -61,6 +61,7 @@ export function createErrorResult(testCase: TestCase, error: unknown): TestResul
6161
agentPrompt: { violations: [], score: 0 },
6262
tools: { violations: [], score: 0 },
6363
fromAi: { violations: [], score: 0 },
64+
similarity: null,
6465
},
6566
generationTime: 0,
6667
error: errorMessage,
@@ -107,10 +108,11 @@ export async function runSingleTest(
107108
userPrompt: testCase.prompt,
108109
generatedWorkflow,
109110
referenceWorkflow: testCase.referenceWorkflow,
111+
referenceWorkflows: testCase.referenceWorkflows,
110112
};
111113

112114
const evaluationResult = await evaluateWorkflow(llm, evaluationInput);
113-
const programmaticEvaluationResult = programmaticEvaluation(evaluationInput, nodeTypes);
115+
const programmaticEvaluationResult = await programmaticEvaluation(evaluationInput, nodeTypes);
114116

115117
return {
116118
testCase,

packages/@n8n/ai-workflow-builder.ee/evaluations/langsmith/evaluator.ts

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -122,7 +122,7 @@ export function createLangsmithEvaluator(
122122
const evaluationResult = await evaluateWorkflow(llm, evaluationInput);
123123

124124
// Run programmatic evaluation
125-
const programmaticResult = programmaticEvaluation(evaluationInput, parsedNodeTypes);
125+
const programmaticResult = await programmaticEvaluation(evaluationInput, parsedNodeTypes);
126126

127127
const results: LangsmithEvaluationResult[] = [];
128128

@@ -240,6 +240,11 @@ export function createLangsmithEvaluator(
240240
results.push(categoryToResult('programmatic.tools', programmaticResult.tools));
241241
results.push(categoryToResult('programmatic.fromAi', programmaticResult.fromAi));
242242

243+
// Add workflow similarity if available
244+
if (programmaticResult.similarity) {
245+
results.push(categoryToResult('programmatic.similarity', programmaticResult.similarity));
246+
}
247+
243248
return results;
244249
} catch (error) {
245250
const errorMessage = error instanceof Error ? error.message : String(error);

packages/@n8n/ai-workflow-builder.ee/evaluations/programmatic/evaluators/index.ts

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,3 +3,4 @@ export * from './connections';
33
export * from './from-ai';
44
export * from './tools';
55
export * from './trigger';
6+
export * from './workflow-similarity';
Lines changed: 287 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,287 @@
1+
import { mock } from 'jest-mock-extended';
2+
3+
import type { SimpleWorkflow } from '@/types';
4+
5+
import {
6+
evaluateWorkflowSimilarity,
7+
evaluateWorkflowSimilarityMultiple,
8+
} from './workflow-similarity';
9+
10+
// Replace Node built-ins with mocks. Although written below the imports, these
// jest.mock calls are expected to be hoisted so that they are registered before
// './workflow-similarity' is loaded.
11+
jest.mock('node:child_process', () => ({
12+
execFile: jest.fn(),
13+
}));
14+
15+
// Handle to the mock that the 'node:util' factory below hands out from promisify().
// It must be declared with `var`: the factory assigns to it, and because of
// jest.mock hoisting a let/const binding may not be initialised yet at that point.
16+
// eslint-disable-next-line no-var
17+
var mockExecFileAsync: jest.Mock;
18+
19+
jest.mock('node:util', () => {
20+
const mockFn = jest.fn();
21+
// Expose the same jest.fn to the tests so they can stub results and inspect calls
22+
mockExecFileAsync = mockFn;
23+
24+
return {
25+
// Every promisify() call returns this one mock, whatever function is passed in
promisify: jest.fn(() => mockFn),
26+
};
27+
});
28+
29+
// No factory is given here, so Jest substitutes its automatic mock for the module
jest.mock('node:fs/promises');
30+
31+
describe('evaluateWorkflowSimilarity', () => {
	/** One entry of `top_edits` in the comparison output fed to the mocked exec call. */
	type EditFixture = {
		type: string;
		description: string;
		cost: number;
		priority: string;
		node_name?: string;
	};

	/** Workflow under evaluation: a single manual trigger node. */
	const generatedWorkflow = mock<SimpleWorkflow>({
		name: 'Generated',
		nodes: [
			{
				id: '1',
				name: 'Trigger',
				type: 'n8n-nodes-base.manualTrigger',
				typeVersion: 1,
				position: [0, 0],
				parameters: {},
			},
		],
		connections: {},
	});

	/** Reference workflow: the same trigger plus a Code node. */
	const groundTruthWorkflow = mock<SimpleWorkflow>({
		name: 'Ground Truth',
		nodes: [
			{
				id: '1',
				name: 'Trigger',
				type: 'n8n-nodes-base.manualTrigger',
				typeVersion: 1,
				position: [0, 0],
				parameters: {},
			},
			{
				id: '2',
				name: 'Code',
				type: 'n8n-nodes-base.code',
				typeVersion: 1,
				position: [200, 0],
				parameters: {},
			},
		],
		connections: {},
	});

	/**
	 * Serialises a comparison result in the JSON shape these tests pass back as
	 * stdout from the mocked exec call. `max_possible_cost` is always 100 and
	 * `config_name` is always 'standard', as in every fixture of this suite.
	 */
	const buildComparisonOutput = (fixture: {
		score: number;
		editCost: number;
		edits?: EditFixture[];
		generatedNodes?: number;
		groundTruthNodes?: number;
	}): string => {
		const { score, editCost, edits = [], generatedNodes = 1, groundTruthNodes = 2 } = fixture;

		return JSON.stringify({
			similarity_score: score,
			edit_cost: editCost,
			max_possible_cost: 100,
			top_edits: edits,
			metadata: {
				generated_nodes: generatedNodes,
				ground_truth_nodes: groundTruthNodes,
				config_name: 'standard',
			},
		});
	};

	/** Makes the mocked exec call resolve with the given stdout and an empty stderr. */
	const stubExecSuccess = (stdout: string): void => {
		mockExecFileAsync.mockResolvedValue({ stdout, stderr: '' });
	};

	/** Makes the mocked exec call reject with an Error carrying the extra properties. */
	const stubExecFailure = (message: string, extra: Record<string, unknown>): void => {
		mockExecFileAsync.mockRejectedValue(Object.assign(new Error(message), extra));
	};

	/** Runs the function under test against the two shared fixtures. */
	const compareFixtures = async () =>
		await evaluateWorkflowSimilarity(generatedWorkflow, groundTruthWorkflow);

	beforeEach(() => {
		jest.clearAllMocks();
	});

	describe('successful evaluation', () => {
		it('should parse Python output and map violations correctly', async () => {
			stubExecSuccess(
				buildComparisonOutput({
					score: 0.75,
					editCost: 25,
					edits: [
						{
							type: 'node_insert',
							description: 'Missing node: Code',
							cost: 15,
							priority: 'major',
							node_name: 'Code',
						},
						{
							type: 'edge_delete',
							description: 'Extra connection from Trigger to Code',
							cost: 10,
							priority: 'minor',
						},
					],
				}),
			);

			const { score, violations } = await compareFixtures();

			expect(score).toBe(0.75);
			expect(violations).toHaveLength(2);
			expect(violations[0]).toEqual({
				name: 'workflow-similarity-node-insert',
				type: 'major',
				description: 'Missing node: Code',
				pointsDeducted: 15,
			});
			expect(violations[1]).toEqual({
				name: 'workflow-similarity-edge-delete',
				type: 'minor',
				description: 'Extra connection from Trigger to Code',
				pointsDeducted: 10,
			});
		});

		it('should handle all edit types correctly', async () => {
			stubExecSuccess(
				buildComparisonOutput({
					score: 0.5,
					editCost: 50,
					generatedNodes: 2,
					edits: [
						{ type: 'node_insert', description: 'Insert', cost: 10, priority: 'major' },
						{ type: 'node_delete', description: 'Delete', cost: 10, priority: 'major' },
						{ type: 'node_substitute', description: 'Substitute', cost: 10, priority: 'major' },
						{ type: 'edge_insert', description: 'Edge insert', cost: 5, priority: 'minor' },
						{ type: 'edge_delete', description: 'Edge delete', cost: 5, priority: 'minor' },
						{ type: 'edge_substitute', description: 'Edge substitute', cost: 10, priority: 'major' },
					],
				}),
			);

			const { violations } = await compareFixtures();

			// Violation names must follow the order of the edits in the output
			const expectedNames = [
				'workflow-similarity-node-insert',
				'workflow-similarity-node-delete',
				'workflow-similarity-node-substitute',
				'workflow-similarity-edge-insert',
				'workflow-similarity-edge-delete',
				'workflow-similarity-edge-substitute',
			];

			expect(violations).toHaveLength(6);
			expectedNames.forEach((expectedName, index) => {
				expect(violations[index].name).toBe(expectedName);
			});
		});

		it('should round cost values to integers', async () => {
			stubExecSuccess(
				buildComparisonOutput({
					score: 0.85,
					editCost: 15.7,
					edits: [
						{
							type: 'node_insert',
							description: 'Missing node',
							cost: 15.7,
							priority: 'major',
						},
					],
				}),
			);

			const { violations } = await compareFixtures();

			expect(violations[0].pointsDeducted).toBe(16);
		});
	});

	describe('error handling', () => {
		it('should handle uvx command not found error', async () => {
			stubExecFailure('Command not found', { code: 'ENOENT' });

			await expect(compareFixtures()).rejects.toThrow('uvx command not found');
		});

		it('should handle timeout error', async () => {
			stubExecFailure('Timeout', { killed: true });

			await expect(compareFixtures()).rejects.toThrow('Workflow comparison timed out');
		});

		it('should handle Python script errors with empty output', async () => {
			stubExecFailure('Python error', {
				stdout: '',
				stderr: 'Something went wrong',
				code: 1,
			});

			await expect(compareFixtures()).rejects.toThrow('Workflow similarity evaluation failed');
		});

		it('should accept non-zero exit code if Python outputs valid JSON', async () => {
			// The rejection still carries a parseable result on stdout
			stubExecFailure('Non-zero exit', {
				stdout: buildComparisonOutput({
					score: 0.3,
					editCost: 70,
					edits: [
						{
							type: 'node_delete',
							description: 'Major difference',
							cost: 70,
							priority: 'critical',
						},
					],
				}),
				stderr: 'Warning: similarity below threshold',
				code: 1,
			});

			const { score, violations } = await compareFixtures();

			expect(score).toBe(0.3);
			expect(violations).toHaveLength(1);
			expect(violations[0].name).toBe('workflow-similarity-node-delete');
		});
	});

	describe('evaluateWorkflowSimilarityMultiple', () => {
		it('should return result with highest similarity score', async () => {
			const referenceWorkflows = ['Ref1', 'Ref2', 'Ref3'].map((name) =>
				mock<SimpleWorkflow>({ name, nodes: [], connections: {} }),
			);

			let invocation = 0;
			mockExecFileAsync.mockImplementation(async () => {
				invocation += 1;
				// Only the second invocation reports the highest score
				const score = invocation === 2 ? 0.9 : 0.5;

				return {
					stdout: buildComparisonOutput({ score, editCost: 10, groundTruthNodes: 1 }),
					stderr: '',
				};
			});

			const best = await evaluateWorkflowSimilarityMultiple(generatedWorkflow, referenceWorkflows);

			expect(best.score).toBe(0.9);
			expect(mockExecFileAsync).toHaveBeenCalledTimes(3);
		});

		it('should throw error when no reference workflows provided', async () => {
			const attempt = evaluateWorkflowSimilarityMultiple(generatedWorkflow, []);

			await expect(attempt).rejects.toThrow('At least one reference workflow is required');
		});
	});
});

0 commit comments

Comments
 (0)