n8n-io
diff --git a/‎packages/@n8n/ai-workflow-builder.ee/evaluations/chains/test-case-generator.ts‎
Lines changed: 14 additions & 1 deletion b/‎packages/@n8n/ai-workflow-builder.ee/evaluations/chains/test-case-generator.ts‎
Lines changed: 14 additions & 1 deletion
diff --git a/‎packages/@n8n/ai-workflow-builder.ee/evaluations/core/test-runner.ts‎
Lines changed: 3 additions & 1 deletion b/‎packages/@n8n/ai-workflow-builder.ee/evaluations/core/test-runner.ts‎
Lines changed: 3 additions & 1 deletion
diff --git a/‎packages/@n8n/ai-workflow-builder.ee/evaluations/langsmith/evaluator.ts‎
Lines changed: 6 additions & 1 deletion b/‎packages/@n8n/ai-workflow-builder.ee/evaluations/langsmith/evaluator.ts‎
Lines changed: 6 additions & 1 deletion
diff --git a/‎packages/@n8n/ai-workflow-builder.ee/evaluations/programmatic/evaluators/index.ts‎
Lines changed: 1 addition & 0 deletions b/‎packages/@n8n/ai-workflow-builder.ee/evaluations/programmatic/evaluators/index.ts‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎packages/@n8n/ai-workflow-builder.ee/evaluations/programmatic/evaluators/workflow-similarity.test.ts‎
Lines changed: 287 additions & 0 deletions b/‎packages/@n8n/ai-workflow-builder.ee/evaluations/programmatic/evaluators/workflow-similarity.test.ts‎
Lines changed: 287 additions & 0 deletions
@@ -1,11 +1,20 @@
 import type { BaseChatModel } from '@langchain/core/language_models/chat_models';
 import { SystemMessage } from '@langchain/core/messages';
 import { ChatPromptTemplate, HumanMessagePromptTemplate } from '@langchain/core/prompts';
-import { OperationalError } from 'n8n-workflow';
+import { readFileSync } from 'fs';
+import { jsonParse, OperationalError } from 'n8n-workflow';
+import { join } from 'path';
 import { z } from 'zod';
 
+import type { SimpleWorkflow } from '../../src/types/workflow';
 import type { TestCase } from '../types/evaluation';
 
+// Loads a reference workflow: reads `filename` from the `reference-workflows` directory one level above this module's directory and parses it as JSON. Called from the basicTestCases initializer, so the files are read when this module is evaluated.
+function loadReferenceWorkflow(filename: string): SimpleWorkflow {
+	const path = join(__dirname, '..', 'reference-workflows', filename); // resolved relative to this file, not the process working directory
+	return jsonParse<SimpleWorkflow>(readFileSync(path, 'utf-8')); // synchronous read; NOTE(review): the SimpleWorkflow shape is only stated via the generic here — confirm whether jsonParse validates it
+}
+
 const testCasesSchema = z.object({
 	testCases: z.array(
 		z.object({
@@ -105,6 +114,7 @@ export const basicTestCases: TestCase[] = [
 		name: 'Summarize emails with AI',
 		prompt:
 			'Create an automation that runs on Monday mornings. It reads my Gmail inbox from the weekend, analyzes them with `gpt-4.1-mini` to find action items and priorities, and emails me a structured email using Gmail.',
+		referenceWorkflow: loadReferenceWorkflow('email-summary.json'),
 	},
 	{
 		id: 'ai-news-digest',
@@ -117,6 +127,7 @@ export const basicTestCases: TestCase[] = [
 		name: 'Daily weather report',
 		prompt:
 			'Create an automation that checks the weather for my location every morning at 5 a.m using OpenWeather. Send me a short weather report by email using Gmail. Use OpenAI `gpt-4.1-mini` to write a short, fun formatted email body by adding personality when describing the weather and how the day might feel. Include all details relevant to decide on my plans and clothes for the day.',
+		referenceWorkflow: loadReferenceWorkflow('daily-weather-report.json'),
 	},
 	{
 		id: 'invoice-pipeline',
@@ -147,11 +158,13 @@ export const basicTestCases: TestCase[] = [
 		name: 'Process large Google Sheets data',
 		prompt:
 			'Create a workflow that reads all rows from a Google Sheets document with thousands of customer records. For each row, call an external API to get additional customer data, process the response, and update the row with the enriched information. Handle rate limiting and errors gracefully.',
+		referenceWorkflow: loadReferenceWorkflow('google-sheets-processing.json'),
 	},
 	{
 		id: 'extract-from-file',
 		name: 'Extract data from uploaded files',
 		prompt:
 			'Build a workflow that accepts file uploads through an n8n form. When users upload PDF documents, CSV files, or Excel spreadsheets, automatically extract the text content and data from these files. Transform the extracted data into a structured format and save it to a database or send it via email as a summary.',
+		referenceWorkflow: loadReferenceWorkflow('extract-from-file.json'),
 	},
 ];
@@ -61,6 +61,7 @@ export function createErrorResult(testCase: TestCase, error: unknown): TestResul
 			agentPrompt: { violations: [], score: 0 },
 			tools: { violations: [], score: 0 },
 			fromAi: { violations: [], score: 0 },
+			similarity: null,
 		},
 		generationTime: 0,
 		error: errorMessage,
@@ -107,10 +108,11 @@ export async function runSingleTest(
 			userPrompt: testCase.prompt,
 			generatedWorkflow,
 			referenceWorkflow: testCase.referenceWorkflow,
+			referenceWorkflows: testCase.referenceWorkflows,
 		};
 
 		const evaluationResult = await evaluateWorkflow(llm, evaluationInput);
-		const programmaticEvaluationResult = programmaticEvaluation(evaluationInput, nodeTypes);
+		const programmaticEvaluationResult = await programmaticEvaluation(evaluationInput, nodeTypes);
 
 		return {
 			testCase,
 
@@ -122,7 +122,7 @@ export function createLangsmithEvaluator(
 			const evaluationResult = await evaluateWorkflow(llm, evaluationInput);
 
 			// Run programmatic evaluation
-			const programmaticResult = programmaticEvaluation(evaluationInput, parsedNodeTypes);
+			const programmaticResult = await programmaticEvaluation(evaluationInput, parsedNodeTypes);
 
 			const results: LangsmithEvaluationResult[] = [];
 
@@ -240,6 +240,11 @@ export function createLangsmithEvaluator(
 			results.push(categoryToResult('programmatic.tools', programmaticResult.tools));
 			results.push(categoryToResult('programmatic.fromAi', programmaticResult.fromAi));
 
+			// Add workflow similarity if available
+			if (programmaticResult.similarity) {
+				results.push(categoryToResult('programmatic.similarity', programmaticResult.similarity));
+			}
+
 			return results;
 		} catch (error) {
 			const errorMessage = error instanceof Error ? error.message : String(error);
 
@@ -3,3 +3,4 @@ export * from './connections';
 export * from './from-ai';
 export * from './tools';
 export * from './trigger';
+export * from './workflow-similarity';
@@ -0,0 +1,287 @@
+import { mock } from 'jest-mock-extended';
+
+import type { SimpleWorkflow } from '@/types';
+
+import {
+	evaluateWorkflowSimilarity,
+	evaluateWorkflowSimilarityMultiple,
+} from './workflow-similarity';
+
+// Mock the Node built-ins used by the module under test. NOTE(review): these jest.mock calls sit below the import statements and rely on Jest hoisting them above the imports — confirm the transform in use does so
+jest.mock('node:child_process', () => ({
+	execFile: jest.fn(), // inert stand-in; the tests never configure or assert on this mock directly
+}));
+
+// Handle to the stub that the mocked promisify returns; it is assigned inside the node:util factory below - must use var for proper hoisting with jest.mock
+// eslint-disable-next-line no-var
+var mockExecFileAsync: jest.Mock;
+
+jest.mock('node:util', () => {
+	const mockFn = jest.fn();
+	// Store reference so tests can configure resolved/rejected values and inspect calls
+	mockExecFileAsync = mockFn;
+
+	return {
+		promisify: jest.fn(() => mockFn), // every promisify(...) call yields the same stub; NOTE(review): promisify is the only node:util export this factory provides
+	};
+});
+
+jest.mock('node:fs/promises'); // no factory supplied, so Jest auto-mocks the module
+
+describe('evaluateWorkflowSimilarity', () => { // every case feeds a canned result or error through mockExecFileAsync
+	const generatedWorkflow = mock<SimpleWorkflow>({ // fixture: a single manual-trigger node, no connections
+		name: 'Generated',
+		nodes: [
+			{
+				id: '1',
+				name: 'Trigger',
+				type: 'n8n-nodes-base.manualTrigger',
+				typeVersion: 1,
+				position: [0, 0],
+				parameters: {},
+			},
+		],
+		connections: {},
+	});
+
+	const groundTruthWorkflow = mock<SimpleWorkflow>({ // fixture: the same trigger plus a Code node, no connections
+		name: 'Ground Truth',
+		nodes: [
+			{
+				id: '1',
+				name: 'Trigger',
+				type: 'n8n-nodes-base.manualTrigger',
+				typeVersion: 1,
+				position: [0, 0],
+				parameters: {},
+			},
+			{
+				id: '2',
+				name: 'Code',
+				type: 'n8n-nodes-base.code',
+				typeVersion: 1,
+				position: [200, 0],
+				parameters: {},
+			},
+		],
+		connections: {},
+	});
+
+	beforeEach(() => {
+		jest.clearAllMocks(); // clears recorded calls between tests, so the call-count assertion further down is per-test
+	});
+
+	describe('successful evaluation', () => {
+		it('should parse Python output and map violations correctly', async () => {
+			const mockPythonOutput = JSON.stringify({
+				similarity_score: 0.75,
+				edit_cost: 25,
+				max_possible_cost: 100,
+				top_edits: [
+					{
+						type: 'node_insert',
+						description: 'Missing node: Code',
+						cost: 15,
+						priority: 'major',
+						node_name: 'Code', // not part of the violation object expected below
+					},
+					{
+						type: 'edge_delete',
+						description: 'Extra connection from Trigger to Code',
+						cost: 10,
+						priority: 'minor',
+					},
+				],
+				metadata: {
+					generated_nodes: 1,
+					ground_truth_nodes: 2,
+					config_name: 'standard',
+				},
+			});
+
+			mockExecFileAsync.mockResolvedValue({ stdout: mockPythonOutput, stderr: '' }); // simulate a run that resolves with the JSON report on stdout
+
+			const result = await evaluateWorkflowSimilarity(generatedWorkflow, groundTruthWorkflow);
+
+			expect(result.score).toBe(0.75); // score is expected to equal similarity_score
+			expect(result.violations).toHaveLength(2);
+			expect(result.violations[0]).toEqual({ // expected mapping: type 'node_insert' -> name suffix 'node-insert', priority -> type, cost -> pointsDeducted
+				name: 'workflow-similarity-node-insert',
+				type: 'major',
+				description: 'Missing node: Code',
+				pointsDeducted: 15,
+			});
+			expect(result.violations[1]).toEqual({
+				name: 'workflow-similarity-edge-delete',
+				type: 'minor',
+				description: 'Extra connection from Trigger to Code',
+				pointsDeducted: 10,
+			});
+		});
+
+		it('should handle all edit types correctly', async () => {
+			const mockPythonOutput = JSON.stringify({
+				similarity_score: 0.5,
+				edit_cost: 50,
+				max_possible_cost: 100,
+				top_edits: [
+					{ type: 'node_insert', description: 'Insert', cost: 10, priority: 'major' },
+					{ type: 'node_delete', description: 'Delete', cost: 10, priority: 'major' },
+					{ type: 'node_substitute', description: 'Substitute', cost: 10, priority: 'major' },
+					{ type: 'edge_insert', description: 'Edge insert', cost: 5, priority: 'minor' },
+					{ type: 'edge_delete', description: 'Edge delete', cost: 5, priority: 'minor' },
+					{ type: 'edge_substitute', description: 'Edge substitute', cost: 10, priority: 'major' },
+				],
+				metadata: {
+					generated_nodes: 2,
+					ground_truth_nodes: 2,
+					config_name: 'standard',
+				},
+			});
+
+			mockExecFileAsync.mockResolvedValue({ stdout: mockPythonOutput, stderr: '' });
+
+			const result = await evaluateWorkflowSimilarity(generatedWorkflow, groundTruthWorkflow);
+
+			expect(result.violations).toHaveLength(6); // violations are expected in the same order as top_edits
+			expect(result.violations[0].name).toBe('workflow-similarity-node-insert');
+			expect(result.violations[1].name).toBe('workflow-similarity-node-delete');
+			expect(result.violations[2].name).toBe('workflow-similarity-node-substitute');
+			expect(result.violations[3].name).toBe('workflow-similarity-edge-insert');
+			expect(result.violations[4].name).toBe('workflow-similarity-edge-delete');
+			expect(result.violations[5].name).toBe('workflow-similarity-edge-substitute');
+		});
+
+		it('should round cost values to integers', async () => {
+			const mockPythonOutput = JSON.stringify({
+				similarity_score: 0.85,
+				edit_cost: 15.7,
+				max_possible_cost: 100,
+				top_edits: [
+					{
+						type: 'node_insert',
+						description: 'Missing node',
+						cost: 15.7,
+						priority: 'major',
+					},
+				],
+				metadata: {
+					generated_nodes: 1,
+					ground_truth_nodes: 2,
+					config_name: 'standard',
+				},
+			});
+
+			mockExecFileAsync.mockResolvedValue({ stdout: mockPythonOutput, stderr: '' });
+
+			const result = await evaluateWorkflowSimilarity(generatedWorkflow, groundTruthWorkflow);
+
+			expect(result.violations[0].pointsDeducted).toBe(16); // a cost of 15.7 is expected to become 16
+		});
+	});
+
+	describe('error handling', () => {
+		it('should handle uvx command not found error', async () => {
+			const error = Object.assign(new Error('Command not found'), { code: 'ENOENT' }); // rejection tagged with code 'ENOENT'
+			mockExecFileAsync.mockRejectedValue(error);
+
+			await expect(
+				evaluateWorkflowSimilarity(generatedWorkflow, groundTruthWorkflow),
+			).rejects.toThrow('uvx command not found');
+		});
+
+		it('should handle timeout error', async () => {
+			const error = Object.assign(new Error('Timeout'), { killed: true }); // killed: true is expected to be reported as a timeout
+			mockExecFileAsync.mockRejectedValue(error);
+
+			await expect(
+				evaluateWorkflowSimilarity(generatedWorkflow, groundTruthWorkflow),
+			).rejects.toThrow('Workflow comparison timed out');
+		});
+
+		it('should handle Python script errors with empty output', async () => {
+			const error = Object.assign(new Error('Python error'), { // rejection with empty stdout, so there is no report to parse
+				stdout: '',
+				stderr: 'Something went wrong',
+				code: 1,
+			});
+			mockExecFileAsync.mockRejectedValue(error);
+
+			await expect(
+				evaluateWorkflowSimilarity(generatedWorkflow, groundTruthWorkflow),
+			).rejects.toThrow('Workflow similarity evaluation failed');
+		});
+
+		it('should accept non-zero exit code if Python outputs valid JSON', async () => {
+			const mockPythonOutput = JSON.stringify({
+				similarity_score: 0.3,
+				edit_cost: 70,
+				max_possible_cost: 100,
+				top_edits: [
+					{
+						type: 'node_delete',
+						description: 'Major difference',
+						cost: 70,
+						priority: 'critical',
+					},
+				],
+				metadata: {
+					generated_nodes: 1,
+					ground_truth_nodes: 2,
+					config_name: 'standard',
+				},
+			});
+
+			const error = Object.assign(new Error('Non-zero exit'), { // rejection that still carries a parsable report in stdout
+				stdout: mockPythonOutput,
+				stderr: 'Warning: similarity below threshold',
+				code: 1,
+			});
+			mockExecFileAsync.mockRejectedValue(error);
+
+			const result = await evaluateWorkflowSimilarity(generatedWorkflow, groundTruthWorkflow);
+
+			expect(result.score).toBe(0.3); // the report from the rejected error's stdout is expected to be used
+			expect(result.violations).toHaveLength(1);
+			expect(result.violations[0].name).toBe('workflow-similarity-node-delete');
+		});
+	});
+
+	describe('evaluateWorkflowSimilarityMultiple', () => { // nested in the outer suite, so the shared fixtures and beforeEach apply here too
+		it('should return result with highest similarity score', async () => {
+			const referenceWorkflows = [
+				mock<SimpleWorkflow>({ name: 'Ref1', nodes: [], connections: {} }),
+				mock<SimpleWorkflow>({ name: 'Ref2', nodes: [], connections: {} }),
+				mock<SimpleWorkflow>({ name: 'Ref3', nodes: [], connections: {} }),
+			];
+
+			let callCount = 0;
+			mockExecFileAsync.mockImplementation(async () => {
+				callCount++;
+				const score = callCount === 2 ? 0.9 : 0.5; // Second call has highest score
+				const mockOutput = JSON.stringify({
+					similarity_score: score,
+					edit_cost: 10,
+					max_possible_cost: 100,
+					top_edits: [],
+					metadata: { generated_nodes: 1, ground_truth_nodes: 1, config_name: 'standard' },
+				});
+				return { stdout: mockOutput, stderr: '' };
+			});
+
+			const result = await evaluateWorkflowSimilarityMultiple(
+				generatedWorkflow,
+				referenceWorkflows,
+			);
+
+			expect(result.score).toBe(0.9); // the best of the three stubbed scores is expected to win
+			expect(mockExecFileAsync).toHaveBeenCalledTimes(3); // one stubbed call per reference workflow
+		});
+
+		it('should throw error when no reference workflows provided', async () => {
+			await expect(evaluateWorkflowSimilarityMultiple(generatedWorkflow, [])).rejects.toThrow( // an empty reference list is expected to reject
+				'At least one reference workflow is required',
+			);
+		});
+	});
+});