From 58b265172edfdd4f312d201d36dee881f6c6f37f Mon Sep 17 00:00:00 2001 From: aliyanishfaq Date: Wed, 27 Aug 2025 15:33:27 -0700 Subject: [PATCH 1/2] feat: evaluation code added --- apps/open-swe/evals/evaluator.ts | 235 ++++++++++++++- apps/open-swe/evals/langgraph.eval.ts | 343 +++++++++++++--------- apps/open-swe/evals/open-swe-types.ts | 16 + apps/open-swe/evals/prompts.ts | 158 +++++++++- apps/open-swe/evals/tests.ts | 75 ++++- apps/open-swe/evals/utils/dataset.ts | 408 ++++++++++++++++++++++++++ apps/open-swe/evals/utils/retry.ts | 2 +- 7 files changed, 1084 insertions(+), 153 deletions(-) create mode 100644 apps/open-swe/evals/utils/dataset.ts diff --git a/apps/open-swe/evals/evaluator.ts b/apps/open-swe/evals/evaluator.ts index d6fd19b5b..bcb1c360d 100644 --- a/apps/open-swe/evals/evaluator.ts +++ b/apps/open-swe/evals/evaluator.ts @@ -8,13 +8,141 @@ import { TargetRepository } from "@openswe/shared/open-swe/types"; import { cloneRepo } from "../src/utils/github/git.js"; import { getRepoAbsolutePath } from "@openswe/shared/git"; import { SimpleEvaluationResult } from "langsmith/vitest"; -import { runRuffLint, runMyPyTypeCheck } from "./tests.js"; -import { setupEnv, ENV_CONSTANTS } from "../src/utils/env-setup.js"; +import { + runRuffLint, + runMyPyTypeCheck, + runLangGraphEvaluation, +} from "./tests.js"; +import { writeFile } from "../src/utils/read-write.js"; +import * as fs from "fs"; +import * as path from "path"; const logger = createLogger(LogLevel.INFO, "Evaluator "); -// Use shared constants from env-setup utility -const { RUN_PYTHON_IN_VENV } = ENV_CONSTANTS; +const VENV_PATH = ".venv"; +const RUN_PYTHON_IN_VENV = `${VENV_PATH}/bin/python`; +const RUN_PIP_IN_VENV = `${VENV_PATH}/bin/pip`; + +/** + * Setup Python environment with requirements.txt + ruff + mypy + */ +async function setupEnv( + sandbox: Sandbox, + absoluteRepoDir: string, + envVars?: Record, +): Promise { + logger.info("Setting up Python environment..."); + + const createVenvCommand = 
"python -m venv .venv"; + const createVenvRes = await sandbox.process.executeCommand( + createVenvCommand, + absoluteRepoDir, + envVars, + TIMEOUT_SEC, + ); + if (createVenvRes.exitCode !== 0) { + logger.error("Failed to create virtual environment", { + createVenvCommand, + createVenvRes, + }); + return false; + } + + const upgradePipRes = await sandbox.process.executeCommand( + `${RUN_PIP_IN_VENV} install --upgrade pip`, + absoluteRepoDir, + envVars, + TIMEOUT_SEC, + ); + if (upgradePipRes.exitCode !== 0) { + logger.warn("Failed to upgrade pip, continuing anyway", { upgradePipRes }); + } + + const requirementsExistRes = await sandbox.process.executeCommand( + "test -f requirements.txt", + absoluteRepoDir, + envVars, + TIMEOUT_SEC, + ); + + if (requirementsExistRes.exitCode === 0) { + logger.info("Found requirements.txt, installing..."); + const installReqRes = await sandbox.process.executeCommand( + `${RUN_PIP_IN_VENV} install -r requirements.txt`, + absoluteRepoDir, + envVars, + TIMEOUT_SEC * 3, + ); + if (installReqRes.exitCode !== 0) { + logger.warn("Failed to install requirements.txt, continuing anyway", { + installReqRes, + }); + } + } else { + logger.info("No requirements.txt found, skipping repository dependencies"); + } + + // Install evaluation-specific dependencies + logger.info("Installing evaluation dependencies..."); + const installEvalDepsRes = await sandbox.process.executeCommand( + `${RUN_PIP_IN_VENV} install langchain langchain-core langchain-openai pydantic openai`, + absoluteRepoDir, + envVars, + TIMEOUT_SEC * 2, + ); + if (installEvalDepsRes.exitCode !== 0) { + logger.warn( + "Failed to install evaluation dependencies, continuing anyway", + { + installEvalDepsRes, + }, + ); + } + + const installAnalysisToolsRes = await sandbox.process.executeCommand( + `${RUN_PIP_IN_VENV} install ruff mypy`, + absoluteRepoDir, + envVars, + TIMEOUT_SEC, + ); + if (installAnalysisToolsRes.exitCode !== 0) { + logger.error("Failed to install ruff and mypy", { + 
installAnalysisToolsRes, + }); + return false; + } + + logger.info("Copying LangGraph evaluation script to sandbox..."); + try { + const evalScriptPath = path.join( + __dirname, + "scripts", + "langgraph_check.py", + ); + const evalScriptContent = fs.readFileSync(evalScriptPath, "utf8"); + + const { success: copyScriptSuccess, output: copyScriptOutput } = + await writeFile({ + sandbox, + filePath: "langgraph_check.py", + content: evalScriptContent, + workDir: absoluteRepoDir, + }); + + if (!copyScriptSuccess) { + logger.warn("Failed to copy LangGraph evaluation script", { + copyScriptOutput, + }); + } else { + logger.info("Successfully copied LangGraph evaluation script to sandbox"); + } + } catch (error) { + logger.warn("Error copying LangGraph evaluation script", { error }); + } + + logger.info("Environment setup completed successfully"); + return true; +} /** * Runs ruff and mypy analysis on all Python files in the repository @@ -22,16 +150,25 @@ const { RUN_PYTHON_IN_VENV } = ENV_CONSTANTS; async function runCodeTests( sandbox: Sandbox, absoluteRepoDir: string, -): Promise<{ ruffScore: number; mypyScore: number; details: CodeTestDetails }> { + openSWEInputs: OpenSWEInput, + envVars?: Record, +): Promise<{ + ruffScore: number; + mypyScore: number; + langGraphScore: number; + details: CodeTestDetails; +}> { logger.info("Running code analysis on all Python files in repository"); const testResults: { ruffScore: number; mypyScore: number; + langGraphScore: number; details: CodeTestDetails; } = { ruffScore: 0, mypyScore: 0, + langGraphScore: 0, details: { ruff: { issues: [], @@ -41,20 +178,30 @@ async function runCodeTests( issues: [], error: null, }, + langGraph: { + explanation: "", + error: null, + }, }, }; - const [ruffLint, mypyCheck] = await Promise.all([ + const [ruffLint, mypyCheck, langGraphEval] = await Promise.all([ runRuffLint(sandbox, { command: `${RUN_PYTHON_IN_VENV} -m ruff check . 
--output-format=json`, workingDir: absoluteRepoDir, - env: undefined, + env: envVars, timeoutSec: TIMEOUT_SEC * 3, }), runMyPyTypeCheck(sandbox, { command: `${RUN_PYTHON_IN_VENV} -m mypy . --no-error-summary --show-error-codes --no-color-output`, workingDir: absoluteRepoDir, - env: undefined, + env: envVars, + timeoutSec: TIMEOUT_SEC * 3, + }), + runLangGraphEvaluation(sandbox, { + command: `${RUN_PYTHON_IN_VENV} -m langgraph_check agent.py -i "${openSWEInputs.test_input}" -g "${openSWEInputs.ground_truth}"`, + workingDir: absoluteRepoDir, + env: envVars, timeoutSec: TIMEOUT_SEC * 3, }), ]); @@ -62,6 +209,7 @@ async function runCodeTests( Object.assign(testResults, { ruffScore: ruffLint.ruffScore, mypyScore: mypyCheck.mypyScore, + langGraphScore: langGraphEval.langGraphScore, details: { ruff: { issues: ruffLint.issues, @@ -71,14 +219,20 @@ async function runCodeTests( issues: mypyCheck.issues, error: mypyCheck.error, }, + langGraph: { + explanation: langGraphEval.explanation, + error: langGraphEval.error, + }, }, }); logger.info("Code tests completed", { ruffScore: testResults.ruffScore, mypyScore: testResults.mypyScore, + langGraphScore: testResults.langGraphScore, ruffIssues: testResults.details.ruff.issues.length, mypyIssues: testResults.details.mypy.issues.length, + langGraphExplanation: testResults.details.langGraph.explanation, }); return testResults; @@ -120,7 +274,49 @@ export async function evaluator(inputs: { const absoluteRepoDir = getRepoAbsolutePath(output.targetRepository); - const envSetupSuccess = await setupEnv(sandbox, absoluteRepoDir); + const envVars: Record = {}; + const apiKeys = [ + "OPENAI_API_KEY", + "ANTHROPIC_API_KEY", + "LANGCHAIN_API_KEY", + "LANGCHAIN_TRACING_V2", + "LANGCHAIN_PROJECT", + "GOOGLE_API_KEY", + "TAVILY_API_KEY", + ]; + + apiKeys.forEach((key) => { + if (process.env[key]) { + envVars[key] = process.env[key]!; + logger.info(`Added environment variable: ${key}`); + } + }); + + logger.info( + `Syncing to latest state of 
solution branch: ${solutionBranch}`, + ); + + const updateBranchRes = await sandbox.process.executeCommand( + `git fetch origin ${solutionBranch} && git checkout -B ${solutionBranch} FETCH_HEAD`, + absoluteRepoDir, + envVars, + TIMEOUT_SEC, + ); + + if (updateBranchRes.exitCode !== 0) { + logger.error("Failed to update solution branch", { + solutionBranch, + updateResult: updateBranchRes, + }); + throw new Error(`Failed to update solution branch: ${solutionBranch}`); + } + + logger.info("Git update result:", { + exitCode: updateBranchRes.exitCode, + result: updateBranchRes.result, + }); + + const envSetupSuccess = await setupEnv(sandbox, absoluteRepoDir, envVars); if (!envSetupSuccess) { logger.error("Failed to setup environment"); return [ @@ -131,14 +327,24 @@ export async function evaluator(inputs: { ]; } - const analysisResult = await runCodeTests(sandbox, absoluteRepoDir); + const analysisResult = await runCodeTests( + sandbox, + absoluteRepoDir, + openSWEInputs, + envVars, + ); - const overallScore = analysisResult.ruffScore + analysisResult.mypyScore; + const overallScore = + analysisResult.ruffScore + + analysisResult.mypyScore + + analysisResult.langGraphScore; logger.info("Evaluation completed", { overallScore, ruffScore: analysisResult.ruffScore, mypyScore: analysisResult.mypyScore, + langGraphScore: analysisResult.langGraphScore, + langGraphExplanation: analysisResult.details.langGraph.explanation, repo: openSWEInputs.repo, originalBranch: openSWEInputs.branch, solutionBranch, @@ -152,10 +358,17 @@ export async function evaluator(inputs: { { key: "ruff-score", score: analysisResult.ruffScore, + comment: analysisResult.details.ruff.issues.join("\n"), }, { key: "mypy-score", score: analysisResult.mypyScore, + comment: analysisResult.details.mypy.issues.join("\n"), + }, + { + key: "langgraph-score", + score: analysisResult.langGraphScore, + comment: analysisResult.details.langGraph.explanation, }, ]; } catch (error) { diff --git 
a/apps/open-swe/evals/langgraph.eval.ts b/apps/open-swe/evals/langgraph.eval.ts index 954e5f8d7..13fd03005 100644 --- a/apps/open-swe/evals/langgraph.eval.ts +++ b/apps/open-swe/evals/langgraph.eval.ts @@ -12,10 +12,16 @@ import { ManagerGraphState } from "@openswe/shared/open-swe/manager/types"; import { PlannerGraphState } from "@openswe/shared/open-swe/planner/types"; import { GraphState } from "@openswe/shared/open-swe/types"; import { withRetry } from "./utils/retry.js"; +import { DATASET } from "./utils/dataset.js"; const logger = createLogger(LogLevel.DEBUG, "Evaluator"); -const DATASET_NAME = process.env.DATASET_NAME || ""; +// Configuration constants +const RUN_AGENT_PIPELINE = process.env.RUN_AGENT_PIPELINE === "true" || true; +let programmerRunUrl = + "https://smith.langchain.com/o/ebbaf2eb-769b-4505-aca2-d11de10372a4/projects/p/eba94921-7f40-4be0-b153-e88ab6fdcfdd/r/"; +const DATASET_NAME = + process.env.DATASET_NAME || "aliyan-open-swe-langgraph-eval"; // const RUN_NAME = `${DATASET_NAME}-${new Date().toISOString().replace(/[:.]/g, '-')}`; // async function loadDataset(): Promise { @@ -37,34 +43,6 @@ const DATASET_NAME = process.env.DATASET_NAME || ""; // })), // ); -const DATASET = [ - { - inputs: { - repo: "mai-sandbox/open-swe_content_team_eval", - branch: "main", - user_input: `I have implemented a multi-agent content creation system using LangGraph that orchestrates collaboration between specialized agents. The system is experiencing multiple runtime errors and workflow failures that prevent proper execution. 
- -System Architecture -The application implements a three-agent architecture: - -Research Agent: Utilizes web search tools to gather information on specified topics -Writer Agent: Creates content based on research findings with creative temperature settings -Reviewer Agent: Provides feedback using fact-checking tools and determines revision needs - -Expected Workflow -User Request → Research Agent → Writer Agent → Reviewer Agent → [Revision Loop if needed] → Final Content - -Current Issues - -Runtime Errors: Application fails to start with import and graph compilation errors -Agent Handoff Failures: Agents are not properly transferring control and context -Tool Integration Problems: Tool calling mechanisms are not functioning correctly -State Management Issues: Shared state is not being updated correctly across agent transitions -Routing Logic Failures: Conditional edges and workflow routing are broken`, - }, - }, -]; - logger.info(`Starting evals over ${DATASET.length} examples...`); //const LANGGRAPH_URL = process.env.LANGGRAPH_URL || "http://localhost:2024"; @@ -102,116 +80,211 @@ ls.describe(DATASET_NAME, () => { repo: inputs.repo, }); - // Run the agent with user input - let managerRun; - try { - managerRun = await withRetry(() => - lgClient.runs.wait(threadId, MANAGER_GRAPH_ID, { - input, - config: { - recursion_limit: 250, + let branchName: string; + + if (RUN_AGENT_PIPELINE) { + logger.info("Running full agent pipeline..."); + + // Run the agent with user input + let managerRun; + try { + managerRun = await withRetry(() => + lgClient.runs.wait(threadId, MANAGER_GRAPH_ID, { + input, + config: { + recursion_limit: 400, + }, + ifNotExists: "create", + }), + ); + } catch (error) { + logger.error("Error in manager run", { + thread_id: threadId, + error: + error instanceof Error + ? 
{ + message: error.message, + stack: error.stack, + name: error.name, + cause: error.cause, + } + : error, + }); + return { + ruffScore: 0, + mypyScore: 0, + langGraphScore: 0, + details: { + ruff: { issues: [], error: "Error in manager run" }, + mypy: { issues: [], error: "Error in manager run" }, + langGraph: { explanation: "", error: "Error in manager run" }, }, - ifNotExists: "create", - }), - ); - } catch (error) { - logger.error("Error in manager run", { - thread_id: threadId, - error: - error instanceof Error - ? { - message: error.message, - stack: error.stack, - name: error.name, - cause: error.cause, - } - : error, - }); - return; // instead of skipping, we should award 0 points - } - - const managerState = managerRun as unknown as ManagerGraphState; - const plannerSession = managerState?.plannerSession; - - if (!plannerSession) { - logger.info("Agent did not create a planner session", { - thread_id: threadId, - }); - return; // instead of skipping, we should award 0 points - } - - let plannerRun; - try { - plannerRun = await withRetry(() => - lgClient.runs.join(plannerSession.threadId, plannerSession.runId), - ); - } catch (error) { - logger.error("Error joining planner run", { - thread_id: threadId, - plannerSession, - error: - error instanceof Error - ? 
{ - message: error.message, - stack: error.stack, - name: error.name, - cause: error.cause, - } - : error, - }); - return; // instead of skipping, we should award 0 points - } - - // Type-safe access to planner run state - const plannerState = plannerRun as unknown as PlannerGraphState; - const programmerSession = plannerState?.programmerSession; - - if (!programmerSession) { - logger.info("Agent did not create a programmer session", { - thread_id: threadId, - }); - return; // instead of skipping, we should award 0 points - } + }; + } + + const managerState = managerRun as unknown as ManagerGraphState; + const plannerSession = managerState?.plannerSession; + + if (!plannerSession) { + logger.info("Agent did not create a planner session", { + thread_id: threadId, + }); + return { + ruffScore: 0, + mypyScore: 0, + langGraphScore: 0, + details: { + ruff: { + issues: [], + error: "Agent did not create a planner session", + }, + mypy: { + issues: [], + error: "Agent did not create a planner session", + }, + langGraph: { + explanation: "", + error: "Agent did not create a planner session", + }, + }, + }; + } + + let plannerRun; + try { + plannerRun = await withRetry(() => + lgClient.runs.join(plannerSession.threadId, plannerSession.runId), + ); + } catch (error) { + logger.error("Error joining planner run", { + thread_id: threadId, + plannerSession, + error: + error instanceof Error + ? 
{ + message: error.message, + stack: error.stack, + name: error.name, + cause: error.cause, + } + : error, + }); + return { + ruffScore: 0, + mypyScore: 0, + langGraphScore: 0, + details: { + ruff: { issues: [], error: "Error joining planner run" }, + mypy: { issues: [], error: "Error joining planner run" }, + langGraph: { + explanation: "", + error: "Error joining planner run", + }, + }, + }; + } + + // Type-safe access to planner run state + const plannerState = plannerRun as unknown as PlannerGraphState; + const programmerSession = plannerState?.programmerSession; + + if (!programmerSession) { + logger.info("Agent did not create a programmer session", { + thread_id: threadId, + }); + return { + ruffScore: 0, + mypyScore: 0, + langGraphScore: 0, + details: { + ruff: { + issues: [], + error: "Agent did not create a programmer session", + }, + mypy: { + issues: [], + error: "Agent did not create a programmer session", + }, + langGraph: { + explanation: "", + error: "Agent did not create a programmer session", + }, + }, + }; + } + + let programmerRun; + programmerRunUrl = `${programmerRunUrl}${programmerSession.runId}`; + try { + programmerRun = await withRetry(() => + lgClient.runs.join( + programmerSession.threadId, + programmerSession.runId, + ), + ); + } catch (error) { + logger.error("Error joining programmer run", { + thread_id: threadId, + programmerSession, + error: + error instanceof Error + ? 
{ + message: error.message, + stack: error.stack, + name: error.name, + cause: error.cause, + } + : error, + }); + return { + ruffScore: 0, + mypyScore: 0, + langGraphScore: 0, + details: { + ruff: { issues: [], error: "Error joining programmer run" }, + mypy: { issues: [], error: "Error joining programmer run" }, + langGraph: { + explanation: "", + error: "Error joining programmer run", + }, + }, + }; + } + + const programmerState = programmerRun as unknown as GraphState; + const agentBranchName = programmerState?.branchName; + + if (!agentBranchName) { + logger.info("Agent did not create a branch", { + thread_id: threadId, + }); + return { + ruffScore: 0, + mypyScore: 0, + langGraphScore: 0, + details: { + ruff: { issues: [], error: "Agent did not create a branch" }, + mypy: { issues: [], error: "Agent did not create a branch" }, + langGraph: { + explanation: "", + error: "Agent did not create a branch", + }, + }, + }; + } - let programmerRun; - try { - programmerRun = await withRetry(() => - lgClient.runs.join( - programmerSession.threadId, - programmerSession.runId, - ), - ); - } catch (error) { - logger.error("Error joining programmer run", { - thread_id: threadId, - programmerSession, - error: - error instanceof Error - ? { - message: error.message, - stack: error.stack, - name: error.name, - cause: error.cause, - } - : error, + branchName = agentBranchName; + logger.info("Agent completed. Created branch:", { + branchName: branchName, }); - return; // instead of skipping, we should award 0 points - } - - const programmerState = programmerRun as unknown as GraphState; - const branchName = programmerState?.branchName; - - if (!branchName) { - logger.info("Agent did not create a branch", { - thread_id: threadId, + } else { + // Skip agent run - evaluate branch directly + branchName = inputs.branch || "main"; + logger.info("Skipping agent run. 
Evaluating branch:", { + branchName: branchName, }); - return; // instead of skipping, we should award 0 points } - logger.info("Agent completed. Created branch:", { - branchName: branchName, - }); - // Evaluation const wrappedEvaluator = ls.wrapEvaluator(evaluator); const evalResult = await wrappedEvaluator({ @@ -229,6 +302,10 @@ ls.describe(DATASET_NAME, () => { thread_id: threadId, evalResult, }); + return { + evalResult, + ...(RUN_AGENT_PIPELINE && { programmerRunUrl }), + }; }, 7200_000, ); diff --git a/apps/open-swe/evals/open-swe-types.ts b/apps/open-swe/evals/open-swe-types.ts index 01f27604d..fab3ca61c 100644 --- a/apps/open-swe/evals/open-swe-types.ts +++ b/apps/open-swe/evals/open-swe-types.ts @@ -21,6 +21,18 @@ export interface OpenSWEInput { * If not provided, agent will create one (e.g., "open-swe/uuid") */ branch: string; + + /** + * Optional: Test input for LangGraph evaluation + * Simple string input to pass to the agent + */ + test_input?: string; + + /** + * Optional: Ground truth for LangGraph evaluation + * Expected answer or result for comparison + */ + ground_truth?: string; } /** @@ -101,4 +113,8 @@ export interface CodeTestDetails { issues: string[]; error: Error | null; }; + langGraph: { + explanation: string; + error: Error | null; + }; } diff --git a/apps/open-swe/evals/prompts.ts b/apps/open-swe/evals/prompts.ts index 10fc1f975..ec1074fa6 100644 --- a/apps/open-swe/evals/prompts.ts +++ b/apps/open-swe/evals/prompts.ts @@ -36,18 +36,162 @@ export async function formatInputs( const readmeContents = await getRepoReadmeContents(targetRepository); - const SIMPLE_PROMPT_TEMPLATE = ` + const PROMPT_TEMPLATE = ` {USER_REQUEST} - {CODEBASE_README} -`; + + +CRITICAL: Your code will be evaluated by an automated LangGraph evaluation script. Follow these requirements EXACTLY or your submission will receive a score of 0.0: + +## REQUIRED FILE STRUCTURE +1. **agent.py** - Must be at the project root +2. 
**langgraph.json** - Must be at the project root + +## AGENT.PY REQUIREMENTS +Your agent.py file MUST: + +1. **Export the compiled graph correctly**: + \`\`\`python + # The evaluator looks for these attributes in order of preference: + app = your_compiled_graph # PREFERRED - consistent with other prompts + # OR (fallback options) + compiled_graph = your_compiled_graph + # OR + graph = your_compiled_graph + \`\`\` + +2. **Use the exact State schema format**: + \`\`\`python + from typing import TypedDict + from langchain_core.messages import HumanMessage + + class State(TypedDict): + messages: list # REQUIRED - evaluator sends input here + # ... add your other fields as needed + \`\`\` + +3. **Handle the evaluation input format**: + The evaluator will ALWAYS call your graph with ONLY: + \`\`\`python + {"messages": [HumanMessage(content="actual_user_input")]} + \`\`\` + + **IMPORTANT**: The evaluator provides ONLY user input - nothing else. Your code must: + - Extract user input from \`state["messages"][0].content\` + - Provide default values for ANY other state fields you need + - Never assume other state fields will be provided by the evaluator + + \`\`\`python + def your_first_node(state: State): + # Extract user input (the ONLY thing evaluator provides) + user_input = state["messages"][0].content if state["messages"] else "" + + # Initialize any other state fields with defaults if needed + current_step = state.get("current_step", 0) # Default to 0 + user_data = state.get("user_data", {}) # Default to empty dict + + # Your processing logic here... + + return { + "messages": state["messages"] + [AIMessage(content=response)], + "current_step": current_step + 1, # Update as needed + "user_data": updated_data # Update as needed + } + \`\`\` + +4. 
**Complete working example structure**: + \`\`\`python + from typing import Annotated + from typing_extensions import TypedDict + from langgraph.graph import StateGraph, START, END + from langgraph.graph.message import add_messages + from langchain_core.messages import HumanMessage, AIMessage + + class State(TypedDict): + # REQUIRED: messages field with add_messages reducer + messages: Annotated[list, add_messages] + # Add other fields as needed with default handling: + # current_step: int # Will default to missing, handle with .get() + # user_data: dict # Will default to missing, handle with .get() + + def process_input(state: State): + # Extract user input (ONLY thing evaluator provides) + user_input = state["messages"][0].content if state["messages"] else "" + + # Handle other state fields with defaults + step = state.get("current_step", 0) # Default to 0 + data = state.get("user_data", {}) # Default to empty dict + + # Your logic here + response = f"Processed: {user_input} (Step: {step})" + + # Return updates - add_messages will append the AI message + return { + "messages": [AIMessage(content=response)], + "current_step": step + 1, + "user_data": data # or updated data + } + + # Build graph + graph_builder = StateGraph(State) + graph_builder.add_node("process", process_input) + graph_builder.add_edge(START, "process") + graph_builder.add_edge("process", END) + + # REQUIRED: Export compiled graph (use 'app' for consistency) + app = graph_builder.compile() + \`\`\` + +## LANGGRAPH.JSON REQUIREMENTS +Create this exact file at project root: +\`\`\`json +{ + "dependencies": ["."], + "graphs": { + "agent": "./agent.py:app" + }, + "env": ".env" +} +\`\`\` + +## CRITICAL RESTRICTIONS +- **Evaluator provides ONLY user input** - The evaluator will ONLY provide \`{"messages": [HumanMessage(content="user_input")]}\`. Your code must handle all other state fields with default values using \`.get()\` or similar patterns. 
+- **NO print() statements** - They interfere with evaluation scoring +- **NO logging.info/debug/etc** - All logging is disabled during evaluation +- **NO sys.stdout writes** - Output is captured and affects scoring +- **Handle import errors gracefully** - Wrap imports in try/catch if needed +- **Test your graph locally** - Ensure \`compiled_graph.invoke({"messages": [HumanMessage(content="test")]})\` works + +## EVALUATION PROCESS +The evaluator will: +1. Import your module: \`importlib.import_module("agent")\` +2. Look for \`app\`, \`compiled_graph\`, or \`graph\` attribute +3. Call: \`your_graph.invoke({"messages": [HumanMessage(content="user_question")]})\` +4. Evaluate the output against criteria (relevance, completeness, accuracy, clarity) +5. The evaluator will install the dependencies from requirements.txt in your branch. Regardless of whether there's an existing requirements.txt from the branch you checked out, +you always need a complete requirements.txt file in your branch. + +**CRITICAL**: The evaluator provides ONLY the user's question in messages. Any other state fields your graph needs must have default values handled in your code. 
+ +## OUTPUT FORMAT EXPECTATIONS +Your graph should return a State dict with: +- \`messages\` list containing the conversation +- Any other state fields you defined +- The final AI response should be in the last message + + +## FAILURE MODES TO AVOID +- Missing agent.py file → Score: 0.0 +- Missing app/compiled_graph/graph export → Score: 0.0 +- Import errors in agent.py → Score: 0.0 +- Graph doesn't handle the messages format → Score: 0.0 +- Print statements corrupting output → Reduced score + +Generate clean, working code that follows these requirements exactly.`; - const userMessageContent = SIMPLE_PROMPT_TEMPLATE.replace( - "{REPO}", - inputs.repo, - ) + const userMessageContent = PROMPT_TEMPLATE.replace("{REPO}", inputs.repo) .replace("{USER_REQUEST}", inputs.user_input) .replace("{CODEBASE_README}", readmeContents); diff --git a/apps/open-swe/evals/tests.ts b/apps/open-swe/evals/tests.ts index 0bd14acda..48114802c 100644 --- a/apps/open-swe/evals/tests.ts +++ b/apps/open-swe/evals/tests.ts @@ -1,6 +1,6 @@ -// TODO: Add ruff promise and the mypy promise to the tests. import { Sandbox } from "@daytonaio/sdk"; import { createLogger, LogLevel } from "../src/utils/logger.js"; +import { TIMEOUT_SEC } from "@openswe/shared/constants"; import { ExecOptions, RuffResult, @@ -130,3 +130,76 @@ export const runMyPyTypeCheck = async ( }; } }; + +/** + * Run LangGraph evaluation script + */ +export const runLangGraphEvaluation = async ( + sandbox: Sandbox, + args: ExecOptions, +): Promise<{ langGraphScore: number; explanation: string; error?: Error }> => { + logger.info("Running LangGraph evaluation..."); + + try { + const execution = await sandbox.process.executeCommand( + args.command, + args.workingDir, + args.env, + TIMEOUT_SEC * 3, + ); + + logger.info("LangGraph evaluation execution completed", { + exitCode: execution.exitCode, + outputLength: execution.result?.length || 0, + output: + execution.result?.substring(0, 1000) + + (execution.result?.length > 1000 ? "..." 
: ""), + }); + + if (execution.exitCode === 0) { + const outputLines = execution.result.trim().split("\n"); + const scoreStr = outputLines[0]; + const explanation = outputLines.slice(1).join(" "); + + const score = parseFloat(scoreStr); + if (isNaN(score)) { + logger.warn("Could not parse LangGraph evaluation score", { + output: execution.result, + }); + return { + langGraphScore: 0, + explanation: "Failed to parse evaluation score", + error: new Error(`Invalid score format: ${scoreStr}`), + }; + } + + logger.info("LangGraph evaluation completed successfully", { + score, + explanation, + }); + + return { + langGraphScore: score, + explanation, + }; + } else { + logger.error("LangGraph evaluation failed", { + exitCode: execution.exitCode, + output: execution.result, + }); + + return { + langGraphScore: 0, + explanation: "LangGraph evaluation failed", + error: new Error(execution.result || "Unknown evaluation error"), + }; + } + } catch (error) { + logger.error("Error running LangGraph evaluation", { error }); + return { + langGraphScore: 0, + explanation: "Error running LangGraph evaluation", + error: error as Error, + }; + } +}; diff --git a/apps/open-swe/evals/utils/dataset.ts b/apps/open-swe/evals/utils/dataset.ts new file mode 100644 index 000000000..cdf6ed48c --- /dev/null +++ b/apps/open-swe/evals/utils/dataset.ts @@ -0,0 +1,408 @@ +interface DatasetInput { + repo: string; + branch: string; + user_input: string; + test_input: string; + ground_truth: string; +} + +interface DatasetItem { + inputs: DatasetInput; +} + +export const DATASET: DatasetItem[] = [ + { + inputs: { + repo: "mai-sandbox/open-swe_write_ReAct_eval", + branch: "main", + user_input: ` +Hey, we need a basic chat assistant for our project. Nothing too crazy, just something that can chat with users and handle a couple of simple tasks. 
+ +We want users to be able to: + - Have normal conversations with the assistant + - Ask it to search for stuff online when they need current info + - Get help with basic math calculations + +The assistant should be smart enough to know when to use tools vs just chat normally. + +Requirements: + - Use LangGraph for the main workflow (we're standardizing on this) + - Anthropic Claude for the LLM (we have API keys already) + - Keep it simple - this is a proof of concept + +Tools to implement: + 1. Search tool — for when users ask about current events, facts, etc. + 2. Calculator tool — for when they need math help + +The agent should be able to converse with the user, search for info using the web tool, and use the calculator tool for arithmetic. + +Other notes: + - Make sure it actually compiles and runs without errors + - Add type hints (our code standards require them) + - Handle errors gracefully — tools might fail sometimes + - Don't overthink the routing logic, simple is fine + +The goal is to have a working assistant that demonstrates LangGraph basics. We'll probably extend it later with more features. + `.trim(), + test_input: + "Search the web for the date when the Wells Fargo Center in Philadelphia first opened to the public, then calculate how many full years it has been open as of today (July 14, 2025), and finally summarize that in one sentence", + ground_truth: + "The Wells Fargo Center in Philadelphia first opened to the public on August 31, 1996, meaning it has been open for 28 full years as of July 14, 2025", + }, + }, + { + inputs: { + repo: "mai-sandbox/open-swe_write_React_weather", + branch: "main", + user_input: ` +Hey, we need a really basic chatbot that can check the weather. + +What we want: + - Users should be able to chat normally with the bot + - Ask about weather in any city + - Get helpful weather information when they need it + +That's it! Keep it as simple as possible. 
+ +Tool: + - Weather tool — looks up current weather for cities + +The agent should be able to talk to user and answer any question related to weather using the weather tool. + +Implementation notes: + - The bot should automatically detect weather questions + - Use the weather tool when needed, chat normally otherwise + - Keep the routing logic simple — if they mention a city + weather, use the tool + - Make sure it compiles and runs without errors + `.trim(), + test_input: "What's the weather in London right now?", + ground_truth: + "The weather in London is cloudy with a chance of rain, and about 58°F.", + }, + }, + { + inputs: { + repo: "mai-sandbox/open-swe_ticket-Manager", + branch: "main", + user_input: ` +Hey, we need an advanced Support Ticket Triage agent built on LangGraph. + +What we want: + • Support staff will feed in raw customer tickets (text). + • The agent should perform these steps in order: + + 1. Classify the ticket into one of three buckets: "Billing", "Technical", or "General Inquiry". + 2. Detect Priority as "Low", "Medium", or "High" based on urgency clues. + 3. Summarize the ticket in one clear sentence. + 4. Route it to the correct email, using these rules: + - Billing + High → priority-billing@company.com + - Billing + else → billing@company.com + - Technical + High → urgent-tech@company.com + - Technical + else → tech@company.com + - General Inquiry → support@company.com + 5. Draft an Acknowledgement email snippet (1–2 sentences) referencing the summary and routing. + +Key features: + 1. Classification Node — LLM reads \`ticket_text\` → sets \`category\`. + `.trim(), + test_input: + "My internet connection drops every few hours—please help troubleshoot.", + ground_truth: + '{"category":"Technical","priority":"Medium","summary":"Customer\'s internet connection intermittently drops and they need troubleshooting.","route_to":"tech@company.com","ack_draft":"Thanks for reporting your connectivity issue. 
I\'ve routed your ticket to our technical team, and they\'ll work with you to resolve it."}', + }, + }, + { + inputs: { + repo: "mai-sandbox/open-swe_news_trend_agent", + branch: "main", + user_input: ` +Hey, we need a Multi-Agent News-Trend Alert System built on LangGraph. + +What we want: + • Input: a topic string, e.g. "electric vehicles." + • The system invokes three agents in sequence — and each node returns its result as a string: + + 1. Fetcher Agent (\`fetcher\`) + - Takes \`state["topic"]\` and uses a news-fetch tool to get the top 5 headlines. + - Returns the headlines list serialized as a JSON string, and writes it into \`state["headlines_str"]\`. + + 2. Analyzer Agent (\`analyzer\`) + - Reads \`state["headlines_str"]\` (a JSON string), parses it, then finds words appearing ≥2 times. + - Returns the trends list serialized as a JSON string, and writes it into \`state["trends_str"]\`. + + 3. Reporter Agent (\`reporter\`) + - Reads \`state["trends_str"]\` and \`state["topic"]\`. + - If trends exist, creates this object: + { + "trend_found": true, + "trend_keywords": [...], + "alert_summary": "In the latest headlines on {topic}, we saw repeated mentions of {keywords}." + } + - Otherwise: + { + "trend_found": false, + "alert_summary": "No new trend detected for {topic}." + } + - Returns that entire object serialized as a single JSON string. + +Key features: + - State TypedDict: + class State(TypedDict): + topic: str + headlines_str: str # JSON string of List[str] + trends_str: str # JSON string of List[str] + `.trim(), + test_input: "Oscars 2025 winners", + ground_truth: + '{ "trend_found": true, "trend_keywords": ["Oppenheimer", "Barbie"], "alert_summary": "In the latest headlines on Oscars 2025 winners, we saw repeated mentions of Oppenheimer and Barbie." }', + }, + }, + { + inputs: { + repo: "mai-sandbox/open-swe_finance_tracker", + branch: "main", + user_input: ` +Hey, we need a Personal Finance Tracker agent built on LangGraph. + +Requirements: + 1. 
The compiled graph's \`.invoke()\` must return \`state["summary_str"]\` as a JSON string. + 2. All node functions return dictionaries that update the state. + 3. The graph should start with this exact default state: + transactions: '[{"date": "2024-01-05", "description": "Whole Foods Market", "amount": 125.50}, {"date": "2024-01-12", "description": "Safeway Grocery", "amount": 99.50}, {"date": "2024-01-01", "description": "Monthly Rent Payment", "amount": 1005.00}, {"date": "2024-01-15", "description": "PG&E Electric Bill", "amount": 85.00}, {"date": "2024-01-20", "description": "Water Utility", "amount": 65.00}, {"date": "2024-01-08", "description": "Netflix Subscription", "amount": 15.99}, {"date": "2024-01-14", "description": "Movie Theater Tickets", "amount": 45.00}, {"date": "2024-01-22", "description": "Concert Tickets", "amount": 89.01}]' + category_budget: '{"Groceries":200,"Rent":1000,"Utilities":150,"Entertainment":100}' + categorized_str: "" + summary_str: "" + +Workflow (three nodes in sequence): + + 1. Categorizer Node (\`categorizer\`) + - Input: \`state["transactions"]\` (JSON string of transaction list) + - Action: Use LLM to assign each transaction a "category" field + - Category Rules: Map to exactly these categories: "Groceries", "Rent", "Utilities", "Entertainment", "Other" + - LLM Prompt Template: + Categorize these transactions into: Groceries, Rent, Utilities, Entertainment, or Other. + Return valid JSON with original fields plus 'category' field. + Transactions: {transactions_json} + - Return: {"categorized_str": "[{...transactions with category field...}]"} + - Error Handling: If JSON parsing fails, return original transactions string + + 2. Summarizer Node (\`summarizer\`) + - Input: \`state["categorized_str"]\` + - Action: Parse JSON and sum amounts per category + - Return: {"summary_str": '{"Groceries": 225.0, "Rent": 1005.0, ...}'} + - Error Handling: If parsing fails, return {"summary_str": "{}"} + + 3. 
Advisor Node (\`advisor\`) + - Input: \`state["summary_str"]\` and \`state["category_budget"]\` + - Action: Compare spending vs budget, generate advice ONLY for over-budget categories + - Advice Logic: + - If spent > budget: Generate LLM advice + - If spent <= budget: No advice for that category + - If no overspending: advice should be an empty object (advice: {}). + - LLM Prompt Template: + For each category where spending exceeds the budget, generate advice using this template: + "You overspent $ in (budget: $, spent: $). Provide one practical tip in 1-2 sentences to reduce spending." + - Final Return: Return a JSON string with both the category summary and advice, e.g.: + {"category_summary": {...}, "advice": {...}} + - Error Handling: If parsing fails, return a summary with an empty advice object. + +Output Format: + The final .invoke() return value must be a JSON string exactly like: + { + "category_summary": { + "Groceries": 225.0, + "Rent": 1005.0, + "Utilities": 150.0, + "Entertainment": 150.0 + }, + "advice": { + "Groceries": "Consider meal planning and buying generic brands.", + "Rent": "Look for ways to offset the slight overage.", + "Entertainment": "Set a stricter entertainment budget." 
+ } + } + +Implementation Notes: + - Use init_chat_model("anthropic:claude-3-5-sonnet-latest") for LLM calls + - All state values are JSON strings, parse with json.loads() + - Graph flow: START → categorizer → summarizer → advisor → END + - Export as compiled_graph = graph_builder.compile() + `.trim(), + test_input: "Show me my monthly spending overview", + ground_truth: + '{"category_summary": {"Groceries": 225.0, "Rent": 1005.0, "Utilities": 150.0, "Entertainment": 150.0}, "advice": {"Groceries": "Consider meal planning and buying generic brands to reduce grocery costs.", "Rent": "Look for ways to reduce utilities or consider a roommate to offset the slight rent overage.", "Entertainment": "Set a stricter entertainment budget and look for free or low-cost activities."}}', + }, + }, + { + inputs: { + repo: "mai-sandbox/open-swe_finance_tracker", + branch: "main", + user_input: ` +Hey, we need a Personal Finance Tracker agent built on LangGraph. + +Requirements: + 1. The compiled graph's \`.invoke()\` must return \`state["summary_str"]\` as a JSON string. + 2. All node functions return dictionaries that update the state. + 3. 
The graph should start with this exact default state: + transactions: '[{"date": "2024-01-05", "description": "Whole Foods Market", "amount": 125.50}, {"date": "2024-01-12", "description": "Safeway Grocery", "amount": 99.50}, {"date": "2024-01-01", "description": "Monthly Rent Payment", "amount": 1005.00}, {"date": "2024-01-15", "description": "PG&E Electric Bill", "amount": 85.00}, {"date": "2024-01-20", "description": "Water Utility", "amount": 65.00}, {"date": "2024-01-08", "description": "Netflix Subscription", "amount": 15.99}, {"date": "2024-01-14", "description": "Movie Theater Tickets", "amount": 45.00}, {"date": "2024-01-22", "description": "Concert Tickets", "amount": 89.01}]' + category_budget: '{"Groceries":200,"Rent":1000,"Utilities":150,"Entertainment":100}' + categorized_str: "" + summary_str: "" + +Workflow (three nodes in sequence): + + 1. Categorizer Node (\`categorizer\`) + - Input: \`state["transactions"]\` (JSON string of transaction list) + - Action: Use LLM to assign each transaction a "category" field + - Category Rules: Map to exactly these categories: "Groceries", "Rent", "Utilities", "Entertainment", "Other" + - LLM Prompt Template: + Categorize these transactions into: Groceries, Rent, Utilities, Entertainment, or Other. + Return valid JSON with original fields plus 'category' field. + Transactions: {transactions_json} + - Return: {"categorized_str": "[{...transactions with category field...}]"} + - Error Handling: If JSON parsing fails, return original transactions string + + 2. Summarizer Node (\`summarizer\`) + - Input: \`state["categorized_str"]\` + - Action: Parse JSON and sum amounts per category + - Return: {"summary_str": '{"Groceries": 225.0, "Rent": 1005.0, ...}'} + - Error Handling: If parsing fails, return {"summary_str": "{}"} + + 3. 
Advisor Node (\`advisor\`) + - Input: \`state["summary_str"]\` and \`state["category_budget"]\` + - Action: Compare spending vs budget, generate advice ONLY for over-budget categories + - Advice Logic: + - If spent > budget: Generate LLM advice + - If spent <= budget: No advice for that category + - If no overspending: "advice": {} + - LLM Prompt Template: + You overspent in (budget: , spent: ). + Provide one practical tip in 1-2 sentences to reduce spending. + - Final Return: {"summary_str": '{"category_summary": {...}, "advice": {...}}'} + - Error Handling: If parsing fails, return basic summary without advice + +Output Format: + The final .invoke() return value must be a JSON string exactly like: + { + "category_summary": { + "Groceries": 225.0, + "Rent": 1005.0, + "Utilities": 150.0, + "Entertainment": 150.0 + }, + "advice": { + "Groceries": "Consider meal planning and buying generic brands.", + "Rent": "Look for ways to offset the slight overage.", + "Entertainment": "Set a stricter entertainment budget." 
+ } + } + +Implementation Notes: + - Use init_chat_model("anthropic:claude-3-5-sonnet-latest") for LLM calls + - All state values are JSON strings, parse with json.loads() + - Graph flow: START → categorizer → summarizer → advisor → END + - Export as compiled_graph = graph_builder.compile() + `.trim(), + test_input: "Generate personal finance report", + ground_truth: + '{"category_summary": {"Groceries": 225.0, "Rent": 1005.0, "Utilities": 150.0, "Entertainment": 150.0}, "advice": {"Groceries": "Try shopping with a list and comparing prices to stay within your grocery budget.", "Rent": "Consider negotiating with your landlord or finding additional income to cover the rent overage.", "Entertainment": "Track entertainment spending more carefully and prioritize free activities."}}', + }, + }, + { + inputs: { + repo: "mai-sandbox/open-swe_travel_agent", + branch: "main", + user_input: ` +Build a comprehensive Travel Itinerary Planning Agent using LangGraph from scratch that can create detailed, cost-optimized travel plans with real-time pricing and logistics. + +Requirements: + +Multi-Tool Integration: + - flight_searcher: Finds and compares flight options with real pricing + - hotel_finder: Searches accommodations by budget, location, and dates + - route_optimizer: Plans efficient daily itineraries and transportation + - weather_checker: Provides weather forecasts for travel dates + - budget_calculator: Tracks expenses and optimizes spending + - currency_converter: Handles real-time currency exchange rates + - visa_checker: Verifies visa requirements and travel documents + +Conditional Routing Logic: + The agent should intelligently route between: + - Direct recommendations for simple queries (common destinations, general advice) + - Single tool usage for specific requests (just flights, just hotels) + - Multi-tool workflows for complete itinerary planning + - Budget optimization workflows that balance cost vs. 
preferences + +Stateful Memory: + Implement persistent memory to: + - Store user preferences (budget, travel style, dietary restrictions) + - Remember previous searches and pricing data + - Track multi-step planning progress + - Maintain currency and date context throughout conversation + +Workflow Types: + - Simple Queries: Basic travel advice without real-time data needs + - Price Check Workflows: Current pricing for flights, hotels, activities + - Full Planning Workflows: Complete itineraries with optimization + - Budget Analysis: Cost breakdowns and savings recommendations + +Agent Architecture: + - Use LangGraph's StateGraph with travel-specific state schema + - Implement conditional edges for intelligent tool selection + - Add budget constraint checking and optimization loops + - Include error handling for unavailable dates/destinations + +Expected Implementation: + - Custom state schema with travel dates, budget, preferences, and pricing data + - Tool definitions with proper input/output schemas and real-time API integration + - Conditional routing function that decides between tools based on query complexity + - Main planning workflow with cycles for optimization and user feedback + - Memory checkpointing for session persistence across multi-day planning + `.trim(), + test_input: + "Plan a 5-day trip to Tokyo from San Francisco for October 15-20, 2025. Budget is $2500 total for 1 person. I want mid-range accommodations and to visit major attractions.", + ground_truth: + "Should use multiple tools in sequence: flight_searcher ($784 round-trip SFO-Tokyo), hotel_finder ($143/night x 5 nights = $715), route_optimizer (Tokyo Skytree $18, DisneySea $65, Senso-ji free), budget_calculator (flights $784 + hotels $715 + food $350 + transport $70 + activities $150 = $2069, under budget), weather_checker (October Tokyo: 65°F, mild). 
Should provide day-by-day itinerary with specific costs and recommendations to stay within $2500 budget.", + }, + }, + { + inputs: { + repo: "mai-sandbox/open-swe_edit_task_1a", + branch: "main", + user_input: ` +I have a LangGraph React agent that I want to enhance with web search capabilities. Please add Tavily search functionality to this agent so it can search the web for current information and provide up-to-date responses. + +Requirements: + - Add Tavily search tool integration + - Configure it to return 3 results with advanced search depth + - Use environment variables for API keys (assume they will be in .env) + - Maintain the existing conversation memory functionality + `.trim(), + test_input: + "Who won the latest NBA championship and what were the final standings?", + ground_truth: + "The Oklahoma City Thunder (seeded 1st in the Western Conference) defeated the Indiana Pacers (seeded 4th in the Eastern Conference) 4-3 in the 2025 NBA Finals", + }, + }, + { + inputs: { + repo: "mai-sandbox/open-swe_edit_task_1b", + branch: "main", + user_input: ` +I have a LangGraph agent that has tools available but the workflow isn't routing correctly. The agent needs conditional logic to decide when to use tools versus when to provide a final answer directly. + +Currently, the agent goes straight from the agent node to END, which means it never uses its available tools even when they would be helpful. + +Please add conditional routing logic so the agent can: + - Use tools when the agent decides tool calls are needed + - Provide direct answers when no tools are required + - Properly cycle between agent decisions and tool usage + +The agent should be able to handle queries that need weather information, math calculations, or knowledge searches, as well as simple conversational queries that don't need tools. 
+ `.trim(), + test_input: "Compare the weather in London and Tokyo right now", + ground_truth: + "In London, the current temperature is 66°F (19°C), feeling like 77°F (25°C), with cloudy skies and a 15% chance of rain, while Tokyo is experiencing light rain showers and a temperature of 80°F (27°C). (07/15/25)", + }, + }, +]; diff --git a/apps/open-swe/evals/utils/retry.ts b/apps/open-swe/evals/utils/retry.ts index bffe42f1f..1d475e8be 100644 --- a/apps/open-swe/evals/utils/retry.ts +++ b/apps/open-swe/evals/utils/retry.ts @@ -3,7 +3,7 @@ import { createLogger, LogLevel } from "../../src/utils/logger.js"; const logger = createLogger(LogLevel.DEBUG, "Retry"); const RETRY_CONFIG = { - maxRetries: 5, + maxRetries: 15, baseDelay: 1000, maxDelay: 30000, backoffMultiplier: 2, From 79e8637101d15c3fb2556d17895203cdf7fa1917 Mon Sep 17 00:00:00 2001 From: aliyanishfaq Date: Mon, 8 Sep 2025 17:28:13 -0700 Subject: [PATCH 2/2] chore: dataset removed & cod cleaning --- apps/open-swe/evals/langgraph.eval.ts | 47 +++++++++++----------- apps/open-swe/evals/utils/langsmith-url.ts | 17 ++++++++ 2 files changed, 40 insertions(+), 24 deletions(-) create mode 100644 apps/open-swe/evals/utils/langsmith-url.ts diff --git a/apps/open-swe/evals/langgraph.eval.ts b/apps/open-swe/evals/langgraph.eval.ts index 13fd03005..fc6eacf44 100644 --- a/apps/open-swe/evals/langgraph.eval.ts +++ b/apps/open-swe/evals/langgraph.eval.ts @@ -2,6 +2,7 @@ import { v4 as uuidv4 } from "uuid"; import * as ls from "langsmith/vitest"; +import { Client, Example } from "langsmith"; import { formatInputs } from "./prompts.js"; import { createLogger, LogLevel } from "../src/utils/logger.js"; import { evaluator } from "./evaluator.js"; @@ -12,41 +13,38 @@ import { ManagerGraphState } from "@openswe/shared/open-swe/manager/types"; import { PlannerGraphState } from "@openswe/shared/open-swe/planner/types"; import { GraphState } from "@openswe/shared/open-swe/types"; import { withRetry } from "./utils/retry.js"; -import 
const logger = createLogger(LogLevel.DEBUG, "Evaluator");

// Configuration constants

/**
 * Whether the full agent pipeline should run.
 * Enabled by default; only the literal value "false" in the
 * RUN_AGENT_PIPELINE environment variable disables it.
 */
const RUN_AGENT_PIPELINE = process.env.RUN_AGENT_PIPELINE !== "false";

/** LangSmith dataset to evaluate; overridable through the DATASET_NAME environment variable. */
const DATASET_NAME =
  process.env.DATASET_NAME || "aliyan-open-swe-langgraph-eval";

/**
 * Fetches the examples of the dataset named by DATASET_NAME from LangSmith.
 *
 * @returns Every example yielded by `client.listExamples`, in iteration order
 */
async function loadDataset(): Promise<Example[]> {
  const client = new Client();
  const examples: Example[] = [];
  for await (const example of client.listExamples({
    datasetName: DATASET_NAME,
  })) {
    examples.push(example);
  }
  logger.info(
    `Loaded ${examples.length} examples from dataset "${DATASET_NAME}"`,
  );
  return examples;
}

// NOTE(review): only the first example is kept (`slice(0, 1)`); this looks like
// a debugging limit — confirm before relying on full-dataset results.
const DATASET = (await loadDataset()).slice(0, 1).map((example) => ({
  inputs: example.inputs as OpenSWEInput,
}));

logger.info(`Starting evals over ${DATASET.length} examples...`);
/**
 * Creates a LangSmith run URL for tracking programmer runs.
 *
 * The workspace and project IDs are read from the LANGSMITH_WORKSPACE_ID and
 * LANGSMITH_PROJECT_ID environment variables on every call.
 *
 * @param runId - The specific run ID to link to
 * @returns Complete LangSmith URL for the run
 * @throws Error when either environment variable is unset or empty
 */
export function createProgrammerRunURL(runId: string): string {
  const workspaceId = process.env.LANGSMITH_WORKSPACE_ID;
  const projectId = process.env.LANGSMITH_PROJECT_ID;

  if (!workspaceId || !projectId) {
    throw new Error(
      "LANGSMITH_WORKSPACE_ID and LANGSMITH_PROJECT_ID environment variables are required",
    );
  }

  // Encode every path segment so stray characters ("/", "?", "#", spaces)
  // cannot change the structure of the generated link.
  const workspaceSegment = encodeURIComponent(workspaceId);
  const projectSegment = encodeURIComponent(projectId);
  const runSegment = encodeURIComponent(runId);

  return `https://smith.langchain.com/o/${workspaceSegment}/projects/p/${projectSegment}/r/${runSegment}`;
}