Changes from 2 commits
235 changes: 224 additions & 11 deletions apps/open-swe/evals/evaluator.ts
@@ -8,30 +8,167 @@ import { TargetRepository } from "@openswe/shared/open-swe/types";
import { cloneRepo } from "../src/utils/github/git.js";
import { getRepoAbsolutePath } from "@openswe/shared/git";
import { SimpleEvaluationResult } from "langsmith/vitest";
import { runRuffLint, runMyPyTypeCheck } from "./tests.js";
import { setupEnv, ENV_CONSTANTS } from "../src/utils/env-setup.js";
import {
runRuffLint,
runMyPyTypeCheck,
runLangGraphEvaluation,
} from "./tests.js";
import { writeFile } from "../src/utils/read-write.js";
import * as fs from "fs";
import * as path from "path";

const logger = createLogger(LogLevel.INFO, "Evaluator");

// Use shared constants from env-setup utility
const { RUN_PYTHON_IN_VENV } = ENV_CONSTANTS;
const VENV_PATH = ".venv";
const RUN_PYTHON_IN_VENV = `${VENV_PATH}/bin/python`;
const RUN_PIP_IN_VENV = `${VENV_PATH}/bin/pip`;
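// Note: commands below invoke the venv's binaries directly (e.g. `.venv/bin/python`),
// so no shell activation step is needed. The paths assume a Unix-like sandbox;
// a Windows venv would use `.venv\Scripts\python.exe` instead.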

/**
* Sets up the Python environment: venv + requirements.txt (if present),
* evaluation dependencies, ruff + mypy, and the LangGraph check script.
*/
async function setupEnv(
sandbox: Sandbox,
absoluteRepoDir: string,
envVars?: Record<string, string>,
): Promise<boolean> {
logger.info("Setting up Python environment...");

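// Assumes `python` on the sandbox PATH resolves to a Python 3 build that
// ships the stdlib venv module.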
const createVenvCommand = "python -m venv .venv";
const createVenvRes = await sandbox.process.executeCommand(
createVenvCommand,
absoluteRepoDir,
envVars,
TIMEOUT_SEC,
);
if (createVenvRes.exitCode !== 0) {
logger.error("Failed to create virtual environment", {
createVenvCommand,
createVenvRes,
});
return false;
}

const upgradePipRes = await sandbox.process.executeCommand(
`${RUN_PIP_IN_VENV} install --upgrade pip`,
absoluteRepoDir,
envVars,
TIMEOUT_SEC,
);
if (upgradePipRes.exitCode !== 0) {
logger.warn("Failed to upgrade pip, continuing anyway", { upgradePipRes });
}

const requirementsExistRes = await sandbox.process.executeCommand(
"test -f requirements.txt",
absoluteRepoDir,
envVars,
TIMEOUT_SEC,
);

if (requirementsExistRes.exitCode === 0) {
logger.info("Found requirements.txt, installing...");
const installReqRes = await sandbox.process.executeCommand(
`${RUN_PIP_IN_VENV} install -r requirements.txt`,
absoluteRepoDir,
envVars,
TIMEOUT_SEC * 3,
);
if (installReqRes.exitCode !== 0) {
logger.warn("Failed to install requirements.txt, continuing anyway", {
installReqRes,
});
}
} else {
logger.info("No requirements.txt found, skipping repository dependencies");
}

// Install evaluation-specific dependencies
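// (Presumably what the generated agent.py and langgraph_check.py need to
// import; a failure here is downgraded to a warning below rather than
// aborting the evaluation.)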
logger.info("Installing evaluation dependencies...");
const installEvalDepsRes = await sandbox.process.executeCommand(
`${RUN_PIP_IN_VENV} install langchain langchain-core langchain-openai pydantic openai`,
absoluteRepoDir,
envVars,
TIMEOUT_SEC * 2,
);
if (installEvalDepsRes.exitCode !== 0) {
logger.warn(
"Failed to install evaluation dependencies, continuing anyway",
{
installEvalDepsRes,
},
);
}

const installAnalysisToolsRes = await sandbox.process.executeCommand(
`${RUN_PIP_IN_VENV} install ruff mypy`,
absoluteRepoDir,
envVars,
TIMEOUT_SEC,
);
if (installAnalysisToolsRes.exitCode !== 0) {
logger.error("Failed to install ruff and mypy", {
installAnalysisToolsRes,
});
return false;
}
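// Unlike the best-effort installs above, ruff and mypy are required for
// scoring, so a failure here aborts the whole setup.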

logger.info("Copying LangGraph evaluation script to sandbox...");
try {
const evalScriptPath = path.join(
__dirname,
"scripts",
"langgraph_check.py",
);
const evalScriptContent = fs.readFileSync(evalScriptPath, "utf8");

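// The script is read from the host checkout (evals/scripts/) and written into
// the sandbox repo root, so it can later be invoked there via
// `python -m langgraph_check`.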
const { success: copyScriptSuccess, output: copyScriptOutput } =
await writeFile({
sandbox,
filePath: "langgraph_check.py",
content: evalScriptContent,
workDir: absoluteRepoDir,
});

if (!copyScriptSuccess) {
logger.warn("Failed to copy LangGraph evaluation script", {
copyScriptOutput,
});
} else {
logger.info("Successfully copied LangGraph evaluation script to sandbox");
}
} catch (error) {
logger.warn("Error copying LangGraph evaluation script", { error });
}

logger.info("Environment setup completed successfully");
return true;
}
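// setupEnv returns false only on hard failures (venv creation, ruff/mypy
// install); missing dependencies and script-copy problems are logged and
// tolerated.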

/**
* Runs ruff linting, mypy type checking, and the LangGraph evaluation on the repository
*/
async function runCodeTests(
sandbox: Sandbox,
absoluteRepoDir: string,
): Promise<{ ruffScore: number; mypyScore: number; details: CodeTestDetails }> {
openSWEInputs: OpenSWEInput,
envVars?: Record<string, string>,
): Promise<{
ruffScore: number;
mypyScore: number;
langGraphScore: number;
details: CodeTestDetails;
}> {
logger.info("Running code analysis on all Python files in repository");

const testResults: {
ruffScore: number;
mypyScore: number;
langGraphScore: number;
details: CodeTestDetails;
} = {
ruffScore: 0,
mypyScore: 0,
langGraphScore: 0,
details: {
ruff: {
issues: [],
@@ -41,27 +178,38 @@ async function runCodeTests(
issues: [],
error: null,
},
langGraph: {
explanation: "",
error: null,
},
},
};

const [ruffLint, mypyCheck] = await Promise.all([
const [ruffLint, mypyCheck, langGraphEval] = await Promise.all([
runRuffLint(sandbox, {
command: `${RUN_PYTHON_IN_VENV} -m ruff check . --output-format=json`,
workingDir: absoluteRepoDir,
env: undefined,
env: envVars,
timeoutSec: TIMEOUT_SEC * 3,
}),
runMyPyTypeCheck(sandbox, {
command: `${RUN_PYTHON_IN_VENV} -m mypy . --no-error-summary --show-error-codes --no-color-output`,
workingDir: absoluteRepoDir,
env: undefined,
env: envVars,
timeoutSec: TIMEOUT_SEC * 3,
}),
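// Behavioral check: the copied langgraph_check.py presumably loads agent.py,
// drives it with the eval's test_input (-i), and grades the output against
// ground_truth (-g); the exact grading logic lives in that script.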
runLangGraphEvaluation(sandbox, {
command: `${RUN_PYTHON_IN_VENV} -m langgraph_check agent.py -i "${openSWEInputs.test_input}" -g "${openSWEInputs.ground_truth}"`,
workingDir: absoluteRepoDir,
env: envVars,
timeoutSec: TIMEOUT_SEC * 3,
}),
]);
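// All three checks run concurrently; each resolves with a score plus
// structured details that are folded into testResults below.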

Object.assign(testResults, {
ruffScore: ruffLint.ruffScore,
mypyScore: mypyCheck.mypyScore,
langGraphScore: langGraphEval.langGraphScore,
details: {
ruff: {
issues: ruffLint.issues,
@@ -71,14 +219,20 @@ async function runCodeTests(
issues: mypyCheck.issues,
error: mypyCheck.error,
},
langGraph: {
explanation: langGraphEval.explanation,
error: langGraphEval.error,
},
},
});

logger.info("Code tests completed", {
ruffScore: testResults.ruffScore,
mypyScore: testResults.mypyScore,
langGraphScore: testResults.langGraphScore,
ruffIssues: testResults.details.ruff.issues.length,
mypyIssues: testResults.details.mypy.issues.length,
langGraphExplanation: testResults.details.langGraph.explanation,
});

return testResults;
@@ -120,7 +274,49 @@ export async function evaluator(inputs: {

const absoluteRepoDir = getRepoAbsolutePath(output.targetRepository);

const envSetupSuccess = await setupEnv(sandbox, absoluteRepoDir);
const envVars: Record<string, string> = {};
const apiKeys = [
"OPENAI_API_KEY",
"ANTHROPIC_API_KEY",
"LANGCHAIN_API_KEY",
"LANGCHAIN_TRACING_V2",
"LANGCHAIN_PROJECT",
"GOOGLE_API_KEY",
"TAVILY_API_KEY",
];

apiKeys.forEach((key) => {
if (process.env[key]) {
envVars[key] = process.env[key]!;
logger.info(`Added environment variable: ${key}`);
}
});
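// Only keys actually present on the host are forwarded to the sandbox (the
// log line records names, never values); missing keys are simply omitted.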

logger.info(
`Syncing to latest state of solution branch: ${solutionBranch}`,
);

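// `git checkout -B` (re)creates the local branch at FETCH_HEAD, so a rerun of
// the evaluator always scores the latest pushed state of the solution branch.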
const updateBranchRes = await sandbox.process.executeCommand(
`git fetch origin ${solutionBranch} && git checkout -B ${solutionBranch} FETCH_HEAD`,
absoluteRepoDir,
envVars,
TIMEOUT_SEC,
);

if (updateBranchRes.exitCode !== 0) {
logger.error("Failed to update solution branch", {
solutionBranch,
updateResult: updateBranchRes,
});
throw new Error(`Failed to update solution branch: ${solutionBranch}`);
}

logger.info("Git update result:", {
exitCode: updateBranchRes.exitCode,
result: updateBranchRes.result,
});

const envSetupSuccess = await setupEnv(sandbox, absoluteRepoDir, envVars);
if (!envSetupSuccess) {
logger.error("Failed to setup environment");
return [
@@ -131,14 +327,24 @@
];
}

const analysisResult = await runCodeTests(sandbox, absoluteRepoDir);
const analysisResult = await runCodeTests(
sandbox,
absoluteRepoDir,
openSWEInputs,
envVars,
);

const overallScore = analysisResult.ruffScore + analysisResult.mypyScore;
const overallScore =
analysisResult.ruffScore +
analysisResult.mypyScore +
analysisResult.langGraphScore;
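// The overall score is an unweighted sum of the three sub-scores (presumably
// each in [0, 1], giving an overall range of 0 to 3).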

logger.info("Evaluation completed", {
overallScore,
ruffScore: analysisResult.ruffScore,
mypyScore: analysisResult.mypyScore,
langGraphScore: analysisResult.langGraphScore,
langGraphExplanation: analysisResult.details.langGraph.explanation,
repo: openSWEInputs.repo,
originalBranch: openSWEInputs.branch,
solutionBranch,
@@ -152,10 +358,17 @@
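// Each entry is reported as its own evaluation result (key / score / comment),
// so ruff, mypy, and the LangGraph check surface as separate feedback with
// their issue lists or explanation attached.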
{
key: "ruff-score",
score: analysisResult.ruffScore,
comment: analysisResult.details.ruff.issues.join("\n"),
},
{
key: "mypy-score",
score: analysisResult.mypyScore,
comment: analysisResult.details.mypy.issues.join("\n"),
},
{
key: "langgraph-score",
score: analysisResult.langGraphScore,
comment: analysisResult.details.langGraph.explanation,
},
];
} catch (error) {