
STARTING_BATCH_SIZE = 512

+# Thread local param
+torch.set_grad_enabled(False)
+

class TransformersModelConfig(ModelConfig):
    """Configuration class for HuggingFace Transformers models.
@@ -218,12 +221,6 @@ def __init__(
        if config.model_parallel is False and self.config.dtype not in ["4bit", "8bit"]:
            logger.info(f"Using Data Parallelism, putting model on device {self._device}")
            self.model = self.model.to(self._device)
-        if config.compile:
-            try:
-                logger.info("Compiling the model")
-                self.model.model.compile()
-            except AttributeError as e:
-                logger.warning("Could not compile the model because: ", e)

        self.model_name = _simplify_name(config.model_name)

@@ -410,7 +407,7 @@ def _create_auto_model(self) -> transformers.PreTrainedModel:
        )
        # model.to(self.device)
        model.eval()
-        torch.set_grad_enabled(False)
+
        if self.continuous_batching:
            generation_config = GenerationConfig(
                **self.generation_config_dict,
@@ -497,9 +494,6 @@ def _check_continuations_start_space(self, continuation: str) -> str:
                continuation = continuation.lstrip()
        return continuation

-    def _model_call(self, inputs: torch.Tensor) -> torch.Tensor:
-        return self.model(inputs).logits
-
    def _get_batch_size(self, max_input_length: int, override_bs: int | None, starting_batch_size: int = 512) -> int:
        if override_bs is not None:
            return override_bs
@@ -509,10 +503,18 @@ def _get_batch_size(self, max_input_length: int, override_bs: int | None, starti
            starting_batch_size=starting_batch_size
        )  # if OOM, then halves batch_size and tries again
        def forward_batch(batch_size):
-            test_batch = torch.ones(
-                (batch_size + int(0.1 * batch_size), max_input_length), device=self.device
-            ).long()  # We add 10% for marging :)
-            F.log_softmax(self._model_call(test_batch).float(), dim=-1).cpu()
+            fake_batch, fake_output = None, None
+            with torch.no_grad():
+                try:
+                    fake_batch = torch.ones((batch_size, max_input_length), device=self.device).int()
+                    fake_output = F.log_softmax(self.model(fake_batch).logits, dim=-1).cpu()
+                except Exception as e:
+                    for fake_item in [fake_batch, fake_output]:
+                        if fake_item is not None:
+                            fake_item.detach()
+                        del fake_item
+
+                    raise e
            return batch_size

        batch_size = forward_batch()
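
For context on the probe above: the "# if OOM, then halves batch_size and tries again" comment describes Accelerate's find_executable_batch_size decorator, which re-invokes the wrapped function with a halved batch size whenever it raises a CUDA out-of-memory error. A minimal sketch of the pattern outside this class, assuming a generic model callable that returns an object with a .logits attribute:

import torch
import torch.nn.functional as F
from accelerate.utils import find_executable_batch_size

def probe_batch_size(model, max_input_length: int, device: str, starting_batch_size: int = 512) -> int:
    @find_executable_batch_size(starting_batch_size=starting_batch_size)
    def forward_batch(batch_size):
        # Dummy forward pass at full sequence length: an OOM here makes the
        # decorator halve batch_size and call forward_batch again.
        with torch.no_grad():
            fake_batch = torch.ones((batch_size, max_input_length), device=device, dtype=torch.long)
            F.log_softmax(model(fake_batch).logits, dim=-1).cpu()
        return batch_size

    # Called with no argument: the decorator injects the current batch_size.
    return forward_batch()
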
@@ -645,10 +647,14 @@ def _padded_greedy_until(
            position=0,
            disable=self.disable_tqdm,
        ):
-            if split[0].generation_size is None:
+            if self.generation_config_dict.get("max_new_tokens", None) is not None:
+                # The user forces a specific generation size
+                max_context_continuation_size_allowed = self.generation_config_dict["max_new_tokens"]
+            elif split[0].generation_size is None:
                # No constraints on the generation size: max length allowed is the max model context
                max_context_continuation_size_allowed = self.max_length
            else:
+                # The task forces a specific generation size
                context = self.prompt_manager.prepare_prompt(split[0])
                tokenized_context = self.tokenizer(context)

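
The reordered branches above give a user-supplied max_new_tokens (from the generation config) precedence over the per-task generation_size, with the model's full context length as the fallback. A simplified standalone sketch of that precedence, using illustrative names; note the real else branch additionally tokenizes the prompt (the hunk is truncated there):

def resolve_generation_budget(generation_config: dict, task_generation_size: int | None, model_max_length: int) -> int:
    """Pick the generation budget: user config first, then task setting, then model context."""
    if generation_config.get("max_new_tokens") is not None:
        # The user forces a specific generation size
        return generation_config["max_new_tokens"]
    if task_generation_size is None:
        # No constraint on the generation size: use the full model context
        return model_max_length
    # The task forces a specific generation size
    return task_generation_size

assert resolve_generation_budget({"max_new_tokens": 256}, 64, 4096) == 256
assert resolve_generation_budget({}, None, 4096) == 4096
assert resolve_generation_budget({}, 64, 4096) == 64
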
@@ -953,7 +959,7 @@ def _loglikelihood_tokens( # noqa: C901
                max_context=None,  # computed as model max length in the function
            )

-            model_output = self._model_call(prepared_batch.input_ids)
+            model_output = self.model(prepared_batch.input_ids).logits
            logits = F.log_softmax(model_output, dim=-1)  # [batch, sequence_length, vocab]
            flat_index = 0

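In the last hunk the removed _model_call helper is inlined: logits come directly from self.model(...).logits and are normalized with F.log_softmax into per-token log-probabilities. A self-contained sketch (illustrative names, not the library's API) of how such log-probabilities score a continuation:

import torch
import torch.nn.functional as F

def continuation_logprob(model, input_ids: torch.Tensor, continuation_len: int) -> torch.Tensor:
    """Sum the log-probabilities of the last `continuation_len` tokens of `input_ids`."""
    with torch.no_grad():
        logits = model(input_ids).logits              # [batch, sequence_length, vocab]
        logprobs = F.log_softmax(logits, dim=-1)      # normalize over the vocabulary
    # The token at position i is predicted by the logits at position i - 1, hence the shift.
    target_ids = input_ids[:, -continuation_len:]                 # [batch, continuation_len]
    shifted = logprobs[:, -continuation_len - 1:-1, :]            # [batch, continuation_len, vocab]
    token_logprobs = torch.gather(shifted, 2, target_ids.unsqueeze(-1)).squeeze(-1)
    return token_logprobs.sum(dim=-1)                 # [batch]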