
Commit 731251c

Merge branch 'main' into feat/add-kyrgyz-bench
2 parents 63d7541 + 7f50228 commit 731251c

8 files changed: +123 -19 lines changed

Lines changed: 87 additions & 0 deletions
@@ -0,0 +1,87 @@
+name: PR Style Bot
+
+on:
+  issue_comment:
+    types: [created]
+
+permissions:
+  contents: write
+  pull-requests: write
+
+jobs:
+  run-style-bot:
+    if: >
+      contains(github.event.comment.body, '@bot /style') &&
+      github.event.issue.pull_request != null
+
+    runs-on: ubuntu-latest
+
+    steps:
+      - name: Extract PR details
+        id: pr_info
+        uses: actions/github-script@v6
+        with:
+          script: |
+            const prNumber = context.payload.issue.number;
+            const { data: pr } = await github.rest.pulls.get({
+              owner: context.repo.owner,
+              repo: context.repo.repo,
+              pull_number: prNumber
+            });
+
+            // We capture both the branch ref and the "full_name" of the head repo
+            // so that we can check out the correct repository & branch (including forks).
+            core.setOutput("prNumber", prNumber);
+            core.setOutput("headRef", pr.head.ref);
+            core.setOutput("headRepoFullName", pr.head.repo.full_name);
+      - name: Check out PR branch
+        uses: actions/checkout@v3
+        with:
+          repository: ${{ steps.pr_info.outputs.headRepoFullName }}
+          ref: ${{ steps.pr_info.outputs.headRef }}
+          fetch-depth: 0
+          token: ${{ secrets.GITHUB_TOKEN }}
+
+      - name: Set up Python
+        uses: actions/setup-python@v4
+
+      - name: Install dependencies
+        run: pip install .[quality]
+
+      - name: Apply style fixes
+        run: |
+          ruff format .
+          ruff check --fix .
+
+      - name: Commit and push changes
+        id: commit_and_push
+        env:
+          HEADREPOFULLNAME: ${{ steps.pr_info.outputs.headRepoFullName }}
+          HEADREF: ${{ steps.pr_info.outputs.headRef }}
+          PRNUMBER: ${{ steps.pr_info.outputs.prNumber }}
+          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+        run: |
+          git config user.name "github-actions[bot]"
+          git config user.email "github-actions[bot]@users.noreply.github.com"
+          git remote set-url origin "https://x-access-token:${GITHUB_TOKEN}@github.com/${{ env.HEADREPOFULLNAME }}.git"
+          if [ -n "$(git status --porcelain)" ]; then
+            git add .
+            git commit -m "Apply style fixes"
+            git push origin HEAD:${{ env.HEADREF }}
+            echo "changes_pushed=true" >> $GITHUB_OUTPUT
+          else
+            echo "changes_pushed=false" >> $GITHUB_OUTPUT
+          fi
+      - name: Comment on PR
+        if: steps.commit_and_push.outputs.changes_pushed == 'true'
+        uses: actions/github-script@v6
+        with:
+          script: |
+            const prNumber = ${{ steps.pr_info.outputs.prNumber }};
+            const runUrl = `${process.env.GITHUB_SERVER_URL}/${process.env.GITHUB_REPOSITORY}/actions/runs/${process.env.GITHUB_RUN_ID}`;
+            await github.rest.issues.createComment({
+              owner: context.repo.owner,
+              repo: context.repo.repo,
+              issue_number: prNumber,
+              body: `Style fixes have been applied. [View the workflow run here](${runUrl}).`
+            });
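For reference, the "Extract PR details" step above is a single REST call. Below is a minimal Python sketch of the same lookup, assuming the requests package and a GITHUB_TOKEN environment variable; the helper name and arguments are illustrative, not part of this commit.

import os
import requests

def get_pr_head(owner: str, repo: str, pr_number: int) -> dict:
    # Same endpoint the github-script step calls: GET /repos/{owner}/{repo}/pulls/{pull_number}
    resp = requests.get(
        f"https://api.github.com/repos/{owner}/{repo}/pulls/{pr_number}",
        headers={"Authorization": f"Bearer {os.environ['GITHUB_TOKEN']}"},
        timeout=30,
    )
    resp.raise_for_status()
    pr = resp.json()
    # Mirrors the outputs the workflow exports for the checkout step.
    return {"headRef": pr["head"]["ref"], "headRepoFullName": pr["head"]["repo"]["full_name"]}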

docs/source/adding-a-new-metric.mdx

Lines changed: 0 additions & 1 deletion
@@ -155,7 +155,6 @@ from lighteval.tasks.lighteval_task import LightevalTaskConfig
 
 task = LightevalTaskConfig(
     name="my_custom_task",
-    suite=["community"],
     metric=[my_custom_metric],  # Use your custom metric here
     prompt_function=my_prompt_function,
     hf_repo="my_dataset",

pyproject.toml

Lines changed: 3 additions & 0 deletions
@@ -30,6 +30,9 @@ build-backend = "setuptools.build_meta"
 [tool.setuptools.packages.find]
 where = ["src"]
 
+[tool.setuptools.package-data]
+lighteval = ["py.typed"]
+
 [project]
 name = "lighteval"
 version = "0.11.1.dev0"
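The new [tool.setuptools.package-data] entry ships the py.typed marker (added as an empty file later in this commit) inside the built package, so PEP 561-aware type checkers use lighteval's inline annotations instead of treating the import as untyped. A minimal sketch of the downstream effect, with a hypothetical user module:

# Hypothetical user code; with py.typed shipped, mypy/pyright can check these annotations.
from lighteval.tasks.lighteval_task import LightevalTaskConfig

def describe(cfg: LightevalTaskConfig) -> str:
    # Field names taken from the dataclass changed later in this commit.
    return f"{cfg.name}: {cfg.hf_repo}/{cfg.hf_subset}"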

src/lighteval/__main__.py

Lines changed: 1 addition & 0 deletions
@@ -71,6 +71,7 @@
 app.command(rich_help_panel="Evaluation Backends")(lighteval.main_custom.custom)
 app.command(rich_help_panel="Evaluation Backends")(lighteval.main_sglang.sglang)
 app.command(rich_help_panel="Evaluation Backends")(lighteval.main_inspect.eval)
+app.command(rich_help_panel="EvaluationUtils")(lighteval.main_inspect.bundle)
 app.add_typer(
     lighteval.main_endpoint.app,
     name="endpoint",

src/lighteval/main_inspect.py

Lines changed: 8 additions & 0 deletions
@@ -30,6 +30,7 @@
 from inspect_ai import Epochs, Task, task
 from inspect_ai import eval_set as inspect_ai_eval_set
 from inspect_ai.dataset import hf_dataset
+from inspect_ai.log import bundle_log_dir
 from inspect_ai.scorer import exact
 from inspect_ai.solver import generate, system_message
 from pytablewriter import MarkdownTableWriter
@@ -519,6 +520,13 @@ def eval(  # noqa C901
     print("run 'inspect view' to view the results")
 
 
+def bundle(log_dir: str, output_dir: str, overwrite: bool = True, repo_id: str | None = None, public: bool = False):
+    bundle_log_dir(log_dir=log_dir, output_dir=output_dir, overwrite=overwrite)
+
+    if repo_id is not None:
+        push_to_hub(output_dir, repo_id, public=public)
+
+
 if __name__ == "__main__":
     task = "lighteval|gsm8k|5,lighteval|gsm8k|1,lighteval|gsm8k|0"
     task = "lighteval|agieval|0"
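The new bundle helper wraps inspect_ai's bundle_log_dir and optionally pushes the bundled viewer to the Hugging Face Hub. A minimal usage sketch; the paths and repo id below are placeholders, not taken from the commit:

from lighteval.main_inspect import bundle

# Bundle an inspect-ai log directory into a static viewer, then optionally push it to the Hub.
bundle(
    log_dir="logs/",                # placeholder path
    output_dir="bundled_logs/",     # placeholder path
    repo_id="my-org/my-eval-logs",  # placeholder; leave as None to skip the Hub push
    public=False,
)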

src/lighteval/models/transformers/transformers_model.py

Lines changed: 22 additions & 16 deletions
@@ -66,6 +66,9 @@
 
 STARTING_BATCH_SIZE = 512
 
+# Thread local param
+torch.set_grad_enabled(False)
+
 
 class TransformersModelConfig(ModelConfig):
     """Configuration class for HuggingFace Transformers models.
@@ -218,12 +221,6 @@ def __init__(
         if config.model_parallel is False and self.config.dtype not in ["4bit", "8bit"]:
             logger.info(f"Using Data Parallelism, putting model on device {self._device}")
             self.model = self.model.to(self._device)
-        if config.compile:
-            try:
-                logger.info("Compiling the model")
-                self.model.model.compile()
-            except AttributeError as e:
-                logger.warning("Could not compile the model because: ", e)
 
         self.model_name = _simplify_name(config.model_name)
 
@@ -410,7 +407,7 @@ def _create_auto_model(self) -> transformers.PreTrainedModel:
         )
         # model.to(self.device)
         model.eval()
-        torch.set_grad_enabled(False)
+
         if self.continuous_batching:
             generation_config = GenerationConfig(
                 **self.generation_config_dict,
@@ -497,9 +494,6 @@ def _check_continuations_start_space(self, continuation: str) -> str:
             continuation = continuation.lstrip()
         return continuation
 
-    def _model_call(self, inputs: torch.Tensor) -> torch.Tensor:
-        return self.model(inputs).logits
-
     def _get_batch_size(self, max_input_length: int, override_bs: int | None, starting_batch_size: int = 512) -> int:
         if override_bs is not None:
             return override_bs
@@ -509,10 +503,18 @@ def _get_batch_size(self, max_input_length: int, override_bs: int | None, starti
             starting_batch_size=starting_batch_size
         )  # if OOM, then halves batch_size and tries again
         def forward_batch(batch_size):
-            test_batch = torch.ones(
-                (batch_size + int(0.1 * batch_size), max_input_length), device=self.device
-            ).long()  # We add 10% for marging :)
-            F.log_softmax(self._model_call(test_batch).float(), dim=-1).cpu()
+            fake_batch, fake_output = None, None
+            with torch.no_grad():
+                try:
+                    fake_batch = torch.ones((batch_size, max_input_length), device=self.device).int()
+                    fake_output = F.log_softmax(self.model(fake_batch).logits, dim=-1).cpu()
+                except Exception as e:
+                    for fake_item in [fake_batch, fake_output]:
+                        if fake_item is not None:
+                            fake_item.detach()
+                            del fake_item
+
+                    raise e
             return batch_size
 
         batch_size = forward_batch()
@@ -645,10 +647,14 @@ def _padded_greedy_until(
             position=0,
             disable=self.disable_tqdm,
         ):
-            if split[0].generation_size is None:
+            if self.generation_config_dict.get("max_new_tokens", None) is not None:
+                # The user forces a specific generation size
+                max_context_continuation_size_allowed = self.generation_config_dict["max_new_tokens"]
+            elif split[0].generation_size is None:
                 # No constraints on the generation size: max length allowed is the max model context
                 max_context_continuation_size_allowed = self.max_length
             else:
+                # The task forces a specific generation size
                 context = self.prompt_manager.prepare_prompt(split[0])
                 tokenized_context = self.tokenizer(context)
 
@@ -953,7 +959,7 @@ def _loglikelihood_tokens(  # noqa: C901
                 max_context=None,  # computed as model max length in the function
             )
 
-            model_output = self._model_call(prepared_batch.input_ids)
+            model_output = self.model(prepared_batch.input_ids).logits
             logits = F.log_softmax(model_output, dim=-1)  # [batch, sequence_length, vocab]
 
             flat_index = 0
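The reworked forward_batch probe keeps the halve-on-OOM search but now runs under torch.no_grad() and frees the test tensors before re-raising, so a failed probe does not pin GPU memory. A standalone sketch of the same pattern using accelerate's find_executable_batch_size; the model handle and sequence length are stand-ins, not lighteval code:

import torch
import torch.nn.functional as F
from accelerate.utils import find_executable_batch_size

def probe_batch_size(model, max_input_length: int, device, starting_batch_size: int = 512) -> int:
    # Retries with a halved batch_size whenever the forward pass runs out of memory.
    @find_executable_batch_size(starting_batch_size=starting_batch_size)
    def forward_batch(batch_size):
        with torch.no_grad():  # no autograd graph during the probe, mirroring the commit
            fake_batch = torch.ones((batch_size, max_input_length), device=device).int()
            F.log_softmax(model(fake_batch).logits, dim=-1).cpu()
        return batch_size

    return forward_batch()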

src/lighteval/py.typed

Whitespace-only changes.

src/lighteval/tasks/lighteval_task.py

Lines changed: 2 additions & 2 deletions
@@ -60,7 +60,7 @@ class LightevalTaskConfig:
            name as input.
        hf_repo (str): HuggingFace Hub repository path containing the evaluation dataset.
        hf_subset (str): Dataset subset/configuration name to use for this task.
-        metrics (ListLike[Metric]): List of metrics to compute for this task.
+        metrics (ListLike[Metric | Metrics]): List of metrics or metric enums to compute for this task.
 
    Dataset Configuration:
        hf_revision (str | None, optional): Specific dataset revision to use.
@@ -112,7 +112,7 @@
     ]  # The prompt function should be used to map a line in the dataset to a Sample
     hf_repo: str
     hf_subset: str
-    metrics: ListLike[Metric]  # List of metric, should be configurable
+    metrics: ListLike[Metric | Metrics]  # Accept both Metric objects and Metrics enums
 
     # Inspect AI compatible parameters
     solver: None = None
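With the annotation loosened to ListLike[Metric | Metrics], a task config can mix Metric instances and Metrics enum members directly. A hedged sketch; the prompt function, dataset names, and the specific enum member are illustrative, not taken from this commit:

from lighteval.metrics.metrics import Metrics
from lighteval.tasks.lighteval_task import LightevalTaskConfig

def my_prompt_function(line, task_name: str = ""):
    # Placeholder: a real prompt function maps a dataset line to a lighteval sample.
    ...

task = LightevalTaskConfig(
    name="my_custom_task",
    prompt_function=my_prompt_function,
    hf_repo="my_dataset",
    hf_subset="default",
    metrics=[Metrics.exact_match],  # assumed enum member; any Metrics value now type-checks
)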
