VisualPuzzles (EvolvingLMMs-Lab#637)

yueqis · web-flow · commit 43d616f2f6bf · 2025-04-18T01:34:21.000+08:00
diff --git a/lmms_eval/tasks/VisualPuzzles/VisualPuzzles_cot.yaml b/lmms_eval/tasks/VisualPuzzles/VisualPuzzles_cot.yaml
@@ -0,0 +1,29 @@
+# VisualPuzzles, chain-of-thought variant: the prompt suffix is COT_PROMPT from utils.py.
+dataset_path: neulab/VisualPuzzles
+dataset_kwargs:
+  token: True
+task: "VisualPuzzles_cot"
+# Evaluation runs over the dataset's `train` split.
+test_split: train
+output_type: generate_until
+# Image extraction and prompt construction are implemented in utils.py.
+doc_to_visual: !function utils.VisualPuzzles_doc_to_visual
+doc_to_text: !function utils.VisualPuzzles_doc_to_text
+# `answer` is the document field that utils.VisualPuzzles_process_result compares against.
+doc_to_target: "answer"
+# Sampling is disabled (do_sample: false, temperature 0, a single beam).
+generation_kwargs:
+  max_new_tokens: 4096
+  temperature: 0
+  top_p: 1.0
+  num_beams: 1
+  do_sample: false
+# `exact_match` is the 1.0 / 0.0 score returned by process_results, aggregated with mean.
+metric_list:
+  - metric: exact_match
+    aggregation: mean
+    higher_is_better: true
+    ignore_case: true
+    ignore_punctuation: true
+process_results: !function utils.VisualPuzzles_process_result
+metadata:
+  - version: 0.0
+  
+lmms_eval_specific_kwargs:
+  default:
+    # Key into the PROMPTS table in utils.py, not the literal prompt text.
+    prompt: "COT_PROMPT"
+    
diff --git a/lmms_eval/tasks/VisualPuzzles/VisualPuzzles_direct.yaml b/lmms_eval/tasks/VisualPuzzles/VisualPuzzles_direct.yaml
@@ -0,0 +1,29 @@
+# VisualPuzzles, direct-answer variant: the prompt suffix is MULTI_CHOICE_DIRECT_PROMPT from utils.py.
+dataset_path: neulab/VisualPuzzles
+dataset_kwargs:
+  token: True
+task: "VisualPuzzles_direct"
+# Evaluation runs over the dataset's `train` split.
+test_split: train
+output_type: generate_until
+# Image extraction and prompt construction are implemented in utils.py.
+doc_to_visual: !function utils.VisualPuzzles_doc_to_visual
+doc_to_text: !function utils.VisualPuzzles_doc_to_text
+# `answer` is the document field that utils.VisualPuzzles_process_result compares against.
+doc_to_target: "answer"
+# Sampling is disabled (do_sample: false, temperature 0, a single beam).
+generation_kwargs:
+  max_new_tokens: 4096
+  temperature: 0
+  top_p: 1.0
+  num_beams: 1
+  do_sample: false
+# `exact_match` is the 1.0 / 0.0 score returned by process_results, aggregated with mean.
+metric_list:
+  - metric: exact_match
+    aggregation: mean
+    higher_is_better: true
+    ignore_case: true
+    ignore_punctuation: true
+process_results: !function utils.VisualPuzzles_process_result
+metadata:
+  - version: 0.0
+  
+lmms_eval_specific_kwargs:
+  default:
+    # Key into the PROMPTS table in utils.py, not the literal prompt text.
+    prompt: "MULTI_CHOICE_DIRECT_PROMPT"
+    
diff --git a/lmms_eval/tasks/VisualPuzzles/utils.py b/lmms_eval/tasks/VisualPuzzles/utils.py
@@ -0,0 +1,104 @@
+from PIL import Image
+import random
+import numpy as np
+import re
+import json
+import os
+import random
+
+MULTI_CHOICE_DIRECT_PROMPT = "Answer the question with the option's letter from the given choices directly."
+COT_PROMPT = "Solve the multiple-choice question and then answer with the option letter from the given choices. The last line of your response should be of the following format: 'Answer: $LETTER' (without quotes) where LETTER is one of options. Think step by step before answering."
+PROMPTS = {'MULTI_CHOICE_DIRECT_PROMPT': MULTI_CHOICE_DIRECT_PROMPT, 'COT_PROMPT': COT_PROMPT}
+
+def VisualPuzzles_doc_to_visual(doc):
+    return [doc['image']]
+
+def VisualPuzzles_doc_to_text(doc, lmms_eval_specific_kwargs):
+    question = 'Question: ' + doc["question"].strip()
+    options = doc['options']
+    if options != None: question += '\nOptions:\n(A) ' + options[0] + '\n(B) ' + options[1] + '\n(C) ' + options[2] + '\n(D) ' + options[3]
+    else: question += '\nOptions: Choose from (A) (B) (C) (D) in the image.'
+    question += '\n' + PROMPTS[lmms_eval_specific_kwargs['prompt']]
+    return question
+
+def parse_response(response, all_choices, index2ans):
+    """
+    Return the last letter appearing after 'ANSWER:' in the input text.
+    If there's no match, return None.
+    """
+    pattern = r'Answer:\s*\(([A-Za-z])\)' # Answer: (A)
+    matches = re.findall(pattern, response)
+    if matches: 
+        for match in matches[::-1]:
+            if match in all_choices or match.upper() in all_choices: return match
+    pattern = r'(?<!Final )Answer:\s*([A-Za-z])' # Answer: A
+    matches = re.findall(pattern, response)
+    if matches: 
+        for match in matches[::-1]:
+            if match in all_choices or match.upper() in all_choices: return match
+    pattern = r'Answer:\s*([A-Za-z])' # Answer: A
+    matches = re.findall(pattern, response)
+    if matches: 
+        for match in matches[::-1]:
+            if match in all_choices or match.upper() in all_choices: return match
+    pattern = r'\s*\(([A-Za-z])\)'  # e.g., (A) (B) (C) (D)
+    matches = re.findall(pattern, response)
+    if matches: 
+        for match in matches[::-1]:
+            if match in all_choices or match.upper() in all_choices: return match
+    response = ' ' + response.strip()
+    pattern = r'\s*([A-Za-z])\)'   # e.g., A) B) C) D)
+    matches = re.findall(pattern, response)
+    if matches: 
+        for match in matches[::-1]:
+            if match in all_choices or match.upper() in all_choices: return match
+    pattern = r'\s*\{([A-Za-z])\}' # e.g., {A} {B} {C} {D}
+    matches = re.findall(pattern, response)
+    if matches: 
+        for match in matches[::-1]:
+            if match in all_choices or match.upper() in all_choices: return match
+    pattern = r'\s*\$([A-Za-z])\$' # e.g., $A$, $B$, $C$, $D$
+    matches = re.findall(pattern, response)
+    if matches: 
+        for match in matches[::-1]:
+            if match in all_choices or match.upper() in all_choices: return match
+    pattern = r" ([A-Da-d])\." # e.g., A. B. C. D.
+    matches = re.findall(pattern, response)
+    if matches: 
+        for match in matches[::-1]:
+            if match in all_choices or match.upper() in all_choices: 
+                return match
+    pattern = r" ([A-Da-d])" # e.g., A B C D
+    matches = re.findall(pattern, response)
+    if matches and len(response) <= 5: 
+        for match in matches[::-1]:
+            if match in all_choices or match.upper() in all_choices: 
+                return match
+    if index2ans != None:
+        for index in all_choices:
+            ans = index2ans[index]
+            if f'answer: {ans}' in response.lower(): return index
+            if f'answer:{ans}' in response.lower(): return index
+        last_found = None
+        last_index = -1
+        for index in all_choices:
+            ans = index2ans[index]
+            idx = response.rfind(ans)
+            if idx > last_index:
+                last_found = index
+                last_index = idx
+        if last_found: return last_found
+    return random.choice(all_choices)
+
+def VisualPuzzles_process_result(doc, results):
+    print(f"results: {results}")
+    pred = results[0].strip()
+    all_choices = ['A', 'B', 'C', 'D']
+    if doc['options'] == None: index2ans = None
+    else: index2ans = {all_choices[i]: doc['options'][i] for i in range(4)}
+    pred = parse_response(pred, all_choices, index2ans)
+    target = doc['answer']
+    if pred.lower() == target.lower(): return {"exact_match": 1.0}
+    return {"exact_match": 0.0}
+
+