diff --git a/docs/current_tasks.md b/docs/current_tasks.md
index 98fe0e319..5d48c4302 100644
--- a/docs/current_tasks.md
+++ b/docs/current_tasks.md
@@ -245,6 +245,7 @@ python -m lmms_eval --tasks list_with_num
 - egoschema_mcppl
 - egoschema_subset_mcppl
 - egoschema_subset
+- [LEMONADE](https://huggingface.co/datasets/amathislab/LEMONADE) (lemonade)
 - [LongVideoBench](https://github.com/longvideobench/LongVideoBench)
 - [MovieChat](https://github.com/rese1f/MovieChat) (moviechat)
 - Global Mode for entire video (moviechat_global)
diff --git a/lmms_eval/tasks/lemonade/README.md b/lmms_eval/tasks/lemonade/README.md
new file mode 100644
index 000000000..518379188
--- /dev/null
+++ b/lmms_eval/tasks/lemonade/README.md
@@ -0,0 +1,45 @@
+# LEMONADE
+
+## Task Description
+
+**LEMONADE** (Language models Evaluation of MOtion aNd Action-Driven Enquiries) is a QA benchmark extracted from the **EPFL-Smart-Kitchen-30** dataset (see [arXiv](https://arxiv.org/abs/2506.01608)). It consists of **36,521 closed-ended QA pairs** linked to egocentric video clips.
+
+Questions are organized into three groups and six subcategories:
+
+- **Behavior Understanding**
+  - *Perception*: recognizing perceived actions
+  - *Reasoning*: reasoning over unseen behaviors
+- **Long-term Understanding**
+  - *Summarization*: summarizing over longer clips
+  - *Session Properties*: inferring session-level information
+- **Motion & Biomechanics**
+  - *Physical Attributes*: inferring hand shapes, joint angles, etc.
+  - *Kinematics*: inferring trajectory velocities
+
+In the associated publication, the benchmark was evaluated with **`lmms-eval`**.
+
+
+## Implementation
+
+- **utils.py**: Handles data loading from Hugging Face, video loading, answer parsing, and metric evaluation.
+- **lemonade.yaml**: Contains the default prompts and evaluation settings.
+
+When running LEMONADE through `lmms-eval`, the data is downloaded automatically. For direct dataset access, please refer to [Hugging Face](https://huggingface.co/datasets/amathislab/LEMONADE) or [Zenodo](https://zenodo.org/records/15535461).
+
+Performance is evaluated as accuracy against the ground-truth answers, with results reported overall as well as per category, subcategory, and difficulty level.
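+
+The snippet below is a minimal, illustrative sketch of direct access to the QA annotations. It assumes the annotations are exposed through the standard `datasets` API (videos are distributed separately, see the links above); the field names follow those used in `utils.py`:
+
+```python
+import ast
+
+from datasets import load_dataset
+
+# Assumption: the QA annotations load via the standard datasets API.
+lemonade = load_dataset("amathislab/LEMONADE", split="test")
+sample = lemonade[0]
+
+# Mirror the multiple-choice prompt layout that utils.py produces.
+options = ast.literal_eval(sample["Answers"])  # options are stored as a string-encoded list
+letters = [chr(ord("A") + i) for i in range(len(options))]
+prompt = "Question: " + sample["Question"] + "\nChoices:\n"
+prompt += "\n".join(f"{letter}. {option}" for letter, option in zip(letters, options))
+
+print(prompt)
+print("Ground truth:", sample["Correct Answer"])
+```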
+
+## Citation
+
+If you use **LEMONADE**, please cite:
+
+```bibtex
+@misc{bonnetto2025epflsmartkitchen,
+      title={EPFL-Smart-Kitchen-30: Densely annotated cooking dataset with 3D kinematics to challenge video and language models},
+      author={Andy Bonnetto and Haozhe Qi and Franklin Leong and Matea Tashkovska and Mahdi Rad and Solaiman Shokur and Friedhelm Hummel and Silvestro Micera and Marc Pollefeys and Alexander Mathis},
+      year={2025},
+      eprint={2506.01608},
+      archivePrefix={arXiv},
+      primaryClass={cs.CV},
+      url={https://arxiv.org/abs/2506.01608},
+}
+```
\ No newline at end of file
diff --git a/lmms_eval/tasks/lemonade/lemonade.yaml b/lmms_eval/tasks/lemonade/lemonade.yaml
new file mode 100644
index 000000000..e4263faf5
--- /dev/null
+++ b/lmms_eval/tasks/lemonade/lemonade.yaml
@@ -0,0 +1,28 @@
+dataset_path: amathislab/LEMONADE
+dataset_kwargs:
+  video: true
+  cache_dir: lemonade_data
+  force_unzip: true
+task: "lemonade"
+test_split: test
+output_type: generate_until
+doc_to_visual: !function utils.lemonade_doc_to_visual
+doc_to_text: !function utils.lemonade_doc_to_text
+doc_to_target: "Correct Answer"
+
+generation_kwargs:
+  max_new_tokens: 128
+  temperature: 0
+  do_sample: false
+
+process_results: !function utils.lemonade_process_results
+metric_list:
+  - metric: acc
+    aggregation: !function utils.lemonade_aggregate_results
+    higher_is_better: true
+
+lmms_eval_specific_kwargs:
+  default:
+    pre_prompt: "Answer the following multiple-choice question using the given images.\n"
+    post_prompt: "\nRespond only with the letter of the correct answer."
+  max_num_frames: 8
\ No newline at end of file
diff --git a/lmms_eval/tasks/lemonade/utils.py b/lmms_eval/tasks/lemonade/utils.py
new file mode 100644
index 000000000..ad43fc05d
--- /dev/null
+++ b/lmms_eval/tasks/lemonade/utils.py
@@ -0,0 +1,335 @@
+import ast
+import os
+import cv2
+import numpy as np
+import zipfile
+import yaml
+from collections import defaultdict
+from pathlib import Path
+from PIL import Image
+from typing import Any, Optional
+from huggingface_hub import hf_hub_download
+
+with open(Path(__file__).parent / "lemonade.yaml", "r") as f:
+    raw_data = f.readlines()
+    safe_data = []
+    for line in raw_data:
+        if "!function" not in line:
+            safe_data.append(line)
+
+    config = yaml.safe_load("".join(safe_data))
+
+HF_HOME = os.getenv("HF_HOME", "~/.cache/huggingface/")
+base_cache_dir = os.path.expanduser(HF_HOME)
+cache_dir = config["dataset_kwargs"]["cache_dir"]
+videos_dir = os.path.join(base_cache_dir, cache_dir)
+
+max_num_frames = config.get("lmms_eval_specific_kwargs", {}).get("max_num_frames", 8)
+
+
+def load_video(video_file: str, start_frame: int, end_frame: int, max_num_frames: int = max_num_frames) -> list[Image.Image]:
+    """
+    Args:
+        video_file: Path to the video file.
+        start_frame: Starting frame index.
+        end_frame: Ending frame index.
+        max_num_frames: Number of frames to sample from the video segment.
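+            Frames are sampled approximately uniformly over [start_frame, end_frame];
+            if the segment contains fewer frames than this, all of its frames are used.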
+
+    Returns:
+        List of PIL Image objects representing sampled frames.
+    """
+
+    cap = cv2.VideoCapture(video_file)
+    try:
+        total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
+        start_frame = max(0, start_frame)
+        end_frame = min(end_frame, total_frames - 1)
+        total_valid_frames = end_frame - start_frame + 1
+        num_frames = min(max_num_frames, total_valid_frames)
+        step = total_valid_frames / num_frames
+        frame_indices = [int(start_frame + i * step) for i in range(num_frames)]
+        frames = []
+        for target_idx in frame_indices:
+            cap.set(cv2.CAP_PROP_POS_FRAMES, target_idx)
+            success, frame = cap.read()
+            if not success:
+                continue
+            frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
+            pil_img = Image.fromarray(frame_rgb).convert("RGB")
+            frames.append(pil_img)
+
+        return frames
+    finally:
+        cap.release()
+
+
+def parse_options(options: list[str]) -> str:
+    """
+    Format a list of multiple-choice options into a string.
+    The function assigns letters to each option and returns them in a newline-separated string.
+
+    Args:
+        options (list[str]): A list of option strings.
+
+    Returns:
+        str: A formatted string with each option on a new line, prefixed by its corresponding letter.
+    """
+
+    option_letters = [chr(ord("A") + i) for i in range(len(options))]
+
+    if all(option.startswith(f"{letter}.") for option, letter in zip(options, option_letters)):
+        return "\n".join(options)
+
+    choices_str = "\n".join([f"{option_letter}. {option}" for option_letter, option in zip(option_letters, options)])
+    return choices_str
+
+
+def lemonade_doc_to_visual(doc: dict[str, Any]) -> list[Image.Image]:
+    """
+    Load video frames for a given entry in the LEMONADE dataset.
+
+    Args:
+        doc: A dictionary representing an entry in the dataset.
+
+    Returns:
+        frames: List of PIL Image objects representing sampled frames.
+    """
+
+    video_filename = doc["Clip"] + "_hololens.mp4"
+    video_path = os.path.join(videos_dir, video_filename)
+
+    if os.path.exists(video_path):
+        start = int(doc["Start"])
+        end = int(doc["End"])
+        frames = load_video(video_path, start, end, max_num_frames=max_num_frames)
+    else:
+        raise FileNotFoundError(
+            f"Video file not found: {video_path}. "
+            f"Expected video for clip '{doc['Clip']}' at {video_path}"
+        )
+    return frames
+
+
+def lemonade_doc_to_text(doc: dict[str, Any], lmms_eval_specific_kwargs: Optional[dict[str, Any]] = None) -> str:
+    """
+    Convert a LEMONADE dataset entry into a formatted text prompt.
+
+    Args:
+        doc: A dictionary representing an entry in the dataset.
+        lmms_eval_specific_kwargs: Optional dictionary for additional prompt formatting.
+
+    Returns:
+        str: A formatted prompt string ready for model input.
+    """
+
+    if lmms_eval_specific_kwargs is None:
+        lmms_eval_specific_kwargs = {}
+
+    pre_prompt = lmms_eval_specific_kwargs.get("pre_prompt", "")
+    post_prompt = lmms_eval_specific_kwargs.get("post_prompt", "")
+
+    question = "Question: " + doc["Question"]
+    parsed_options = parse_options(ast.literal_eval(doc["Answers"]))
+    choices = "Choices:\n" + parsed_options
+
+    return f"{pre_prompt}{question}\n{choices}{post_prompt}"
+
+
+def get_multi_choice_info(options: list[str]) -> tuple[dict[str, str], list[str]]:
+    """
+    Map a list of options to letter labels (A, B, C, ...).
+
+    Args:
+        options: The set of answer options.
+
+    Returns:
+        tuple[dict[str, str], list[str]]:
+            - index2ans: Mapping from letters to option text.
+            - all_choices: List of the assigned letters.
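+
+    Example:
+        >>> get_multi_choice_info(["red", "green", "blue"])
+        ({'A': 'red', 'B': 'green', 'C': 'blue'}, ['A', 'B', 'C'])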
+    """
+
+    if not isinstance(options, list):
+        raise TypeError(f"Expected list of options, got {type(options)}: {options}")
+
+    start_chr = "A"
+    all_choices = []
+    index2ans = {}
+    for i, option in enumerate(options):
+        index2ans[chr(ord(start_chr) + i)] = option
+        all_choices.append(chr(ord(start_chr) + i))
+
+    return index2ans, all_choices
+
+
+def parse_multi_choice_response(response: str, all_choices: list[str], index2ans: dict[str, str]) -> str:
+    """
+    Parse a model response and return the predicted choice label (e.g., "A", "B", "C", "D").
+
+    Args:
+        response (str): The generated response to parse.
+        all_choices (list[str]): The set of valid choice labels.
+        index2ans (dict[str, str]): Mapping from choice labels to their full answer text.
+
+    Returns:
+        str: The predicted choice label.
+    """
+
+    if response == "API Error":
+        return "API Error"
+
+    if response == "":
+        return "Empty Response"
+
+    # Strip punctuation from the ends of the response and pad with spaces so that
+    # bare letters can be matched as " A ", " B ", etc.
+    for char in [",", ".", "!", "?", ";", ":", "'"]:
+        response = response.strip(char)
+    response = " " + response + " "
+
+    index_ans = True
+    ans_with_brack = False
+    ans_with_period = False
+    ans_with_colon = False
+    candidates = []
+
+    # Look for labels in the form "A.", "A:", "(A)", or a bare "A ".
+    for choice in all_choices:
+        if f"{choice}." in response:
+            candidates.append(choice)
+            ans_with_period = True
+    for choice in all_choices:
+        if f"{choice}:" in response:
+            candidates.append(choice)
+            ans_with_colon = True
+    if len(candidates) == 0:
+        for choice in all_choices:
+            if f"({choice})" in response:
+                candidates.append(choice)
+                ans_with_brack = True
+    if len(candidates) == 0:
+        for choice in all_choices:
+            if f"{choice} " in response:
+                candidates.append(choice)
+    # For longer responses, fall back to matching the full answer text.
+    if len(candidates) == 0 and len(response.split()) > 5:
+        for index, ans in index2ans.items():
+            if ans.lower() in response.lower():
+                candidates.append(index)
+                index_ans = False
+
+    if len(candidates) == 0:
+        pred_index = "A"
+    elif len(candidates) > 1:
+        # Several labels matched: keep the one mentioned last in the response.
+        start_indexes = []
+        if index_ans:
+            if ans_with_period:
+                for can in candidates:
+                    index = response.rfind(f"{can}.")
+                    start_indexes.append(index)
+            elif ans_with_colon:
+                for can in candidates:
+                    index = response.rfind(f"{can}:")
+                    start_indexes.append(index)
+            elif ans_with_brack:
+                for can in candidates:
+                    index = response.rfind(f"({can})")
+                    start_indexes.append(index)
+            else:
+                for can in candidates:
+                    index = response.rfind(f" {can} ")
+                    start_indexes.append(index)
+        else:
+            for can in candidates:
+                index = response.lower().rfind(index2ans[can].lower())
+                start_indexes.append(index)
+        pred_index = candidates[np.argmax(start_indexes)]
+    else:
+        pred_index = candidates[0]
+
+    return pred_index
+
+
+def lemonade_process_results(doc: dict[str, Any], results: list[Any]) -> dict[str, dict]:
+    """
+    Process the results from the model and compute accuracy.
+
+    Args:
+        doc: A dictionary representing an entry in the dataset.
+        results: List of model outputs.
+
+    Returns:
+        A dictionary containing accuracy information.
+    """
+
+    pred = results[0]
+    index2ans, all_choices = get_multi_choice_info(ast.literal_eval(doc["Answers"]))
+    parsed_pred = parse_multi_choice_response(pred, all_choices, index2ans)
+
+    acc = {
+        "QID": doc["QID"],
+        "category": doc["Category"],
+        "subcategory": doc["Subcategory"],
+        "difficulty": doc["Difficulty"],
+        "answer": doc["Correct Answer"],
+        "parsed_pred": parsed_pred,
+        "original_pred": pred,
+    }
+    return {"acc": acc}
+
+
+def lemonade_aggregate_results(results: list[dict[str, Any]]) -> float:
+    """
+    Aggregate the results from the evaluation.
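+    Results are grouped by QID, category, subcategory, and difficulty; per-group
+    accuracy with a binomial standard error (sqrt(acc * (1 - acc) / n)) is printed,
+    and the overall accuracy over all non-"API Error" predictions is returned.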
+
+    Args:
+        results: List of dicts containing individual evaluation results.
+
+    Returns:
+        overall_acc: Overall accuracy.
+    """
+
+    def compute_accuracy(grouped_results):
+        acc_dict = {}
+        for key, samples in grouped_results.items():
+            correct = sum([r["parsed_pred"] == r["answer"] for r in samples])
+            total = len(samples)
+            acc = round(correct / total, 5) if total > 0 else 0.0
+            stderr = round(np.sqrt(acc * (1 - acc) / total), 5) if total > 0 else 0.0
+            acc_dict[key] = {
+                "num": total,
+                "acc": acc,
+                "acc_stderr": stderr,
+            }
+        return acc_dict
+
+    qid_results = defaultdict(list)
+    category_results = defaultdict(list)
+    subcategory_results = defaultdict(list)
+    difficulty_results = defaultdict(list)
+
+    valid_results = [r for r in results if r["parsed_pred"] != "API Error"]
+
+    for r in valid_results:
+        qid_results[r["QID"]].append(r)
+        category_results[r["category"]].append(r)
+        subcategory_results[r["subcategory"]].append(r)
+        difficulty_results[r["difficulty"]].append(r)
+
+    qid_acc = compute_accuracy(qid_results)
+    category_acc = compute_accuracy(category_results)
+    subcategory_acc = compute_accuracy(subcategory_results)
+    difficulty_acc = compute_accuracy(difficulty_results)
+
+    total_correct = sum([r["parsed_pred"] == r["answer"] for r in valid_results])
+    total = len(valid_results)
+    overall_acc = round(total_correct / total, 5) if total > 0 else 0.0
+    overall_stderr = round(np.sqrt(overall_acc * (1 - overall_acc) / total), 5) if total > 0 else 0.0
+
+    print("\nResults:")
+
+    print("\nAccuracy per QID:")
+    for k, v in qid_acc.items():
+        print(f"  {k}: {v['acc']} ± {v['acc_stderr']} ({v['num']} examples)")
+
+    print("\nAccuracy per Category:")
+    for k, v in category_acc.items():
+        print(f"  {k}: {v['acc']} ± {v['acc_stderr']} ({v['num']} examples)")
+
+    print("\nAccuracy per Subcategory:")
+    for k, v in subcategory_acc.items():
+        print(f"  {k}: {v['acc']} ± {v['acc_stderr']} ({v['num']} examples)")
+
+    print("\nAccuracy per Difficulty:")
+    for k, v in difficulty_acc.items():
+        print(f"  {k}: {v['acc']} ± {v['acc_stderr']} ({v['num']} examples)")
+
+    print(f"\nOverall Accuracy: {overall_acc} ± {overall_stderr} ({total} examples)")
+
+    return overall_acc
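+
+
+if __name__ == "__main__":
+    # Illustrative smoke test of the multiple-choice helpers. It is not used by
+    # lmms-eval, and the option strings below are made up purely for demonstration.
+    demo_options = ["chopping board", "frying pan", "kettle", "whisk"]
+    index2ans, all_choices = get_multi_choice_info(demo_options)
+    print(parse_options(demo_options))
+    # A verbose answer should still resolve to the letter of the matched option ("C").
+    print(parse_multi_choice_response("I think the answer is (C) kettle.", all_choices, index2ans))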