diff --git a/CHANGELOG.md b/CHANGELOG.md
index 55b92591a..339cea16b 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -50,6 +50,9 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ### Other
 
+- Added vision support to release evals so we can run the foundational examples
+  12 series.
+
 - Added foundational example `15a-switch-languages.py` to release evals. It is
   able to detect if we switched the language properly.
 
diff --git a/scripts/evals/assets/cat.jpg b/scripts/evals/assets/cat.jpg
new file mode 100644
index 000000000..700b5fc92
Binary files /dev/null and b/scripts/evals/assets/cat.jpg differ
diff --git a/scripts/evals/eval.py b/scripts/evals/eval.py
index c12f75fa5..b91f27c6e 100644
--- a/scripts/evals/eval.py
+++ b/scripts/evals/eval.py
@@ -4,7 +4,6 @@
 # SPDX-License-Identifier: BSD 2-Clause License
 #
 
-import argparse
 import asyncio
 import io
 import os
@@ -13,11 +12,12 @@ import time
 import wave
 from datetime import datetime
 from pathlib import Path
-from typing import List, Optional
+from typing import List, Optional, Tuple
 
 import aiofiles
 from deepgram import LiveOptions
 from loguru import logger
+from PIL.ImageFile import ImageFile
 from utils import (
     EvalResult,
     load_module_from_path,
@@ -30,7 +30,7 @@ from pipecat.adapters.schemas.function_schema import FunctionSchema
 from pipecat.adapters.schemas.tools_schema import ToolsSchema
 from pipecat.audio.vad.silero import SileroVADAnalyzer
 from pipecat.audio.vad.vad_analyzer import VADParams
-from pipecat.frames.frames import EndTaskFrame
+from pipecat.frames.frames import EndTaskFrame, OutputImageRawFrame
 from pipecat.pipeline.pipeline import Pipeline
 from pipecat.pipeline.runner import PipelineRunner
 from pipecat.pipeline.task import PipelineParams, PipelineTask
@@ -49,6 +49,8 @@ SCRIPT_DIR = Path(__file__).resolve().parent
 PIPELINE_IDLE_TIMEOUT_SECS = 60
 EVAL_TIMEOUT_SECS = 90
 
+EvalPrompt = str | Tuple[str, ImageFile]  # plain prompt, or (prompt, image) for vision evals
+
 
 class EvalRunner:
     def __init__(
@@ -87,7 +89,7 @@ class EvalRunner:
     async def assert_eval_false(self):
         await self._queue.put(False)
 
-    async def run_eval(self, example_file: str, prompt: str, eval: Optional[str] = None):
+    async def run_eval(self, example_file: str, prompt: EvalPrompt, eval: Optional[str] = None):
         if not re.match(self._pattern, example_file):
             return
 
@@ -178,6 +180,7 @@ async def run_example_pipeline(script_path: Path):
         DailyParams(
             audio_in_enabled=True,
             audio_out_enabled=True,
+            video_in_enabled=True,
             vad_analyzer=SileroVADAnalyzer(),
         ),
     )
@@ -189,7 +192,10 @@ async def run_example_pipeline(script_path: Path):
 
 
 async def run_eval_pipeline(
-    eval_runner: EvalRunner, example_file: str, prompt: str, eval: Optional[str]
+    eval_runner: EvalRunner,
+    example_file: str,
+    prompt: EvalPrompt,
+    eval: Optional[str],
 ):
     logger.info(f"Starting eval bot")
 
@@ -202,6 +208,7 @@ async def run_eval_pipeline(
         DailyParams(
             audio_in_enabled=True,
             audio_out_enabled=True,
+            video_out_enabled=True,
             vad_analyzer=SileroVADAnalyzer(params=VADParams(stop_secs=2.0)),
         ),
     )
@@ -242,6 +249,14 @@ async def run_eval_pipeline(
     )
     tools = ToolsSchema(standard_tools=[eval_function])
 
+    # Load example prompt depending on image.
+    example_prompt = ""
+    example_image: Optional[ImageFile] = None
+    if isinstance(prompt, str):
+        example_prompt = prompt
+    elif isinstance(prompt, tuple):
+        example_prompt, example_image = prompt
+
     # See if we need to include an eval prompt.
     eval_prompt = ""
     if eval:
@@ -250,7 +265,7 @@ async def run_eval_pipeline(
     messages = [
         {
             "role": "system",
-            "content": f"You are an LLM eval, be extremly brief. Your goal is to only ask one question: {prompt}. Call the eval function only if the user answers the question and check if the answer is correct (words as numbers are valid). {eval_prompt}",
+            "content": f"You are an LLM eval, be extremly brief. Your goal is to only ask one question: {example_prompt}. Call the eval function only if the user answers the question and check if the answer is correct (words as numbers are valid). {eval_prompt}",
         },
     ]
 
@@ -288,6 +303,14 @@ async def run_eval_pipeline(
     @transport.event_handler("on_client_connected")
     async def on_client_connected(transport, client):
         logger.info(f"Client connected")
+        if example_image:
+            await task.queue_frame(
+                OutputImageRawFrame(
+                    image=example_image.convert("RGB").tobytes(),  # match the declared format
+                    size=example_image.size,
+                    format="RGB",
+                )
+            )
         await audio_buffer.start_recording()
 
     @transport.event_handler("on_client_disconnected")
diff --git a/scripts/evals/run-release-evals.py b/scripts/evals/run-release-evals.py
index 5aac70c6b..6b6ac7eda 100644
--- a/scripts/evals/run-release-evals.py
+++ b/scripts/evals/run-release-evals.py
@@ -13,12 +13,15 @@ from pathlib import Path
 from dotenv import load_dotenv
 from eval import EvalRunner
 from loguru import logger
+from PIL import Image
 from utils import check_env_variables
 
 load_dotenv(override=True)
 
 SCRIPT_DIR = Path(__file__).resolve().parent
 
+ASSETS_DIR = SCRIPT_DIR / "assets"
+
 FOUNDATIONAL_DIR = SCRIPT_DIR.parent.parent / "examples" / "foundational"
 
 
@@ -39,6 +42,10 @@ EVAL_ONLINE_SEARCH = f"Today is {datetime.now(timezone.utc).strftime('%B %d, %Y'
 PROMPT_SWITCH_LANGUAGE = "Say something in Spanish."
 EVAL_SWITCH_LANGUAGE = "Check if the user is now talking in Spanish."
 
+# Vision
+PROMPT_VISION = ("What do you see?", Image.open(ASSETS_DIR / "cat.jpg"))
+EVAL_VISION = "A cat description."
+
 TESTS_07 = [
     # 07 series
     ("07-interruptible.py", PROMPT_SIMPLE_MATH, None),
@@ -81,6 +88,13 @@ TESTS_07 = [
     # ("07u-interruptible-ultravox.py", PROMPT_SIMPLE_MATH, None),
 ]
 
+TESTS_12 = [
+    ("12-describe-video.py", PROMPT_VISION, EVAL_VISION),
+    ("12a-describe-video-gemini-flash.py", PROMPT_VISION, EVAL_VISION),
+    ("12b-describe-video-gpt-4o.py", PROMPT_VISION, EVAL_VISION),
+    ("12c-describe-video-anthropic.py", PROMPT_VISION, EVAL_VISION),
+]
+
 TESTS_14 = [
     ("14-function-calling.py", PROMPT_WEATHER, EVAL_WEATHER),
     ("14a-function-calling-anthropic.py", PROMPT_WEATHER, EVAL_WEATHER),
@@ -142,6 +156,7 @@ TESTS_43 = [
 
 TESTS = [
     *TESTS_07,
+    *TESTS_12,
     *TESTS_14,
     *TESTS_15,
     *TESTS_19,
diff --git a/src/pipecat/transports/services/daily.py b/src/pipecat/transports/services/daily.py
index daf879be7..77f7f5e29 100644
--- a/src/pipecat/transports/services/daily.py
+++ b/src/pipecat/transports/services/daily.py
@@ -560,7 +560,7 @@ class DailyTransportClient(EventHandler):
         self._out_sample_rate = self._params.audio_out_sample_rate or frame.audio_out_sample_rate
 
         if self._params.audio_in_enabled:
-            if self._params.audio_in_user_tracks and not self._audio_task:
+            if self._params.audio_in_user_tracks and not self._audio_task and self._task_manager:
                 self._audio_queue = WatchdogQueue(self._task_manager)
                 self._audio_task = self._task_manager.create_task(
                     self._callback_task_handler(self._audio_queue),