demo
This commit is contained in:
parent 8b8baf9151
commit 76f394b852

bun.lock | 11
@@ -146,6 +146,7 @@
      "@workshop/shared": "workspace:*",
      "hono": "catalog:",
      "luxon": "^3.7.1",
      "pixabay-api": "^1.0.4",
      "pngjs": "^7.0.0",
      "tailwind": "^4.0.0",
      "zod": "catalog:",
@@ -284,6 +285,8 @@

    "available-typed-arrays": ["available-typed-arrays@1.0.7", "", { "dependencies": { "possible-typed-array-names": "^1.0.0" } }, "sha512-wvUjBtSGN7+7SjNpq/9M2Tg350UZD3q62IFZLbRAR1bSMlCo1ZaeW+BJ+D090e4hIIZLBcTDWe4Mh4jvUDajzQ=="],

    "axios": ["axios@0.16.2", "", { "dependencies": { "follow-redirects": "^1.2.3", "is-buffer": "^1.1.5" } }, "sha512-IMYFDrcVbUksQhsMYtWCM6KdNaDpr1NY56dpzaIgj92ecPVI29bf2sOgAf8aGTiq8UoixJD61Pj0Ahej5DPv7w=="],

    "babel-runtime": ["babel-runtime@6.26.0", "", { "dependencies": { "core-js": "^2.4.0", "regenerator-runtime": "^0.11.0" } }, "sha512-ITKNuq2wKlW1fJg9sSW52eepoYgZBggvOAHC0u/CYu/qxQ9EVzThCgR69BnSXLHjy2f7SY5zaQ4yt7H9ZVxY2g=="],

    "base64-js": ["base64-js@1.5.1", "", {}, "sha512-AKpaYlHn8t4SVbOHCy+b5+KKgvR4vrsD8vbvrbiQJps7fKDTkjkDry6ji0rUJjC0kzbNePLwzxq8iypo41qeWA=="],
@@ -424,6 +427,8 @@

    "flaschenpost": ["flaschenpost@1.1.3", "", { "dependencies": { "@babel/runtime": "7.2.0", "app-root-path": "2.1.0", "babel-runtime": "6.26.0", "chalk": "2.4.1", "find-root": "1.1.0", "lodash": "4.17.11", "moment": "2.22.2", "processenv": "1.1.0", "split2": "3.0.0", "stack-trace": "0.0.10", "stringify-object": "3.3.0", "untildify": "3.0.3", "util.promisify": "1.0.0", "varname": "2.0.3" }, "bin": { "flaschenpost-uncork": "dist/bin/flaschenpost-uncork.js", "flaschenpost-normalize": "dist/bin/flaschenpost-normalize.js" } }, "sha512-1VAYPvDsVBGFJyUrOa/6clnJwZYC3qVq9nJLcypy6lvaaNbo1wOQiH8HQ+4Fw/k51pVG7JHzSf5epb8lmIW86g=="],

    "follow-redirects": ["follow-redirects@1.15.9", "", {}, "sha512-gew4GsXizNgdoRyqmyfMHyAmXsZDk6mHkSxZFCzW9gwlbtOW44CDtYavM+y+72qD/Vq2l550kMF52DT8fOLJqQ=="],

    "for-each": ["for-each@0.3.5", "", { "dependencies": { "is-callable": "^1.2.7" } }, "sha512-dKx12eRCVIzqCxFGplyFKJMPvLEWgmNtUrpTiJIR5u97zEhRG8ySrtboPHZXx7daLxQVrl643cTzbab2tkQjxg=="],

    "formats": ["formats@1.0.0", "", {}, "sha512-For0Y8egwEK96JgJo4NONErPhtl7H2QzeB2NYGmzeGeJ8a1JZqPgLYOtM3oJRCYhmgsdDFd6KGRYyfe37XY4Yg=="],
@@ -498,6 +503,8 @@

    "is-boolean-object": ["is-boolean-object@1.2.2", "", { "dependencies": { "call-bound": "^1.0.3", "has-tostringtag": "^1.0.2" } }, "sha512-wa56o2/ElJMYqjCjGkXri7it5FbebW5usLw/nPmCMs5DeZ7eziSYZhSmPRn0txqeW4LnAmQQU7FgqLpsEFKM4A=="],

    "is-buffer": ["is-buffer@1.1.6", "", {}, "sha512-NcdALwpXkTm5Zvvbk7owOUSvVvBKDgKP5/ewfXEznmQFfs4ZRmanOeKBTjRVjka3QFoN6XJ+9F3USqfHqTaU5w=="],

    "is-callable": ["is-callable@1.2.7", "", {}, "sha512-1BC0BVFhS/p0qtw6enp8e+8OD0UrK0oFLztSjNzhcKA3WDuJxxAPXzPuPtKkjEY9UUoEWlX/8fgKeu2S8i9JTA=="],

    "is-data-view": ["is-data-view@1.0.2", "", { "dependencies": { "call-bound": "^1.0.2", "get-intrinsic": "^1.2.6", "is-typed-array": "^1.1.13" } }, "sha512-RKtWF8pGmS87i2D6gqQu/l7EYRlVdfzemCJN/P3UOs//x1QE7mfhvzHIApBTRf7axvT6DMGwSwBXYCT0nfB9xw=="],
@@ -644,6 +651,8 @@

    "path-to-regexp": ["path-to-regexp@0.1.7", "", {}, "sha512-5DFkuoqlv1uYQKxy8omFBeJPQcdoE07Kv2sferDCrAq1ohOU+MSDswDIbnx3YAM60qIOnYa53wBhXW0EbMonrQ=="],

    "pixabay-api": ["pixabay-api@1.0.4", "", { "dependencies": { "@types/node": "^8.0.4", "axios": "^0.16.2" } }, "sha512-OmV0ciG+Ouosn8csp8fBta32HFAfYurKUYb4vgZphIiPneXHS4x3bNilSWiWpU7SdWAGBXnAKQWsl1s2g7E8eQ=="],

    "pkce-challenge": ["pkce-challenge@5.0.0", "", {}, "sha512-ueGLflrrnvwB3xuo/uGob5pd5FN7l0MsLf0Z87o/UQmRtwjvfylfc9MurIxRAWywCYTgrvpXBcqjV4OfCYGCIQ=="],

    "pngjs": ["pngjs@7.0.0", "", {}, "sha512-LKWqWJRhstyYo9pGvgor/ivk2w94eSjE3RGVuzLGlr3NmD8bf7RcYGze1mNdEHRP6TRP6rMuDHk5t44hnTRyow=="],
@@ -880,6 +889,8 @@

    "morgan/debug": ["debug@2.6.9", "", { "dependencies": { "ms": "2.0.0" } }, "sha512-bC7ElrdJaJnPbAP+1EotYvqZsb3ecl5wi6Bfi6BJTUcNowp6cvspg0jXznRTKDjm/E7AdgFBVeAPVMNcKGsHMA=="],

    "pixabay-api/@types/node": ["@types/node@8.10.66", "", {}, "sha512-tktOkFUA4kXx2hhhrB8bIFb5TbwzS4uOhKEmwiD+NoiL0qtP2OQ9mFldbgD4dV1djrlBYP6eBuQZiWjuHUpqFw=="],

    "router/depd": ["depd@2.0.0", "", {}, "sha512-g7nH6P6dyDioJogAAGprGpCtVImJhpPk/roCzdb3fIh61/s/nPsfR6onyMwkCAR/OlC3yBC0lESvUoQEAssIrw=="],

    "router/path-to-regexp": ["path-to-regexp@8.2.0", "", {}, "sha512-TdrF7fW9Rphjq4RjrW0Kp2AW0Ahwu9sRGTkS6bvDi0SCwZlEZYmcfDbEsTz8RVk0EHIS/Vd1bv3JhG+1xZuAyQ=="],
@@ -1,15 +1,5 @@
# project-whitespace
# whitespace

To install dependencies:
# Demo

```bash
bun install
```

To run:

```bash
bun run index.ts
```

This project was created using `bun init` in bun v1.2.18. [Bun](https://bun.sh) is a fast all-in-one JavaScript runtime.
https://share.cleanshot.com/94tmJzCw
@@ -17,6 +17,7 @@
    "@workshop/shared": "workspace:*",
    "hono": "catalog:",
    "luxon": "^3.7.1",
    "pixabay-api": "^1.0.4",
    "pngjs": "^7.0.0",
    "tailwind": "^4.0.0",
    "zod": "catalog:"
Binary file not shown. (Before: 987 KiB, After: 842 KiB)
@@ -1,23 +0,0 @@
import { tool, RealtimeAgent } from "@openai/agents/realtime"
import { run } from "@openai/agents"

// 1. Define a tool to fetch the latest whiteboard image
const fetchWhiteboard = tool({
  name: "fetchWhiteboard",
  description: "Fetch the latest whiteboard image and return its bytes",
  parameters: undefined,
  execute: async () => {
    return await Bun.file("public/whiteboard.png").arrayBuffer()
  },
})

async function main() {
  const agent = new RealtimeAgent({
    name: "Spike",
    instructions: "When asked to analyze the whiteboard, call fetchWhiteboard",
    tools: [fetchWhiteboard],
  })

  const result = await run(agent, "Hey Spike, analyze the whiteboard.")
  console.log("Agent response:", result.finalOutput)
}
@@ -1,132 +0,0 @@
import cvReady from "@techstark/opencv-js"
import { PNG } from "pngjs"

type Element = {
  ymin: number
  xmin: number
  ymax: number
  xmax: number
  label: string
}
type StructuredResponse = { elements: Element[] }

export const detectShapes = async (
  imageBuffer: ArrayBuffer,
  minAreaPercent = 5,
  maxAreaPercent = 33
): Promise<StructuredResponse> => {
  const cv = await cvReady

  // 1. Load & decode PNG → raw RGBA buffer
  const buf = Buffer.from(imageBuffer)
  const { width, height, data } = PNG.sync.read(buf)

  // 2. Create a 4-ch Mat from RGBA pixels
  const srcRGBA = cv.matFromArray(height, width, cv.CV_8UC4, new Uint8Array(data))

  // 3. Convert → gray → blur → threshold
  const gray = new cv.Mat()
  cv.cvtColor(srcRGBA, gray, cv.COLOR_RGBA2GRAY)
  cv.GaussianBlur(gray, gray, new cv.Size(5, 5), 0)

  const thresh = new cv.Mat()
  cv.adaptiveThreshold(gray, thresh, 255, cv.ADAPTIVE_THRESH_GAUSSIAN_C, cv.THRESH_BINARY_INV, 11, 2)
  // Morphological opening to remove small noise
  const removeNoise = (mat: cvReady.Mat, kSize = 3) => {
    const kernel = cv.getStructuringElement(cv.MORPH_RECT, new cv.Size(kSize, kSize))
    cv.morphologyEx(mat, mat, cv.MORPH_OPEN, kernel)
    kernel.delete()
  }
  // Morphological closing to bridge gaps in contours
  const closeGaps = (mat: cvReady.Mat, kSize = 7) => {
    const kernel = cv.getStructuringElement(cv.MORPH_RECT, new cv.Size(kSize, kSize))
    cv.morphologyEx(mat, mat, cv.MORPH_CLOSE, kernel)
    kernel.delete()
  }
  removeNoise(thresh, 3)
  closeGaps(thresh, 7)

  // 4. Find contours
  const contours = new cv.MatVector()
  const hierarchy = new cv.Mat()
  cv.findContours(thresh, contours, hierarchy, cv.RETR_EXTERNAL, cv.CHAIN_APPROX_SIMPLE)

  const norm = (v: number, max: number) => Math.round((v / max) * 1000)
  const totalImageArea = width * height
  const elements: Element[] = []

  for (let i = 0; i < contours.size(); i++) {
    const cnt = contours.get(i)
    const rect = cv.boundingRect(cnt)

    // Skip shapes whose bounding box touches the image border
    if (rect.x === 0 || rect.y === 0 || rect.x + rect.width === width || rect.y + rect.height === height) {
      // console.log(
      //   `-- skip: boundingRect touches border rect=(${rect.x},${rect.y},${rect.width},${rect.height})`
      // )
      cnt.delete()
      continue
    }

    // Calculate area based on bounding box
    const rectArea = rect.width * rect.height
    const areaPercent = (rectArea / totalImageArea) * 100

    // Basic filtering (lower bound only; upper bound filter disabled)
    if (areaPercent < minAreaPercent) {
      // cnt.delete()
      continue
    } else if (areaPercent > maxAreaPercent) {
      // cnt.delete()
      continue
    }
    // console.log(`-- upper bound filter disabled (areaPercent=${areaPercent.toFixed(2)} > maxAreaPercent=${maxAreaPercent})`)

    /*
    const margin = Math.min(width, height) * 0.05
    if (
      rect.x < margin ||
      rect.y < margin ||
      rect.x + rect.width > width - margin ||
      rect.y + rect.height > height - margin
    ) {
      // cnt.delete()
      continue
    }
    */

    // Simple shape classification
    const peri = cv.arcLength(cnt, true)
    const approx = new cv.Mat()
    cv.approxPolyDP(cnt, approx, 0.02 * peri, true)

    let label = "polygon"
    if (approx.rows === 3) label = "triangle"
    else if (approx.rows === 4) {
      const aspectRatio = rect.width / rect.height
      label = Math.abs(aspectRatio - 1) < 0.2 ? "square" : "rectangle"
    } else if (approx.rows > 6) label = "circle"

    elements.push({
      ymin: norm(rect.y, gray.rows),
      xmin: norm(rect.x, gray.cols),
      ymax: norm(rect.y + rect.height, gray.rows),
      xmax: norm(rect.x + rect.width, gray.cols),
      label,
    })

    console.log(
      `-- accepted shape #${i}: ${label} (${rect.x},${rect.y},${rect.width},${
        rect.height
      }) area=${areaPercent.toFixed(2)}%`
    )

    cnt.delete()
    approx.delete()
  }

  // 5. Cleanup
  ;[srcRGBA, gray, thresh, contours, hierarchy].forEach((m: any) => m.delete())

  return { elements }
}
@@ -1,39 +0,0 @@
{
  "elements": [
    {
      "ymin": 583,
      "xmin": 97,
      "ymax": 744,
      "xmax": 392,
      "label": "rectangle"
    },
    {
      "ymin": 471,
      "xmin": 455,
      "ymax": 680,
      "xmax": 664,
      "label": "circle"
    },
    {
      "ymin": 349,
      "xmin": 173,
      "ymax": 442,
      "xmax": 296,
      "label": "circle"
    },
    {
      "ymin": 303,
      "xmin": 432,
      "ymax": 466,
      "xmax": 589,
      "label": "circle"
    },
    {
      "ymin": 49,
      "xmin": 87,
      "ymax": 255,
      "xmax": 368,
      "label": "circle"
    }
  ]
}
@@ -16,7 +16,7 @@ const categories = [
const prompts = {
  default: `Detect all of the of the following objects: ${categories}. The box_2d should be an object with ymin, xmin, ymax, xmax properties normalized to 0-1000.`,
  simple: `Detect the 2d bounding boxes of the following objects: ${categories}.`,
  specific: `Detect 2d inscribed box for the green circle?`,
  specific: `Detect 2d bounding box for the tea kettle in the image. The box_2d should be an object with ymin, xmin, ymax, xmax properties normalized to 0-1000.`,
}

export const action = async (req: Request, params: {}) => {
@@ -1,64 +0,0 @@
import { Form, useAction } from "@workshop/nano-remix"
import { useEffect, useRef } from "hono/jsx"
import { RealtimeAgent, RealtimeSession } from "@openai/agents/realtime"
import { ensure } from "@workshop/shared/utils"

export const action = async (request: Request) => {
  const response = await fetch("https://api.openai.com/v1/realtime/sessions", {
    method: "POST",
    headers: {
      Authorization: `Bearer ${process.env.OPENAI_API_KEY}`,
      "Content-Type": "application/json",
    },
    body: JSON.stringify({
      model: "gpt-4o-realtime-preview-2025-06-03",
    }),
  })
  const { client_secret } = await response.json()

  return { secret: client_secret?.value }
}

export default function Voice() {
  const { data, loading, error } = useAction<typeof action>()
  const session = useRef<RealtimeSession | undefined>(undefined)

  useEffect(() => {
    if (!data?.secret) return
    if (session.current) return

    session.current = createSession()
    session.current.connect({ apiKey: data.secret })
  }, [data?.secret])

  return (
    <div>
      {error && <p>Error: {error}</p>}
      <p>Ephemeral Key: {loading ? "Loading..." : data?.secret}</p>
      <Form name="voiceForm">
        <button type="submit">Start Voice Session</button>
      </Form>
    </div>
  )
}

const createSession = () => {
  const agent = new RealtimeAgent({
    name: "Assistant",
    voice: "echo",
    instructions: `
      You are Spike, you are helping Corey at the whiteboard. Every question he asks will include a screenshot of the whiteboard. Sometimes his questions will be about the whiteboard, sometimes they will be about other things.

      # Voice Tone

      You have a very quiet and have a slight accent that is hard to place.

    `,
  })
  const session = new RealtimeSession(agent)
  session.on("error", (error) => {
    console.error("Session error:", error)
  })

  return session
}
@@ -1,16 +1,21 @@
import { useRef, useState, useEffect } from "hono/jsx"
import { useStreamingAI } from "../useStreamingAI"
import { useVideo } from "../useVideo"
import { VideoOverlay, type OverlayItem } from "../videoOverlay"
import { VideoOverlay } from "../videoOverlay"
import "../index.css"
import type { OverlayItem } from "../types"

export default function Voice() {
  const { audioError, transcript, isRecording: audioRecording, waitingForResponse } = useStreamingAI()
  const {
    audioError,
    transcript,
    isRecording: audioRecording,
    waitingForResponse,
    overlays,
  } = useStreamingAI()
  const videoRef = useRef<HTMLVideoElement>(null)
  const video = useVideo(videoRef)

  const [overlays, setOverlays] = useState<OverlayItem[]>([])

  let recordingStateClass = ""
  if (audioRecording) recordingStateClass = "border-red-500 border-4"
  else if (waitingForResponse) recordingStateClass = "border-yellow-500 border-4"

@@ -20,14 +25,12 @@ export default function Voice() {
      {audioError && <p class="text-red-500">Audio Error: {audioError}</p>}
      {video.error && <p class="text-red-500">Video Error: {video.error}</p>}

      {transcript && <div class="absolute top-5 left-5 right-5 bg-white/90 p-4 rounded-lg">{transcript}</div>}

      {!video.isRecording && (
        <button
          onClick={video.toggleRecording}
          class="px-4 py-2 text-8xl rounded-2xl text-white bg-green-500 hover:bg-green-600"
          class="px-4 uppercase py-2 text-8xl rounded-2xl text-white bg-green-500 hover:bg-green-600"
        >
          Start Camera
          Start THE Camera
        </button>
      )}

@@ -41,6 +44,8 @@ export default function Voice() {
        />
      </VideoOverlay>
      {video.isRecording && <div class="text-sm italic text-center">Hold Space to ask a question</div>}

      {transcript && <div class="absolute top-5 left-5 right-5 bg-white/90 p-4 rounded-lg">{transcript}</div>}
    </div>
  )
}
@@ -1,9 +1,11 @@
import { nanoRemix } from "@workshop/nano-remix"
import { OpenAI } from "openai"
import { Agent, run, type AgentInputItem } from "@openai/agents"
import { Agent, run, webSearchTool, type AgentInputItem } from "@openai/agents"
import fs from "node:fs"
import { getErrorMessage } from "@workshop/shared/errors"
import { tools } from "./tools"
import { OverlayItemSchema } from "./types"
import z from "zod"

Bun.serve({
  port: 3000,

@@ -37,8 +39,12 @@ const streamResponse = async (req: Request) => {
  const agent = new Agent({
    name: "Whiteboard Assistant",
    model: "gpt-4o",
    instructions: "You are a helpful assistant that talks about a whiteboard.",
    tools,
    instructions: `You are a helpful assistant that talks about an image.
    You will receive a transcript of a conversation and an image. Your task is to analyze the transcript and the image, then generate a response that includes text and optional overlays on the image.

    The overlays are a string description of what you would overlay on the image.
    `,
    tools: [...tools, webSearchTool()],
  })

  const imagePath = "public/whiteboard.png"

@@ -55,14 +61,41 @@ const streamResponse = async (req: Request) => {
  ]

  const result = await run(agent, input, { stream: true })
  const readableStream = result.toTextStream() as any // This DOES work, but typescript is a little confused so I cast it to any

  console.log(`🌭`, readableStream)
  return new Response(readableStream, {
  const customStream = new ReadableStream({
    async start(controller) {
      try {
        for await (const chunk of result) {
          if (chunk.type === "raw_model_stream_event" && chunk.data?.type === "output_text_delta") {
            const event = {
              type: "text_delta",
              data: chunk.data.delta,
            }
            controller.enqueue(`data: ${JSON.stringify(event)}\n\n`)
          }

          if (chunk.type === "run_item_stream_event" && chunk.item?.type === "tool_call_output_item") {
            const event = {
              type: "tool_output",
              data: chunk.item?.output,
            }
            controller.enqueue(`data: ${JSON.stringify(event)}\n\n`)
          }
        }

        controller.close()
      } catch (error) {
        controller.error(error)
      }
    },
  })

  return new Response(customStream, {
    headers: {
      "Content-Type": "text/plain",
      "Content-Type": "text/event-stream",
      "Cache-Control": "no-cache",
      Connection: "keep-alive",
      "Access-Control-Allow-Origin": "*",
    },
  })
}
@@ -1,14 +1,44 @@
import { tool } from "@openai/agents"
import z from "zod"
import type { ImageOverlay } from "./types"
import { searchImages } from "pixabay-api"
import { getGeminiResponse } from "./ai"

const pixabayApiKey = "51428355-fea6dad6a1cb56273345b23b1"
export const tools = [
  tool({
    name: "embed video",
    description: "Embed a video into the whiteboard",
    parameters: z.object({ video: z.string() }),
    execute(input, context) {
      const { video } = input
      return `Video embedded: ${video}`
    name: "create an image overlay",
    description: "Find an image to overlay on a video",
    parameters: z.object({
      whereToOverlay: z
        .string()
        .describe(
          "Where to overlay the image (e.g., 'in the red box', 'covering the hand', 'on the left side')"
        ),
      imageQuery: z.string().describe("Search term for image"),
    }),
    async execute(input, context) {
      const response = await searchImages(pixabayApiKey, input.imageQuery, { per_page: 10 })
      const hit = response.hits[0]!

      console.log(`🌭`, `Find the 2d bounding box for this "${input.whereToOverlay}"`)
      const image = await Bun.file("public/whiteboard.png").arrayBuffer()
      const boundingBox = await getGeminiResponse(
        image,
        `Find the 2d bounding box for this question "${input.whereToOverlay}"`
      )

      const element = boundingBox?.elements[0]!
      const overlay: ImageOverlay = {
        type: "image",
        src: hit.webformatURL,
        xmin: element.xmin,
        ymin: element.ymin,
        xmax: element.xmax,
        ymax: element.ymax,
      }

      return overlay
    },
  }),
]
packages/whiteboard/src/types.ts | 30 (new file)
@@ -0,0 +1,30 @@
import { z } from "zod"

export const TextOverlaySchema = z.object({
  type: z.literal("text"),
  xmin: z.number(),
  ymin: z.number(),
  xmax: z.number(),
  ymax: z.number(),
  text: z.string(),
  fontSize: z.number().optional().nullable(),
  fontFamily: z.string().optional().nullable(),
  color: z.string().optional().nullable(),
  strokeColor: z.string().optional().nullable(),
  strokeWidth: z.number().optional().nullable(),
})

export const ImageOverlaySchema = z.object({
  type: z.literal("image"),
  xmin: z.number(),
  ymin: z.number(),
  xmax: z.number(),
  ymax: z.number(),
  src: z.string(),
})

export const OverlayItemSchema = z.union([TextOverlaySchema, ImageOverlaySchema])

export type TextOverlay = z.infer<typeof TextOverlaySchema>
export type ImageOverlay = z.infer<typeof ImageOverlaySchema>
export type OverlayItem = z.infer<typeof OverlayItemSchema>
@@ -1,11 +1,13 @@
import { useEffect, useRef, useState } from "hono/jsx"
import { StreamingResponse } from "./streamingAI"
import type { OverlayItem } from "./types"

export function useStreamingAI() {
  const [audioError, setAudioError] = useState<string>("")
  const [transcript, setTranscript] = useState<string>("")
  const [isRecording, setIsRecording] = useState(false)
  const [waitingForResponse, setWaitingForResponse] = useState(false)
  const [overlays, setOverlays] = useState<OverlayItem[]>([])
  const streamingResponseRef = useRef<StreamingResponse>(null)

  const startRecording = async () => {

@@ -23,14 +25,44 @@ export function useStreamingAI() {
      const reader = await streamingResponseRef.current!.stop()
      setWaitingForResponse(false)
      const decoder = new TextDecoder()
      let buffer = ""

      const overlayItems: OverlayItem[] = []
      while (true) {
        const { done, value } = await reader.read()
        if (done) break

        const chunk = decoder.decode(value, { stream: true })
        setTranscript((prev) => prev + chunk)
        buffer += chunk

        // Parse SSE messages
        const lines = buffer.split("\n")
        buffer = lines.pop() || ""

        for (const line of lines) {
          if (line.startsWith("data: ")) {
            try {
              const eventData = JSON.parse(line.slice(6))

              if (eventData.type === "text_delta") {
                setTranscript((prev) => prev + eventData.data)
              } else if (eventData.type === "tool_output") {
                if (eventData.data.type === "image") {
                  overlayItems.push(eventData.data)
                }
              } else if (eventData.type === "done") {
                console.log("Done")
              }
            } catch (e) {
              console.error("💥 Failed to parse SSE event:", line, e)
            }
          } else if (line.trim()) {
            console.error("💥 Non-data line:", line)
          }
        }
      }

      setOverlays(overlayItems)
    } catch (error) {
      console.error("Error during streaming:", error)
      setAudioError(`Streaming failed: ${error}`)

@@ -70,5 +102,6 @@ export function useStreamingAI() {
    waitingForResponse,
    startRecording,
    endRecording,
    overlays,
  }
}
@@ -1,27 +1,5 @@
import { useRef, useEffect } from "hono/jsx"

export interface TextOverlay {
  type: "text"
  x: number
  y: number
  text: string
  fontSize?: number
  fontFamily?: string
  color?: string
  strokeColor?: string
  strokeWidth?: number
}

export interface ImageOverlay {
  type: "image"
  x: number
  y: number
  src: string
  width?: number
  height?: number
}

export type OverlayItem = TextOverlay | ImageOverlay
import type { ImageOverlay, OverlayItem, TextOverlay } from "./types"

interface VideoOverlayProps {
  overlays: OverlayItem[]

@@ -50,7 +28,17 @@ export function VideoOverlay({ overlays, children, isRecording }: VideoOverlayPr
    canvas.height = rect.height

    // Clear canvas
    ctx.clearRect(0, 0, canvas.width, canvas.height)
    // ctx.clearRect(0, 0, canvas.width, canvas.height)
    // ctx.fillStyle = "green"
    // const xmin = 250
    // const ymin = 250
    // const xmax = 750
    // const ymax = 750
    // const x = (xmin / 1000) * ctx.canvas.width
    // const y = (ymin / 1000) * ctx.canvas.height
    // const width = ((xmax - xmin) / 1000) * ctx.canvas.width
    // const height = ((ymax - ymin) / 1000) * ctx.canvas.height
    // ctx.fillRect(x, y, width, height)

    // Draw overlays
    for (const overlay of overlays) {

@@ -65,7 +53,6 @@ export function VideoOverlay({ overlays, children, isRecording }: VideoOverlayPr
  // Redraw when overlay data changes or recording state changes
  useEffect(() => {
    setTimeout(() => {
      console.log(`🌭 `, canvasRef.current?.width, canvasRef.current?.height)
      drawOverlays()
    }, 1000)
  }, [overlays, isRecording])

@@ -78,8 +65,10 @@ export function VideoOverlay({ overlays, children, isRecording }: VideoOverlayPr

const drawText = (ctx: CanvasRenderingContext2D, overlay: TextOverlay) => {
  const {
    x,
    y,
    xmin,
    ymin,
    xmax,
    ymax,
    text,
    fontSize = 20,
    fontFamily = "Arial",

@@ -92,21 +81,26 @@ export function VideoOverlay({ overlays, children, isRecording }: VideoOverlayPr
  ctx.fillStyle = color
  ctx.strokeStyle = strokeColor
  ctx.lineWidth = strokeWidth

  const x = (xmin / 1000) * ctx.canvas.width
  const y = (ymin / 1000) * ctx.canvas.height
  const width = ((xmax - xmin) / 1000) * ctx.canvas.width
  const height = ((ymax - ymin) / 1000) * ctx.canvas.height
  ctx.strokeText(text, x, y)
  ctx.fillText(text, x, y)
}

const drawImage = (ctx: CanvasRenderingContext2D, overlay: ImageOverlay) => {
  const { x, y, src, width, height } = overlay
  const { xmin, ymin, xmax, ymax, src } = overlay

  const img = new Image()
  img.crossOrigin = "anonymous"

  img.onload = () => {
    const drawWidth = width || img.width
    const drawHeight = height || img.height
    ctx.drawImage(img, x, y, drawWidth, drawHeight)
    const x = (xmin / 1000) * ctx.canvas.width
    const y = (ymin / 1000) * ctx.canvas.height
    const width = ((xmax - xmin) / 1000) * ctx.canvas.width
    const height = ((ymax - ymin) / 1000) * ctx.canvas.height
    ctx.drawImage(img, x, y, width, height)
  }

  img.src = src