diff --git a/bun.lock b/bun.lock
index 57f24b2..b822b6d 100644
--- a/bun.lock
+++ b/bun.lock
@@ -146,6 +146,7 @@
       "@workshop/shared": "workspace:*",
       "hono": "catalog:",
       "luxon": "^3.7.1",
+      "pixabay-api": "^1.0.4",
       "pngjs": "^7.0.0",
       "tailwind": "^4.0.0",
       "zod": "catalog:",
@@ -284,6 +285,8 @@
     "available-typed-arrays": ["available-typed-arrays@1.0.7", "", { "dependencies": { "possible-typed-array-names": "^1.0.0" } }, "sha512-wvUjBtSGN7+7SjNpq/9M2Tg350UZD3q62IFZLbRAR1bSMlCo1ZaeW+BJ+D090e4hIIZLBcTDWe4Mh4jvUDajzQ=="],
 
+    "axios": ["axios@0.16.2", "", { "dependencies": { "follow-redirects": "^1.2.3", "is-buffer": "^1.1.5" } }, "sha512-IMYFDrcVbUksQhsMYtWCM6KdNaDpr1NY56dpzaIgj92ecPVI29bf2sOgAf8aGTiq8UoixJD61Pj0Ahej5DPv7w=="],
+
     "babel-runtime": ["babel-runtime@6.26.0", "", { "dependencies": { "core-js": "^2.4.0", "regenerator-runtime": "^0.11.0" } }, "sha512-ITKNuq2wKlW1fJg9sSW52eepoYgZBggvOAHC0u/CYu/qxQ9EVzThCgR69BnSXLHjy2f7SY5zaQ4yt7H9ZVxY2g=="],
 
     "base64-js": ["base64-js@1.5.1", "", {}, "sha512-AKpaYlHn8t4SVbOHCy+b5+KKgvR4vrsD8vbvrbiQJps7fKDTkjkDry6ji0rUJjC0kzbNePLwzxq8iypo41qeWA=="],
@@ -424,6 +427,8 @@
     "flaschenpost": ["flaschenpost@1.1.3", "", { "dependencies": { "@babel/runtime": "7.2.0", "app-root-path": "2.1.0", "babel-runtime": "6.26.0", "chalk": "2.4.1", "find-root": "1.1.0", "lodash": "4.17.11", "moment": "2.22.2", "processenv": "1.1.0", "split2": "3.0.0", "stack-trace": "0.0.10", "stringify-object": "3.3.0", "untildify": "3.0.3", "util.promisify": "1.0.0", "varname": "2.0.3" }, "bin": { "flaschenpost-uncork": "dist/bin/flaschenpost-uncork.js", "flaschenpost-normalize": "dist/bin/flaschenpost-normalize.js" } }, "sha512-1VAYPvDsVBGFJyUrOa/6clnJwZYC3qVq9nJLcypy6lvaaNbo1wOQiH8HQ+4Fw/k51pVG7JHzSf5epb8lmIW86g=="],
 
+    "follow-redirects": ["follow-redirects@1.15.9", "", {}, "sha512-gew4GsXizNgdoRyqmyfMHyAmXsZDk6mHkSxZFCzW9gwlbtOW44CDtYavM+y+72qD/Vq2l550kMF52DT8fOLJqQ=="],
+
     "for-each": ["for-each@0.3.5", "", { "dependencies": { "is-callable": "^1.2.7" } }, "sha512-dKx12eRCVIzqCxFGplyFKJMPvLEWgmNtUrpTiJIR5u97zEhRG8ySrtboPHZXx7daLxQVrl643cTzbab2tkQjxg=="],
 
     "formats": ["formats@1.0.0", "", {}, "sha512-For0Y8egwEK96JgJo4NONErPhtl7H2QzeB2NYGmzeGeJ8a1JZqPgLYOtM3oJRCYhmgsdDFd6KGRYyfe37XY4Yg=="],
@@ -498,6 +503,8 @@
     "is-boolean-object": ["is-boolean-object@1.2.2", "", { "dependencies": { "call-bound": "^1.0.3", "has-tostringtag": "^1.0.2" } }, "sha512-wa56o2/ElJMYqjCjGkXri7it5FbebW5usLw/nPmCMs5DeZ7eziSYZhSmPRn0txqeW4LnAmQQU7FgqLpsEFKM4A=="],
 
+    "is-buffer": ["is-buffer@1.1.6", "", {}, "sha512-NcdALwpXkTm5Zvvbk7owOUSvVvBKDgKP5/ewfXEznmQFfs4ZRmanOeKBTjRVjka3QFoN6XJ+9F3USqfHqTaU5w=="],
+
     "is-callable": ["is-callable@1.2.7", "", {}, "sha512-1BC0BVFhS/p0qtw6enp8e+8OD0UrK0oFLztSjNzhcKA3WDuJxxAPXzPuPtKkjEY9UUoEWlX/8fgKeu2S8i9JTA=="],
 
     "is-data-view": ["is-data-view@1.0.2", "", { "dependencies": { "call-bound": "^1.0.2", "get-intrinsic": "^1.2.6", "is-typed-array": "^1.1.13" } }, "sha512-RKtWF8pGmS87i2D6gqQu/l7EYRlVdfzemCJN/P3UOs//x1QE7mfhvzHIApBTRf7axvT6DMGwSwBXYCT0nfB9xw=="],
@@ -644,6 +651,8 @@
     "path-to-regexp": ["path-to-regexp@0.1.7", "", {}, "sha512-5DFkuoqlv1uYQKxy8omFBeJPQcdoE07Kv2sferDCrAq1ohOU+MSDswDIbnx3YAM60qIOnYa53wBhXW0EbMonrQ=="],
 
+    "pixabay-api": ["pixabay-api@1.0.4", "", { "dependencies": { "@types/node": "^8.0.4", "axios": "^0.16.2" } }, "sha512-OmV0ciG+Ouosn8csp8fBta32HFAfYurKUYb4vgZphIiPneXHS4x3bNilSWiWpU7SdWAGBXnAKQWsl1s2g7E8eQ=="],
+
     "pkce-challenge": ["pkce-challenge@5.0.0", "", {}, "sha512-ueGLflrrnvwB3xuo/uGob5pd5FN7l0MsLf0Z87o/UQmRtwjvfylfc9MurIxRAWywCYTgrvpXBcqjV4OfCYGCIQ=="],
 
"pngjs": ["pngjs@7.0.0", "", {}, "sha512-LKWqWJRhstyYo9pGvgor/ivk2w94eSjE3RGVuzLGlr3NmD8bf7RcYGze1mNdEHRP6TRP6rMuDHk5t44hnTRyow=="], @@ -880,6 +889,8 @@ "morgan/debug": ["debug@2.6.9", "", { "dependencies": { "ms": "2.0.0" } }, "sha512-bC7ElrdJaJnPbAP+1EotYvqZsb3ecl5wi6Bfi6BJTUcNowp6cvspg0jXznRTKDjm/E7AdgFBVeAPVMNcKGsHMA=="], + "pixabay-api/@types/node": ["@types/node@8.10.66", "", {}, "sha512-tktOkFUA4kXx2hhhrB8bIFb5TbwzS4uOhKEmwiD+NoiL0qtP2OQ9mFldbgD4dV1djrlBYP6eBuQZiWjuHUpqFw=="], + "router/depd": ["depd@2.0.0", "", {}, "sha512-g7nH6P6dyDioJogAAGprGpCtVImJhpPk/roCzdb3fIh61/s/nPsfR6onyMwkCAR/OlC3yBC0lESvUoQEAssIrw=="], "router/path-to-regexp": ["path-to-regexp@8.2.0", "", {}, "sha512-TdrF7fW9Rphjq4RjrW0Kp2AW0Ahwu9sRGTkS6bvDi0SCwZlEZYmcfDbEsTz8RVk0EHIS/Vd1bv3JhG+1xZuAyQ=="], diff --git a/packages/whiteboard/README.md b/packages/whiteboard/README.md index 28abc6c..fea3df7 100644 --- a/packages/whiteboard/README.md +++ b/packages/whiteboard/README.md @@ -1,15 +1,5 @@ -# project-whitespace +# whitespace -To install dependencies: +# Demo -```bash -bun install -``` - -To run: - -```bash -bun run index.ts -``` - -This project was created using `bun init` in bun v1.2.18. [Bun](https://bun.sh) is a fast all-in-one JavaScript runtime. +https://share.cleanshot.com/94tmJzCw diff --git a/packages/whiteboard/package.json b/packages/whiteboard/package.json index 7924266..71a2dc0 100644 --- a/packages/whiteboard/package.json +++ b/packages/whiteboard/package.json @@ -17,6 +17,7 @@ "@workshop/shared": "workspace:*", "hono": "catalog:", "luxon": "^3.7.1", + "pixabay-api": "^1.0.4", "pngjs": "^7.0.0", "tailwind": "^4.0.0", "zod": "catalog:" diff --git a/packages/whiteboard/public/whiteboard.png b/packages/whiteboard/public/whiteboard.png index 21b112d..2f0299a 100644 Binary files a/packages/whiteboard/public/whiteboard.png and b/packages/whiteboard/public/whiteboard.png differ diff --git a/packages/whiteboard/src/agent.ts b/packages/whiteboard/src/agent.ts deleted file mode 100644 index 217ce14..0000000 --- a/packages/whiteboard/src/agent.ts +++ /dev/null @@ -1,23 +0,0 @@ -import { tool, RealtimeAgent } from "@openai/agents/realtime" -import { run } from "@openai/agents" - -// 1. Define a tool to fetch the latest whiteboard image -const fetchWhiteboard = tool({ - name: "fetchWhiteboard", - description: "Fetch the latest whiteboard image and return its bytes", - parameters: undefined, - execute: async () => { - return await Bun.file("public/whiteboard.png").arrayBuffer() - }, -}) - -async function main() { - const agent = new RealtimeAgent({ - name: "Spike", - instructions: "When asked to analyze the whiteboard, call fetchWhiteboard", - tools: [fetchWhiteboard], - }) - - const result = await run(agent, "Hey Spike, analyze the whiteboard.") - console.log("Agent response:", result.finalOutput) -} diff --git a/packages/whiteboard/src/opencv.ts b/packages/whiteboard/src/opencv.ts deleted file mode 100644 index 4e891e1..0000000 --- a/packages/whiteboard/src/opencv.ts +++ /dev/null @@ -1,132 +0,0 @@ -import cvReady from "@techstark/opencv-js" -import { PNG } from "pngjs" - -type Element = { - ymin: number - xmin: number - ymax: number - xmax: number - label: string -} -type StructuredResponse = { elements: Element[] } - -export const detectShapes = async ( - imageBuffer: ArrayBuffer, - minAreaPercent = 5, - maxAreaPercent = 33 -): Promise => { - const cv = await cvReady - - // 1. 
Load & decode PNG → raw RGBA buffer - const buf = Buffer.from(imageBuffer) - const { width, height, data } = PNG.sync.read(buf) - - // 2. Create a 4-ch Mat from RGBA pixels - const srcRGBA = cv.matFromArray(height, width, cv.CV_8UC4, new Uint8Array(data)) - - // 3. Convert → gray → blur → threshold - const gray = new cv.Mat() - cv.cvtColor(srcRGBA, gray, cv.COLOR_RGBA2GRAY) - cv.GaussianBlur(gray, gray, new cv.Size(5, 5), 0) - - const thresh = new cv.Mat() - cv.adaptiveThreshold(gray, thresh, 255, cv.ADAPTIVE_THRESH_GAUSSIAN_C, cv.THRESH_BINARY_INV, 11, 2) - // Morphological opening to remove small noise - const removeNoise = (mat: cvReady.Mat, kSize = 3) => { - const kernel = cv.getStructuringElement(cv.MORPH_RECT, new cv.Size(kSize, kSize)) - cv.morphologyEx(mat, mat, cv.MORPH_OPEN, kernel) - kernel.delete() - } - // Morphological closing to bridge gaps in contours - const closeGaps = (mat: cvReady.Mat, kSize = 7) => { - const kernel = cv.getStructuringElement(cv.MORPH_RECT, new cv.Size(kSize, kSize)) - cv.morphologyEx(mat, mat, cv.MORPH_CLOSE, kernel) - kernel.delete() - } - removeNoise(thresh, 3) - closeGaps(thresh, 7) - - // 4. Find contours - const contours = new cv.MatVector() - const hierarchy = new cv.Mat() - cv.findContours(thresh, contours, hierarchy, cv.RETR_EXTERNAL, cv.CHAIN_APPROX_SIMPLE) - - const norm = (v: number, max: number) => Math.round((v / max) * 1000) - const totalImageArea = width * height - const elements: Element[] = [] - - for (let i = 0; i < contours.size(); i++) { - const cnt = contours.get(i) - const rect = cv.boundingRect(cnt) - - // Skip shapes whose bounding box touches the image border - if (rect.x === 0 || rect.y === 0 || rect.x + rect.width === width || rect.y + rect.height === height) { - // console.log( - // `-- skip: boundingRect touches border rect=(${rect.x},${rect.y},${rect.width},${rect.height})` - // ) - cnt.delete() - continue - } - - // Calculate area based on bounding box - const rectArea = rect.width * rect.height - const areaPercent = (rectArea / totalImageArea) * 100 - - // Basic filtering (lower bound only; upper bound filter disabled) - if (areaPercent < minAreaPercent) { - // cnt.delete() - continue - } else if (areaPercent > maxAreaPercent) { - // cnt.delete() - continue - } - // console.log(`-- upper bound filter disabled (areaPercent=${areaPercent.toFixed(2)} > maxAreaPercent=${maxAreaPercent})`) - - /* - const margin = Math.min(width, height) * 0.05 - if ( - rect.x < margin || - rect.y < margin || - rect.x + rect.width > width - margin || - rect.y + rect.height > height - margin - ) { - // cnt.delete() - continue - } - */ - - // Simple shape classification - const peri = cv.arcLength(cnt, true) - const approx = new cv.Mat() - cv.approxPolyDP(cnt, approx, 0.02 * peri, true) - - let label = "polygon" - if (approx.rows === 3) label = "triangle" - else if (approx.rows === 4) { - const aspectRatio = rect.width / rect.height - label = Math.abs(aspectRatio - 1) < 0.2 ? "square" : "rectangle" - } else if (approx.rows > 6) label = "circle" - - elements.push({ - ymin: norm(rect.y, gray.rows), - xmin: norm(rect.x, gray.cols), - ymax: norm(rect.y + rect.height, gray.rows), - xmax: norm(rect.x + rect.width, gray.cols), - label, - }) - - console.log( - `-- accepted shape #${i}: ${label} (${rect.x},${rect.y},${rect.width},${ - rect.height - }) area=${areaPercent.toFixed(2)}%` - ) - - cnt.delete() - approx.delete() - } - - // 5. 
-  ;[srcRGBA, gray, thresh, contours, hierarchy].forEach((m: any) => m.delete())
-
-  return { elements }
-}
diff --git a/packages/whiteboard/src/result.json b/packages/whiteboard/src/result.json
deleted file mode 100644
index 9afa211..0000000
--- a/packages/whiteboard/src/result.json
+++ /dev/null
@@ -1,39 +0,0 @@
-{
-  "elements": [
-    {
-      "ymin": 583,
-      "xmin": 97,
-      "ymax": 744,
-      "xmax": 392,
-      "label": "rectangle"
-    },
-    {
-      "ymin": 471,
-      "xmin": 455,
-      "ymax": 680,
-      "xmax": 664,
-      "label": "circle"
-    },
-    {
-      "ymin": 349,
-      "xmin": 173,
-      "ymax": 442,
-      "xmax": 296,
-      "label": "circle"
-    },
-    {
-      "ymin": 303,
-      "xmin": 432,
-      "ymax": 466,
-      "xmax": 589,
-      "label": "circle"
-    },
-    {
-      "ymin": 49,
-      "xmin": 87,
-      "ymax": 255,
-      "xmax": 368,
-      "label": "circle"
-    }
-  ]
-}
diff --git a/packages/whiteboard/src/routes/index.tsx b/packages/whiteboard/src/routes/index.tsx
index 4b3a5a2..0cd6008 100644
--- a/packages/whiteboard/src/routes/index.tsx
+++ b/packages/whiteboard/src/routes/index.tsx
@@ -16,7 +16,7 @@ const categories = [
 const prompts = {
   default: `Detect all of the of the following objects: ${categories}. The box_2d should be an object with ymin, xmin, ymax, xmax properties normalized to 0-1000.`,
   simple: `Detect the 2d bounding boxes of the following objects: ${categories}.`,
-  specific: `Detect 2d inscribed box for the green circle?`,
+  specific: `Detect 2d bounding box for the tea kettle in the image. The box_2d should be an object with ymin, xmin, ymax, xmax properties normalized to 0-1000.`,
 }
 
 export const action = async (req: Request, params: {}) => {
diff --git a/packages/whiteboard/src/routes/realtime.tsx b/packages/whiteboard/src/routes/realtime.tsx
deleted file mode 100644
index 932471a..0000000
--- a/packages/whiteboard/src/routes/realtime.tsx
+++ /dev/null
@@ -1,64 +0,0 @@
-import { Form, useAction } from "@workshop/nano-remix"
-import { useEffect, useRef } from "hono/jsx"
-import { RealtimeAgent, RealtimeSession } from "@openai/agents/realtime"
-import { ensure } from "@workshop/shared/utils"
-
-export const action = async (request: Request) => {
-  const response = await fetch("https://api.openai.com/v1/realtime/sessions", {
-    method: "POST",
-    headers: {
-      Authorization: `Bearer ${process.env.OPENAI_API_KEY}`,
-      "Content-Type": "application/json",
-    },
-    body: JSON.stringify({
-      model: "gpt-4o-realtime-preview-2025-06-03",
-    }),
-  })
-  const { client_secret } = await response.json()
-
-  return { secret: client_secret?.value }
-}
-
-export default function Voice() {
-  const { data, loading, error } = useAction()
-  const session = useRef<RealtimeSession | undefined>(undefined)
-
-  useEffect(() => {
-    if (!data?.secret) return
-    if (session.current) return
-
-    session.current = createSession()
-    session.current.connect({ apiKey: data.secret })
-  }, [data?.secret])
-
-  return (
-    <div>
-      {error && <div>Error: {error}</div>}
-      <div>Ephemeral Key: {loading ? "Loading..." : data?.secret}</div>
-      <Form method="post">
-        <button type="submit" />
-      </Form>
-    </div>
-  )
-}
-
-const createSession = () => {
-  const agent = new RealtimeAgent({
-    name: "Assistant",
-    voice: "echo",
-    instructions: `
-You are Spike, you are helping Corey at the whiteboard. Every question he asks will include a screenshot of the whiteboard. Sometimes his questions will be about the whiteboard, sometimes they will be about other things.
-
-# Voice Tone
-
-You have a very quiet and have a slight accent that is hard to place.
-
-`,
-  })
-  const session = new RealtimeSession(agent)
-  session.on("error", (error) => {
-    console.error("Session error:", error)
-  })
-
-  return session
-}
diff --git a/packages/whiteboard/src/routes/voice.tsx b/packages/whiteboard/src/routes/voice.tsx
index 399c207..f997a3c 100644
--- a/packages/whiteboard/src/routes/voice.tsx
+++ b/packages/whiteboard/src/routes/voice.tsx
@@ -1,16 +1,21 @@
 import { useRef, useState, useEffect } from "hono/jsx"
 import { useStreamingAI } from "../useStreamingAI"
 import { useVideo } from "../useVideo"
-import { VideoOverlay, type OverlayItem } from "../videoOverlay"
+import { VideoOverlay } from "../videoOverlay"
 import "../index.css"
+import type { OverlayItem } from "../types"
 
 export default function Voice() {
-  const { audioError, transcript, isRecording: audioRecording, waitingForResponse } = useStreamingAI()
+  const {
+    audioError,
+    transcript,
+    isRecording: audioRecording,
+    waitingForResponse,
+    overlays,
+  } = useStreamingAI()
   const videoRef = useRef<HTMLVideoElement>(null)
   const video = useVideo(videoRef)
 
-  const [overlays, setOverlays] = useState<OverlayItem[]>([])
-
   let recordingStateClass = ""
   if (audioRecording) recordingStateClass = "border-red-500 border-4"
   else if (waitingForResponse) recordingStateClass = "border-yellow-500 border-4"
@@ -20,14 +25,12 @@ export default function Voice() {
       {audioError && <div>Audio Error: {audioError}</div>}
       {video.error && <div>Video Error: {video.error}</div>}
-      {transcript && <div>{transcript}</div>}
-
       {!video.isRecording && ( )}
@@ -41,6 +44,8 @@ export default function Voice() {
       />
       {video.isRecording && <div>Hold Space to ask a question</div>}
+
+      {transcript && <div>{transcript}</div>}
     </div>
   )
 }
diff --git a/packages/whiteboard/src/server.ts b/packages/whiteboard/src/server.ts
index 7444b90..682f472 100644
--- a/packages/whiteboard/src/server.ts
+++ b/packages/whiteboard/src/server.ts
@@ -1,9 +1,11 @@
 import { nanoRemix } from "@workshop/nano-remix"
 import { OpenAI } from "openai"
-import { Agent, run, type AgentInputItem } from "@openai/agents"
+import { Agent, run, webSearchTool, type AgentInputItem } from "@openai/agents"
 import fs from "node:fs"
 import { getErrorMessage } from "@workshop/shared/errors"
 import { tools } from "./tools"
+import { OverlayItemSchema } from "./types"
+import z from "zod"
 
 Bun.serve({
   port: 3000,
@@ -37,8 +39,12 @@ const streamResponse = async (req: Request) => {
   const agent = new Agent({
     name: "Whiteboard Assistant",
     model: "gpt-4o",
-    instructions: "You are a helpful assistant that talks about a whiteboard.",
-    tools,
+    instructions: `You are a helpful assistant that talks about an image.
+You will receive a transcript of a conversation and an image. Your task is to analyze the transcript and the image, then generate a response that includes text and optional overlays on the image.
+
+The overlays are a string description of what you would overlay on the image.
+`,
+    tools: [...tools, webSearchTool()],
   })
 
   const imagePath = "public/whiteboard.png"
@@ -55,14 +61,41 @@ const streamResponse = async (req: Request) => {
   ]
 
   const result = await run(agent, input, { stream: true })
-  const readableStream = result.toTextStream() as any // This DOES work, but typescript is a little confused so I cast it to any
-  console.log(`🌭`, readableStream)
-  return new Response(readableStream, {
+  const customStream = new ReadableStream({
+    async start(controller) {
+      try {
+        for await (const chunk of result) {
+          if (chunk.type === "raw_model_stream_event" && chunk.data?.type === "output_text_delta") {
+            const event = {
+              type: "text_delta",
+              data: chunk.data.delta,
+            }
+            controller.enqueue(`data: ${JSON.stringify(event)}\n\n`)
+          }
+
+          if (chunk.type === "run_item_stream_event" && chunk.item?.type === "tool_call_output_item") {
+            const event = {
+              type: "tool_output",
+              data: chunk.item?.output,
+            }
+            controller.enqueue(`data: ${JSON.stringify(event)}\n\n`)
+          }
+        }
+
+        controller.close()
+      } catch (error) {
+        controller.error(error)
+      }
+    },
+  })
+
+  return new Response(customStream, {
     headers: {
-      "Content-Type": "text/plain",
+      "Content-Type": "text/event-stream",
       "Cache-Control": "no-cache",
       Connection: "keep-alive",
+      "Access-Control-Allow-Origin": "*",
     },
   })
 }
diff --git a/packages/whiteboard/src/tools.ts b/packages/whiteboard/src/tools.ts
index d699c74..47e1ced 100644
--- a/packages/whiteboard/src/tools.ts
+++ b/packages/whiteboard/src/tools.ts
@@ -1,14 +1,44 @@
 import { tool } from "@openai/agents"
 import z from "zod"
+import type { ImageOverlay } from "./types"
+import { searchImages } from "pixabay-api"
+import { getGeminiResponse } from "./ai"
 
+const pixabayApiKey = "51428355-fea6dad6a1cb56273345b23b1"
+
 export const tools = [
   tool({
-    name: "embed video",
-    description: "Embed a video into the whiteboard",
-    parameters: z.object({ video: z.string() }),
-    execute(input, context) {
-      const { video } = input
-      return `Video embedded: ${video}`
+    name: "create an image overlay",
+    description: "Find an image to overlay on a video",
+    parameters: z.object({
+      whereToOverlay: z
+        .string()
+        .describe(
+          "Where to overlay the image (e.g., 'in the red box', 'covering the hand', 'on the left side')"
+        ),
+      imageQuery: z.string().describe("Search term for image"),
+    }),
+    async execute(input, context) {
+      const response = await searchImages(pixabayApiKey, input.imageQuery, { per_page: 10 })
+      const hit = response.hits[0]!
+
+      console.log(`🌭`, `Find the 2d bounding box for this "${input.whereToOverlay}"`)
+      const image = await Bun.file("public/whiteboard.png").arrayBuffer()
+      const boundingBox = await getGeminiResponse(
+        image,
+        `Find the 2d bounding box for this question "${input.whereToOverlay}"`
+      )
+
+      const element = boundingBox?.elements[0]!
+      const overlay: ImageOverlay = {
+        type: "image",
+        src: hit.webformatURL,
+        xmin: element.xmin,
+        ymin: element.ymin,
+        xmax: element.xmax,
+        ymax: element.ymax,
+      }
+
+      return overlay
     },
   }),
 ]
diff --git a/packages/whiteboard/src/types.ts b/packages/whiteboard/src/types.ts
new file mode 100644
index 0000000..86f8761
--- /dev/null
+++ b/packages/whiteboard/src/types.ts
@@ -0,0 +1,30 @@
+import { z } from "zod"
+
+export const TextOverlaySchema = z.object({
+  type: z.literal("text"),
+  xmin: z.number(),
+  ymin: z.number(),
+  xmax: z.number(),
+  ymax: z.number(),
+  text: z.string(),
+  fontSize: z.number().optional().nullable(),
+  fontFamily: z.string().optional().nullable(),
+  color: z.string().optional().nullable(),
+  strokeColor: z.string().optional().nullable(),
+  strokeWidth: z.number().optional().nullable(),
+})
+
+export const ImageOverlaySchema = z.object({
+  type: z.literal("image"),
+  xmin: z.number(),
+  ymin: z.number(),
+  xmax: z.number(),
+  ymax: z.number(),
+  src: z.string(),
+})
+
+export const OverlayItemSchema = z.union([TextOverlaySchema, ImageOverlaySchema])
+
+export type TextOverlay = z.infer<typeof TextOverlaySchema>
+export type ImageOverlay = z.infer<typeof ImageOverlaySchema>
+export type OverlayItem = z.infer<typeof OverlayItemSchema>
diff --git a/packages/whiteboard/src/useStreamingAI.ts b/packages/whiteboard/src/useStreamingAI.ts
index 4dd2930..083c0dc 100644
--- a/packages/whiteboard/src/useStreamingAI.ts
+++ b/packages/whiteboard/src/useStreamingAI.ts
@@ -1,11 +1,13 @@
 import { useEffect, useRef, useState } from "hono/jsx"
 import { StreamingResponse } from "./streamingAI"
+import type { OverlayItem } from "./types"
 
 export function useStreamingAI() {
   const [audioError, setAudioError] = useState("")
   const [transcript, setTranscript] = useState("")
   const [isRecording, setIsRecording] = useState(false)
   const [waitingForResponse, setWaitingForResponse] = useState(false)
+  const [overlays, setOverlays] = useState<OverlayItem[]>([])
   const streamingResponseRef = useRef<StreamingResponse | null>(null)
 
   const startRecording = async () => {
@@ -23,14 +25,44 @@ export function useStreamingAI() {
       const reader = await streamingResponseRef.current!.stop()
       setWaitingForResponse(false)
       const decoder = new TextDecoder()
+      let buffer = ""
+      const overlayItems: OverlayItem[] = []
 
       while (true) {
         const { done, value } = await reader.read()
         if (done) break
 
         const chunk = decoder.decode(value, { stream: true })
-        setTranscript((prev) => prev + chunk)
+        buffer += chunk
+
+        // Parse SSE messages
+        const lines = buffer.split("\n")
+        buffer = lines.pop() || ""
+
+        for (const line of lines) {
+          if (line.startsWith("data: ")) {
+            try {
+              const eventData = JSON.parse(line.slice(6))
+
+              if (eventData.type === "text_delta") {
+                setTranscript((prev) => prev + eventData.data)
+              } else if (eventData.type === "tool_output") {
+                if (eventData.data.type === "image") {
+                  overlayItems.push(eventData.data)
+                }
+              } else if (eventData.type === "done") {
+                console.log("Done")
+              }
+            } catch (e) {
+              console.error("💥 Failed to parse SSE event:", line, e)
+            }
+          } else if (line.trim()) {
+            console.error("💥 Non-data line:", line)
+          }
+        }
       }
+
+      setOverlays(overlayItems)
     } catch (error) {
       console.error("Error during streaming:", error)
       setAudioError(`Streaming failed: ${error}`)
@@ -70,5 +102,6 @@ export function useStreamingAI() {
     waitingForResponse,
     startRecording,
     endRecording,
+    overlays,
   }
 }
diff --git a/packages/whiteboard/src/videoOverlay.tsx b/packages/whiteboard/src/videoOverlay.tsx
index d47bcb3..33a36df 100644
--- a/packages/whiteboard/src/videoOverlay.tsx
+++ b/packages/whiteboard/src/videoOverlay.tsx
@@ -1,27 +1,5 @@
 import { useRef, useEffect } from "hono/jsx"
-
-export interface TextOverlay {
-  type: "text"
-  x: number
-  y: number
-  text: string
-  fontSize?: number
-  fontFamily?: string
-  color?: string
-  strokeColor?: string
-  strokeWidth?: number
-}
-
-export interface ImageOverlay {
-  type: "image"
-  x: number
-  y: number
-  src: string
-  width?: number
-  height?: number
-}
-
-export type OverlayItem = TextOverlay | ImageOverlay
+import type { ImageOverlay, OverlayItem, TextOverlay } from "./types"
 
 interface VideoOverlayProps {
   overlays: OverlayItem[]
@@ -50,7 +28,17 @@ export function VideoOverlay({ overlays, children, isRecording }: VideoOverlayPr
     canvas.height = rect.height
 
     // Clear canvas
-    ctx.clearRect(0, 0, canvas.width, canvas.height)
+    // ctx.clearRect(0, 0, canvas.width, canvas.height)
+    // ctx.fillStyle = "green"
+    // const xmin = 250
+    // const ymin = 250
+    // const xmax = 750
+    // const ymax = 750
+    // const x = (xmin / 1000) * ctx.canvas.width
+    // const y = (ymin / 1000) * ctx.canvas.height
+    // const width = ((xmax - xmin) / 1000) * ctx.canvas.width
+    // const height = ((ymax - ymin) / 1000) * ctx.canvas.height
+    // ctx.fillRect(x, y, width, height)
 
     // Draw overlays
     for (const overlay of overlays) {
@@ -65,7 +53,6 @@ export function VideoOverlay({ overlays, children, isRecording }: VideoOverlayPr
   // Redraw when overlay data changes or recording state changes
   useEffect(() => {
     setTimeout(() => {
-      console.log(`🌭 `, canvasRef.current?.width, canvasRef.current?.height)
      drawOverlays()
     }, 1000)
   }, [overlays, isRecording])
@@ -78,8 +65,10 @@
 
 const drawText = (ctx: CanvasRenderingContext2D, overlay: TextOverlay) => {
   const {
-    x,
-    y,
+    xmin,
+    ymin,
+    xmax,
+    ymax,
     text,
     fontSize = 20,
     fontFamily = "Arial",
@@ -92,21 +81,26 @@ const drawText = (ctx: CanvasRenderingContext2D, overlay: TextOverlay) => {
   ctx.fillStyle = color
   ctx.strokeStyle = strokeColor
   ctx.lineWidth = strokeWidth
-
+  const x = (xmin / 1000) * ctx.canvas.width
+  const y = (ymin / 1000) * ctx.canvas.height
+  const width = ((xmax - xmin) / 1000) * ctx.canvas.width
+  const height = ((ymax - ymin) / 1000) * ctx.canvas.height
   ctx.strokeText(text, x, y)
   ctx.fillText(text, x, y)
 }
 
 const drawImage = (ctx: CanvasRenderingContext2D, overlay: ImageOverlay) => {
-  const { x, y, src, width, height } = overlay
+  const { xmin, ymin, xmax, ymax, src } = overlay
   const img = new Image()
   img.crossOrigin = "anonymous"
 
   img.onload = () => {
-    const drawWidth = width || img.width
-    const drawHeight = height || img.height
-    ctx.drawImage(img, x, y, drawWidth, drawHeight)
+    const x = (xmin / 1000) * ctx.canvas.width
+    const y = (ymin / 1000) * ctx.canvas.height
+    const width = ((xmax - xmin) / 1000) * ctx.canvas.width
+    const height = ((ymax - ymin) / 1000) * ctx.canvas.height
+    ctx.drawImage(img, x, y, width, height)
   }
 
   img.src = src