diff --git a/packages/nano-remix/src/clientHelpers.tsx b/packages/nano-remix/src/clientHelpers.tsx
index dc76ad0..d3ef3f4 100644
--- a/packages/nano-remix/src/clientHelpers.tsx
+++ b/packages/nano-remix/src/clientHelpers.tsx
@@ -115,7 +115,6 @@ export const Form = (props: FormProps) => {
     } else if (res.ok) {
       const { actionData, loaderData } = (await res.json()) as any
       window._setLoaderData!(loaderData)
-      actionFns.setData(actionData)
     } else {
       const errorText = await res.text()
diff --git a/packages/whiteboard/public/multi-color.png b/packages/whiteboard/public/multi-color.png
new file mode 100644
index 0000000..2c1c5e7
Binary files /dev/null and b/packages/whiteboard/public/multi-color.png differ
diff --git a/packages/whiteboard/public/red.png b/packages/whiteboard/public/red.png
new file mode 100644
index 0000000..fb968fa
Binary files /dev/null and b/packages/whiteboard/public/red.png differ
diff --git a/packages/whiteboard/public/rough.png b/packages/whiteboard/public/rough.png
new file mode 100644
index 0000000..4a67b73
Binary files /dev/null and b/packages/whiteboard/public/rough.png differ
diff --git a/packages/whiteboard/public/whiteboard.png b/packages/whiteboard/public/whiteboard.png
index 62170c0..2c1c5e7 100644
Binary files a/packages/whiteboard/public/whiteboard.png and b/packages/whiteboard/public/whiteboard.png differ
diff --git a/packages/whiteboard/src/agent.ts b/packages/whiteboard/src/agent.ts
new file mode 100644
index 0000000..217ce14
--- /dev/null
+++ b/packages/whiteboard/src/agent.ts
@@ -0,0 +1,23 @@
+import { tool, RealtimeAgent } from "@openai/agents/realtime"
+import { run } from "@openai/agents"
+
+// 1. Define a tool to fetch the latest whiteboard image
+const fetchWhiteboard = tool({
+  name: "fetchWhiteboard",
+  description: "Fetch the latest whiteboard image and return its bytes",
+  parameters: undefined,
+  execute: async () => {
+    return await Bun.file("public/whiteboard.png").arrayBuffer()
+  },
+})
+
+async function main() {
+  const agent = new RealtimeAgent({
+    name: "Spike",
+    instructions: "When asked to analyze the whiteboard, call fetchWhiteboard",
+    tools: [fetchWhiteboard],
+  })
+
+  const result = await run(agent, "Hey Spike, analyze the whiteboard.")
+  console.log("Agent response:", result.finalOutput)
+}
diff --git a/packages/whiteboard/src/opencv.ts b/packages/whiteboard/src/opencv.ts
index f8039f3..4e891e1 100644
--- a/packages/whiteboard/src/opencv.ts
+++ b/packages/whiteboard/src/opencv.ts
@@ -11,14 +11,14 @@ type Element = {
 type StructuredResponse = { elements: Element[] }
 
 export const detectShapes = async (
-  imgBuffer: ArrayBuffer,
-  minAreaPercent = 0.5,
-  maxAreaPercent = 15
+  imageBuffer: ArrayBuffer,
+  minAreaPercent = 5,
+  maxAreaPercent = 33
 ): Promise<StructuredResponse> => {
   const cv = await cvReady
 
-  // 1. Decode PNG from ArrayBuffer → raw RGBA buffer
-  const buf = Buffer.from(imgBuffer)
+  // 1. Load & decode PNG → raw RGBA buffer
+  const buf = Buffer.from(imageBuffer)
   const { width, height, data } = PNG.sync.read(buf)
 
   // 2. Create a 4-ch Mat from RGBA pixels
@@ -31,6 +31,20 @@ export const detectShapes = async (
 
   const thresh = new cv.Mat()
   cv.adaptiveThreshold(gray, thresh, 255, cv.ADAPTIVE_THRESH_GAUSSIAN_C, cv.THRESH_BINARY_INV, 11, 2)
+  // Morphological opening to remove small noise
+  const removeNoise = (mat: cvReady.Mat, kSize = 3) => {
+    const kernel = cv.getStructuringElement(cv.MORPH_RECT, new cv.Size(kSize, kSize))
+    cv.morphologyEx(mat, mat, cv.MORPH_OPEN, kernel)
+    kernel.delete()
+  }
+  // Morphological closing to bridge gaps in contours
+  const closeGaps = (mat: cvReady.Mat, kSize = 7) => {
+    const kernel = cv.getStructuringElement(cv.MORPH_RECT, new cv.Size(kSize, kSize))
+    cv.morphologyEx(mat, mat, cv.MORPH_CLOSE, kernel)
+    kernel.delete()
+  }
+  removeNoise(thresh, 3)
+  closeGaps(thresh, 7)
 
   // 4. Find contours
   const contours = new cv.MatVector()
@@ -44,15 +58,31 @@ export const detectShapes = async (
   for (let i = 0; i < contours.size(); i++) {
     const cnt = contours.get(i)
     const rect = cv.boundingRect(cnt)
 
-    const contourArea = cv.contourArea(cnt)
-    const areaPercent = (contourArea / totalImageArea) * 100
-    // Basic filtering
-    if (areaPercent < minAreaPercent || areaPercent > maxAreaPercent) {
+    // Skip shapes whose bounding box touches the image border
+    if (rect.x === 0 || rect.y === 0 || rect.x + rect.width === width || rect.y + rect.height === height) {
+      // console.log(
+      //   `-- skip: boundingRect touches border rect=(${rect.x},${rect.y},${rect.width},${rect.height})`
+      // )
       cnt.delete()
       continue
     }
 
+    // Calculate area based on bounding box
+    const rectArea = rect.width * rect.height
+    const areaPercent = (rectArea / totalImageArea) * 100
+
+    // Basic filtering (lower bound only; upper bound filter disabled)
+    if (areaPercent < minAreaPercent) {
+      // cnt.delete()
+      continue
+    } else if (areaPercent > maxAreaPercent) {
+      // cnt.delete()
+      continue
+    }
+    // console.log(`-- upper bound filter disabled (areaPercent=${areaPercent.toFixed(2)} > maxAreaPercent=${maxAreaPercent})`)
+
+    /*
     const margin = Math.min(width, height) * 0.05
     if (
       rect.x < margin ||
@@ -60,9 +90,10 @@ export const detectShapes = async (
       rect.x + rect.width > width - margin ||
       rect.y + rect.height > height - margin
     ) {
-      cnt.delete()
+      // cnt.delete()
       continue
     }
+    */
 
     // Simple shape classification
     const peri = cv.arcLength(cnt, true)
@@ -84,6 +115,12 @@ export const detectShapes = async (
       label,
     })
 
+    console.log(
+      `-- accepted shape #${i}: ${label} (${rect.x},${rect.y},${rect.width},${
+        rect.height
+      }) area=${areaPercent.toFixed(2)}%`
+    )
+
     cnt.delete()
     approx.delete()
   }
diff --git a/packages/whiteboard/src/routes/index.tsx b/packages/whiteboard/src/routes/index.tsx
index 0f6e23e..aba564f 100644
--- a/packages/whiteboard/src/routes/index.tsx
+++ b/packages/whiteboard/src/routes/index.tsx
@@ -5,22 +5,26 @@ import { getGeminiResponse } from "../ai"
 import result from "../result.json"
 import { detectShapes } from "../opencv"
 
-const categories = ["hand drawn circle", "hand drawn square", "hand drawn arrow"]
+const categories = [
+  "hand drawn circle",
+  "hand drawn square",
+  "hand drawn arrow",
+  "hand drawn triangle",
+  "hand drawn rectangle",
+  "hand drawn polygon",
+]
 
 const prompts = {
   default: `Detect all of the following objects: ${categories}.
 The box_2d should be an object with ymin, xmin, ymax, xmax properties normalized to 0-1000.`,
   simple: `Detect the 2d bounding boxes of the following objects: ${categories}.`,
 }
 
 export const action = async (req: Request, params: {}) => {
-  const url = new URL(req.url)
-  const imageUrl = new URL("whiteboard.png", url.origin).toString()
-  const imageResponse = await fetch(imageUrl)
-  const imageBuffer = await imageResponse.arrayBuffer()
-  // const response = await getGeminiResponse(imageBuffer, prompts.default)
+  const imageBuffer = await Bun.file("public/whiteboard.png").arrayBuffer()
+  const response = await getGeminiResponse(imageBuffer, prompts.default)
   // return { elements: response?.elements || [] }
-  const response = await detectShapes(imageBuffer)
-  return { elements: response.elements }
+  // const response = await detectShapes(imageBuffer)
+  return { elements: response!.elements }
 }
 
 export default function Index() {
diff --git a/packages/whiteboard/src/routes/realtime.tsx b/packages/whiteboard/src/routes/realtime.tsx
new file mode 100644
index 0000000..932471a
--- /dev/null
+++ b/packages/whiteboard/src/routes/realtime.tsx
@@ -0,0 +1,64 @@
+import { Form, useAction } from "@workshop/nano-remix"
+import { useEffect, useRef } from "hono/jsx"
+import { RealtimeAgent, RealtimeSession } from "@openai/agents/realtime"
+import { ensure } from "@workshop/shared/utils"
+
+export const action = async (request: Request) => {
+  const response = await fetch("https://api.openai.com/v1/realtime/sessions", {
+    method: "POST",
+    headers: {
+      Authorization: `Bearer ${process.env.OPENAI_API_KEY}`,
+      "Content-Type": "application/json",
+    },
+    body: JSON.stringify({
+      model: "gpt-4o-realtime-preview-2025-06-03",
+    }),
+  })
+  const { client_secret } = await response.json()
+
+  return { secret: client_secret?.value }
+}
+
+export default function Voice() {
+  const { data, loading, error } = useAction()
+  const session = useRef<RealtimeSession | undefined>(undefined)
+
+  useEffect(() => {
+    if (!data?.secret) return
+    if (session.current) return
+
+    session.current = createSession()
+    session.current.connect({ apiKey: data.secret })
+  }, [data?.secret])
+
+  return (
+    <div>
+      {error &&
+        <div>
+          Error: {error}
+        </div>
+      }
+      <div>
+        Ephemeral Key: {loading ? "Loading..." : data?.secret}
+      </div>
+      <Form>
+        <button type="submit">Get Ephemeral Key</button>
+      </Form>
+    </div>
+  )
+}
+
+const createSession = () => {
+  const agent = new RealtimeAgent({
+    name: "Assistant",
+    voice: "echo",
+    instructions: `
+You are Spike, and you are helping Corey at the whiteboard. Every question he asks will include a screenshot of the whiteboard. Sometimes his questions will be about the whiteboard, sometimes they will be about other things.
+
+# Voice Tone
+
+You have a very quiet voice and a slight accent that is hard to place.
+
+`,
+  })
+  const session = new RealtimeSession(agent)
+  session.on("error", (error) => {
+    console.error("Session error:", error)
+  })
+
+  return session
+}
diff --git a/packages/whiteboard/src/routes/upload.tsx b/packages/whiteboard/src/routes/upload.tsx
index d40a264..5e0376b 100644
--- a/packages/whiteboard/src/routes/upload.tsx
+++ b/packages/whiteboard/src/routes/upload.tsx
@@ -43,9 +43,8 @@ export default function Camera() {
     const canvas = canvasRef.current
     const video = videoRef.current
 
-    // Downscale to max 320x240
-    const maxWidth = 320
-    const maxHeight = 240
+    const maxWidth = 1000
+    const maxHeight = 1000
     const aspectRatio = video.videoWidth / video.videoHeight
 
     let newWidth = maxWidth
diff --git a/packages/whiteboard/src/routes/voice.tsx b/packages/whiteboard/src/routes/voice.tsx
new file mode 100644
index 0000000..a3764ea
--- /dev/null
+++ b/packages/whiteboard/src/routes/voice.tsx
@@ -0,0 +1,62 @@
+import { useEffect, useRef, useState } from "hono/jsx"
+import { StreamingResponse } from "../streamingAI"
+
+export default function Voice() {
+  const [audioError, setAudioError] = useState("")
+  const [transcript, setTranscript] = useState("")
+  const [isRecording, setIsRecording] = useState(false)
+  const streamingResponseRef = useRef<StreamingResponse | null>(null)
+
+  const startRecording = async () => {
+    setAudioError("")
+    setTranscript("")
+    streamingResponseRef.current = new StreamingResponse((error) => setAudioError(error))
+    await streamingResponseRef.current.start()
+    setIsRecording(true)
+  }
+
+  const endRecording = async () => {
+    setIsRecording(false)
+    try {
+      const reader = await streamingResponseRef.current!.stop()
+      const decoder = new TextDecoder()
+
+      while (true) {
+        const { done, value } = await reader.read()
+        if (done) break
+
+        const chunk = decoder.decode(value, { stream: true })
+        setTranscript((prev) => prev + chunk)
+      }
+    } catch (error) {
+      console.error("Error during streaming:", error)
+      setAudioError(`Streaming failed: ${error}`)
+    }
+  }
+
+  useEffect(() => {
+    return () => endRecording()
+  }, [])
+
+  return (
+    <div>
+      {audioError &&
+        <div>
+          Audio Error: {audioError}
+        </div>
+      }
+
+      <div>
+        <div>
+          Audio Recording
+        </div>
+        <button onClick={startRecording} disabled={isRecording}>Start Recording</button>
+        <button onClick={endRecording} disabled={!isRecording}>Stop Recording</button>
+        {isRecording &&
+          <div>
+            🎤 Recording...
+          </div>
+        }
+      </div>
+
+      {transcript && (
+        <div>
+          <div>
+            Transcript:
+          </div>
+          <div>
+            {transcript}
+          </div>
+        </div>
+      )}
+    </div>
+  )
+}
diff --git a/packages/whiteboard/src/server.ts b/packages/whiteboard/src/server.ts
index f6d4597..e73b932 100644
--- a/packages/whiteboard/src/server.ts
+++ b/packages/whiteboard/src/server.ts
@@ -1,4 +1,8 @@
 import { nanoRemix } from "@workshop/nano-remix"
+import { OpenAI } from "openai"
+import { Agent, run, type AgentInputItem } from "@openai/agents"
+import fs from "node:fs"
+import { getErrorMessage } from "@workshop/shared/errors"
 
 Bun.serve({
   port: 3000,
@@ -8,8 +12,74 @@ Bun.serve({
     cert: Bun.file("certs/cert.pem"),
   },
   routes: {
+    "/api/streamResponse": async (req) => {
+      try {
+        return await streamResponse(req)
+      } catch (error) {
+        console.error("Transcription error:", error)
+        return new Response(`Transcription failed: ${getErrorMessage(error)}`, { status: 500 })
+      }
+    },
     "/*": (req) => {
       return nanoRemix(req)
     },
   },
 })
+
+const openai = new OpenAI({
+  apiKey: process.env.OPENAI_API_KEY,
+})
+
+const streamResponse = async (req: Request) => {
+  const transcript = await transcribeAudio(req)
+
+  const agent = new Agent({
+    name: "Whiteboard Assistant",
+    model: "gpt-4o",
+    instructions: "You are a helpful assistant that talks about a whiteboard.",
+  })
+
+  const imagePath = "public/whiteboard.png"
+  const base64Image = fs.readFileSync(imagePath, "base64")
+  const input: AgentInputItem[] = [
+    {
+      role: "user",
+      type: "message",
+      content: [
+        { type: "input_image", image: `data:image/png;base64,${base64Image}` },
+        { type: "input_text", text: transcript },
+      ],
+    },
+  ]
+
+  const result = await run(agent, input, { stream: true })
+  const readableStream = result.toTextStream() as any // This DOES work, but TypeScript is a little confused, so I cast it to any
+
+  return new Response(readableStream, {
+    headers: {
+      "Content-Type": "text/plain",
+      "Cache-Control": "no-cache",
+      Connection: "keep-alive",
+    },
+  })
+}
+
+const transcribeAudio = async (req: Request) => {
+  if (req.method !== "POST") {
+    throw new Error("Method not allowed, only POST is supported")
+  } else if (!req.body) {
+    throw new Error("No audio data provided")
+  }
+
+  const response = new Response(req.body)
+  const audioBlob = await response.blob()
+  const audioFile = new File([audioBlob], "audio.webm", { type: "audio/webm" })
+
+  const transcript = await openai.audio.transcriptions.create({
+    file: audioFile,
+    model: "gpt-4o-mini-transcribe",
+    response_format: "text",
+  })
+
+  return transcript
+}
diff --git a/packages/whiteboard/src/streamingAI.ts b/packages/whiteboard/src/streamingAI.ts
new file mode 100644
index 0000000..d6ffc55
--- /dev/null
+++ b/packages/whiteboard/src/streamingAI.ts
@@ -0,0 +1,78 @@
+import { getErrorMessage } from "@workshop/shared/errors"
+
+export class StreamingResponse {
+  private mediaRecorder?: MediaRecorder
+  private mediaStream?: MediaStream
+  private audioChunks: Blob[] = []
+  private isRecording = false
+
+  constructor(private onError: (error: string) => void) {}
+
+  async start() {
+    try {
+      if (this.isRecording) return
+
+      this.mediaStream = await navigator.mediaDevices.getUserMedia({
+        audio: { echoCancellation: true, noiseSuppression: true, sampleRate: 16000 },
+      })
+
+      this.mediaRecorder = new MediaRecorder(this.mediaStream, { mimeType: "audio/webm" })
+      this.audioChunks = []
+
+      this.mediaRecorder.addEventListener("dataavailable", (event) => {
+        if (event.data.size === 0) return
+        console.log("Audio chunk received:", event.data.size, "bytes")
+        this.audioChunks.push(event.data)
+      })
+
+      this.mediaRecorder.addEventListener("stop", async () => {
+        this.mediaStream?.getTracks().forEach((track) => track.stop())
+        this.isRecording = false
+      })
+
+      this.mediaRecorder.start(1000)
+      this.isRecording = true
+    } catch (error) {
+      this.mediaRecorder?.stop()
+
+      console.error("Error starting recording:", error)
+      this.onError(`Failed to start recording: ${getErrorMessage(error)}`)
+    }
+  }
+
+  async stop() {
+    return new Promise<ReadableStreamDefaultReader<Uint8Array>>((resolve, reject) => {
+      if (!this.mediaRecorder || !this.isRecording) {
+        reject("No media recorder is active")
+        return
+      }
+
+      this.mediaRecorder.addEventListener("stop", async () => {
+        try {
+          const audioBlob = new Blob(this.audioChunks, { type: "audio/webm" })
+          const stream = await this.streamResponse(audioBlob)
+          resolve(stream)
+        } catch (error) {
+          reject(`Failed to process audio stream: ${getErrorMessage(error)}`)
+        }
+      })
+
+      this.mediaRecorder.stop()
+    })
+  }
+
+  private async streamResponse(audioBlob: Blob) {
+    const response = await fetch("/api/streamResponse", { method: "POST", body: audioBlob })
+
+    if (!response.ok) {
+      const errorText = await response.text()
+      throw new Error(`Server error: ${response.status} - ${errorText}`)
+    }
+
+    return response.body!.getReader()
+  }
+
+  getIsRecording() {
+    return this.isRecording
+  }
+}