it is working

Corey Johnson 2025-07-19 11:50:06 -07:00
parent fe247d7eb1
commit 0721f2ead1
13 changed files with 358 additions and 22 deletions

View File

@@ -115,7 +115,6 @@ export const Form = (props: FormProps) => {
} else if (res.ok) {
const { actionData, loaderData } = (await res.json()) as any
window._setLoaderData!(loaderData)
actionFns.setData(actionData)
} else {
const errorText = await res.text()

Binary file added (897 KiB)

Binary file added (892 KiB)

Binary file added (890 KiB)

Binary file changed (79 KiB → 897 KiB)

View File

@@ -0,0 +1,23 @@
import { tool, RealtimeAgent } from "@openai/agents/realtime"
import { run } from "@openai/agents"
// 1. Define a tool to fetch the latest whiteboard image
const fetchWhiteboard = tool({
name: "fetchWhiteboard",
description: "Fetch the latest whiteboard image and return its bytes",
parameters: undefined,
execute: async () => {
return await Bun.file("public/whiteboard.png").arrayBuffer()
},
})
async function main() {
const agent = new RealtimeAgent({
name: "Spike",
instructions: "When asked to analyze the whiteboard, call fetchWhiteboard",
tools: [fetchWhiteboard],
})
const result = await run(agent, "Hey Spike, analyze the whiteboard.")
console.log("Agent response:", result.finalOutput)
}
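
As rendered, this new file defines main() but never invokes it within the hunk. If the script is meant to be run directly with Bun (an assumption; the file name and entry point are not shown), a minimal invocation would look like:

// Sketch only: kick off the agent run and surface failures.
main().catch((error) => {
  console.error("Agent run failed:", error)
})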

View File

@@ -11,14 +11,14 @@ type Element = {
type StructuredResponse = { elements: Element[] }
export const detectShapes = async (
imgBuffer: ArrayBuffer,
minAreaPercent = 0.5,
maxAreaPercent = 15
imageBuffer: ArrayBuffer,
minAreaPercent = 5,
maxAreaPercent = 33
): Promise<StructuredResponse> => {
const cv = await cvReady
// 1. Decode PNG from ArrayBuffer → raw RGBA buffer
const buf = Buffer.from(imgBuffer)
// 1. Load & decode PNG → raw RGBA buffer
const buf = Buffer.from(imageBuffer)
const { width, height, data } = PNG.sync.read(buf)
// 2. Create a 4-ch Mat from RGBA pixels
@@ -31,6 +31,20 @@ export const detectShapes = async (
const thresh = new cv.Mat()
cv.adaptiveThreshold(gray, thresh, 255, cv.ADAPTIVE_THRESH_GAUSSIAN_C, cv.THRESH_BINARY_INV, 11, 2)
// Morphological opening to remove small noise
const removeNoise = (mat: cvReady.Mat, kSize = 3) => {
const kernel = cv.getStructuringElement(cv.MORPH_RECT, new cv.Size(kSize, kSize))
cv.morphologyEx(mat, mat, cv.MORPH_OPEN, kernel)
kernel.delete()
}
// Morphological closing to bridge gaps in contours
const closeGaps = (mat: cvReady.Mat, kSize = 7) => {
const kernel = cv.getStructuringElement(cv.MORPH_RECT, new cv.Size(kSize, kSize))
cv.morphologyEx(mat, mat, cv.MORPH_CLOSE, kernel)
kernel.delete()
}
removeNoise(thresh, 3)
closeGaps(thresh, 7)
// 4. Find contours
const contours = new cv.MatVector()
@@ -44,15 +58,31 @@ export const detectShapes = async (
for (let i = 0; i < contours.size(); i++) {
const cnt = contours.get(i)
const rect = cv.boundingRect(cnt)
const contourArea = cv.contourArea(cnt)
const areaPercent = (contourArea / totalImageArea) * 100
// Basic filtering
if (areaPercent < minAreaPercent || areaPercent > maxAreaPercent) {
// Skip shapes whose bounding box touches the image border
if (rect.x === 0 || rect.y === 0 || rect.x + rect.width === width || rect.y + rect.height === height) {
// console.log(
// `-- skip: boundingRect touches border rect=(${rect.x},${rect.y},${rect.width},${rect.height})`
// )
cnt.delete()
continue
}
// Calculate area based on bounding box
const rectArea = rect.width * rect.height
const areaPercent = (rectArea / totalImageArea) * 100
// Filter by bounding-box area percentage (both lower and upper bounds)
if (areaPercent < minAreaPercent) {
// cnt.delete()
continue
} else if (areaPercent > maxAreaPercent) {
// cnt.delete()
continue
}
/*
const margin = Math.min(width, height) * 0.05
if (
rect.x < margin ||
@@ -60,9 +90,10 @@ export const detectShapes = async (
rect.x + rect.width > width - margin ||
rect.y + rect.height > height - margin
) {
cnt.delete()
// cnt.delete()
continue
}
*/
// Simple shape classification
const peri = cv.arcLength(cnt, true)
@@ -84,6 +115,12 @@ export const detectShapes = async (
label,
})
console.log(
`-- accepted shape #${i}: ${label} (${rect.x},${rect.y},${rect.width},${
rect.height
}) area=${areaPercent.toFixed(2)}%`
)
cnt.delete()
approx.delete()
}
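
For reference, a minimal way to exercise detectShapes outside the route handler (a sketch; it assumes a Bun runtime, a relative import path like the one used in the route below, and that public/whiteboard.png exists):

// Sketch: run the detector against the saved whiteboard snapshot and list what it found.
// The path and the relative import specifier are assumptions based on the surrounding diff.
import { detectShapes } from "./opencv"

const imageBuffer = await Bun.file("public/whiteboard.png").arrayBuffer()
const { elements } = await detectShapes(imageBuffer) // defaults: minAreaPercent = 5, maxAreaPercent = 33
console.log(`detected ${elements.length} shapes`)
for (const element of elements) {
  console.log(element.label, element)
}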

View File

@@ -5,22 +5,26 @@ import { getGeminiResponse } from "../ai"
import result from "../result.json"
import { detectShapes } from "../opencv"
const categories = ["hand drawn circle", "hand drawn square", "hand drawn arrow"]
const categories = [
"hand drawn circle",
"hand drawn square",
"hand drawn arrow",
"hand drawn triangle",
"hand drawn rectangle",
"hand drawn polygon",
]
const prompts = {
default: `Detect all of the following objects: ${categories}. The box_2d should be an object with ymin, xmin, ymax, xmax properties normalized to 0-1000.`,
simple: `Detect the 2d bounding boxes of the following objects: ${categories}.`,
}
export const action = async (req: Request, params: {}) => {
const url = new URL(req.url)
const imageUrl = new URL("whiteboard.png", url.origin).toString()
const imageResponse = await fetch(imageUrl)
const imageBuffer = await imageResponse.arrayBuffer()
// const response = await getGeminiResponse(imageBuffer, prompts.default)
const imageBuffer = await Bun.file("public/whiteboard.png").arrayBuffer()
const response = await getGeminiResponse(imageBuffer, prompts.default)
// return { elements: response?.elements || [] }
const response = await detectShapes(imageBuffer)
return { elements: response.elements }
// const response = await detectShapes(imageBuffer)
return { elements: response!.elements }
}
export default function Index() {

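The default prompt above asks Gemini for box_2d values with ymin/xmin/ymax/xmax normalized to 0-1000, so a consumer has to scale them back to pixels. A small helper for that (hypothetical; not part of this commit) could look like:

// Sketch: convert a normalized box_2d (0-1000, per prompts.default) into pixel coordinates.
type Box2d = { ymin: number; xmin: number; ymax: number; xmax: number }

const toPixelRect = (box: Box2d, imageWidth: number, imageHeight: number) => ({
  x: (box.xmin / 1000) * imageWidth,
  y: (box.ymin / 1000) * imageHeight,
  width: ((box.xmax - box.xmin) / 1000) * imageWidth,
  height: ((box.ymax - box.ymin) / 1000) * imageHeight,
})
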
View File

@@ -0,0 +1,64 @@
import { Form, useAction } from "@workshop/nano-remix"
import { useEffect, useRef } from "hono/jsx"
import { RealtimeAgent, RealtimeSession } from "@openai/agents/realtime"
import { ensure } from "@workshop/shared/utils"
export const action = async (request: Request) => {
const response = await fetch("https://api.openai.com/v1/realtime/sessions", {
method: "POST",
headers: {
Authorization: `Bearer ${process.env.OPENAI_API_KEY}`,
"Content-Type": "application/json",
},
body: JSON.stringify({
model: "gpt-4o-realtime-preview-2025-06-03",
}),
})
const { client_secret } = await response.json()
return { secret: client_secret?.value }
}
export default function Voice() {
const { data, loading, error } = useAction<typeof action>()
const session = useRef<RealtimeSession | undefined>(undefined)
useEffect(() => {
if (!data?.secret) return
if (session.current) return
session.current = createSession()
session.current.connect({ apiKey: data.secret })
}, [data?.secret])
return (
<div>
{error && <p>Error: {error}</p>}
<p>Ephemeral Key: {loading ? "Loading..." : data?.secret}</p>
<Form name="voiceForm">
<button type="submit">Start Voice Session</button>
</Form>
</div>
)
}
const createSession = () => {
const agent = new RealtimeAgent({
name: "Assistant",
voice: "echo",
instructions: `
You are Spike, and you are helping Corey at the whiteboard. Every question he asks will include a screenshot of the whiteboard. Sometimes his questions will be about the whiteboard, sometimes they will be about other things.
# Voice Tone
You have a very quiet voice and a slight accent that is hard to place.
`,
})
const session = new RealtimeSession(agent)
session.on("error", (error) => {
console.error("Session error:", error)
})
return session
}

View File

@@ -43,9 +43,8 @@ export default function Camera() {
const canvas = canvasRef.current
const video = videoRef.current
// Downscale to max 320x240
const maxWidth = 320
const maxHeight = 240
const maxWidth = 1000
const maxHeight = 1000
const aspectRatio = video.videoWidth / video.videoHeight
let newWidth = maxWidth
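
The remainder of this hunk is not shown; one typical way the aspect-ratio clamp could continue from the lines above (an assumption, not the file's actual code):

// Sketch: keep the video's aspect ratio while fitting inside the new 1000x1000 bounds.
let newHeight = newWidth / aspectRatio
if (newHeight > maxHeight) {
  newHeight = maxHeight
  newWidth = maxHeight * aspectRatio
}
canvas.width = Math.round(newWidth)
canvas.height = Math.round(newHeight)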

View File

@@ -0,0 +1,62 @@
import { useEffect, useRef, useState } from "hono/jsx"
import { StreamingResponse } from "../streamingAI"
export default function Voice() {
const [audioError, setAudioError] = useState<string>("")
const [transcript, setTranscript] = useState<string>("")
const [isRecording, setIsRecording] = useState(false)
const streamingResponseRef = useRef<StreamingResponse>(null)
const startRecording = async () => {
setAudioError("")
setTranscript("")
streamingResponseRef.current = new StreamingResponse((error) => setAudioError(error))
await streamingResponseRef.current.start()
setIsRecording(true)
}
const endRecording = async () => {
setIsRecording(false)
try {
const reader = await streamingResponseRef.current!.stop()
const decoder = new TextDecoder()
while (true) {
const { done, value } = await reader.read()
if (done) break
const chunk = decoder.decode(value, { stream: true })
setTranscript((prev) => prev + chunk)
}
} catch (error) {
console.error("Error during streaming:", error)
setAudioError(`Streaming failed: ${error}`)
}
}
useEffect(() => {
// Only stop on unmount if a recording is actually in progress
return () => { if (streamingResponseRef.current?.getIsRecording()) void endRecording() }
}, [])
return (
<div>
{audioError && <p>Audio Error: {audioError}</p>}
<div>
<h3>Audio Recording</h3>
<button onClick={isRecording ? endRecording : startRecording}>
{isRecording ? "Stop Recording" : "Start Recording"}
</button>
{isRecording && <p>🎤 Recording...</p>}
</div>
{transcript && (
<div>
<h4>Transcript:</h4>
<p>{transcript}</p>
</div>
)}
</div>
)
}

View File

@ -1,4 +1,8 @@
import { nanoRemix } from "@workshop/nano-remix"
import { OpenAI } from "openai"
import { Agent, run, type AgentInputItem } from "@openai/agents"
import fs from "node:fs"
import { getErrorMessage } from "@workshop/shared/errors"
Bun.serve({
port: 3000,
@@ -8,8 +12,74 @@ Bun.serve({
cert: Bun.file("certs/cert.pem"),
},
routes: {
"/api/streamResponse": async (req) => {
try {
return await streamResponse(req) // await so errors from the async handler are caught by this try/catch
} catch (error) {
console.error("Transcription error:", error)
return new Response(`Transcription failed: ${getErrorMessage(error)}`, { status: 500 })
}
},
"/*": (req) => {
return nanoRemix(req)
},
},
})
const openai = new OpenAI({
apiKey: process.env.OPENAI_API_KEY,
})
const streamResponse = async (req: Request) => {
const transcript = await transcribeAudio(req)
const agent = new Agent({
name: "Whiteboard Assistant",
model: "gpt-4o",
instructions: "You are a helpful assistant that talks about a whiteboard.",
})
const imagePath = "public/whiteboard.png"
const base64Image = fs.readFileSync(imagePath, "base64")
const input: AgentInputItem[] = [
{
role: "user",
type: "message",
content: [
{ type: "input_image", image: `data:image/png;base64,${base64Image}` },
{ type: "input_text", text: transcript },
],
},
]
const result = await run(agent, input, { stream: true })
const readableStream = result.toTextStream() as any // This DOES work, but typescript is a little confused so I cast it to any
return new Response(readableStream, {
headers: {
"Content-Type": "text/plain",
"Cache-Control": "no-cache",
Connection: "keep-alive",
},
})
}
const transcribeAudio = async (req: Request) => {
if (req.method !== "POST") {
throw new Error("Method not allowed, only POST is supported")
} else if (!req.body) {
throw new Error("No audio data provided")
}
const response = new Response(req.body)
const audioBlob = await response.blob()
const audioFile = new File([audioBlob], "audio.webm", { type: "audio/webm" })
const transcript = await openai.audio.transcriptions.create({
file: audioFile,
model: "gpt-4o-mini-transcribe",
response_format: "text",
})
return transcript
}
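
Any client can consume this route the same way the StreamingResponse helper below does: POST the recorded audio and read the text/plain stream incrementally. A minimal sketch of that flow (the audio Blob is an assumption, e.g. collected from MediaRecorder):

// Sketch: send audio to the new endpoint and print the streamed reply as it arrives.
declare const audioBlob: Blob // assumed: audio/webm data from a recorder

const res = await fetch("/api/streamResponse", { method: "POST", body: audioBlob })
if (!res.ok) throw new Error(`Server error: ${res.status} - ${await res.text()}`)

const reader = res.body!.getReader()
const decoder = new TextDecoder()
while (true) {
  const { done, value } = await reader.read()
  if (done) break
  console.log(decoder.decode(value, { stream: true }))
}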

View File

@@ -0,0 +1,78 @@
import { getErrorMessage } from "@workshop/shared/errors"
export class StreamingResponse {
private mediaRecorder?: MediaRecorder
private mediaStream?: MediaStream
private audioChunks: Blob[] = []
private isRecording = false
constructor(private onError: (error: string) => void) {}
async start() {
try {
if (this.isRecording) return
this.mediaStream = await navigator.mediaDevices.getUserMedia({
audio: { echoCancellation: true, noiseSuppression: true, sampleRate: 16000 },
})
this.mediaRecorder = new MediaRecorder(this.mediaStream, { mimeType: "audio/webm" })
this.audioChunks = []
this.mediaRecorder.addEventListener("dataavailable", (event) => {
if (event.data.size === 0) return
console.log("Audio chunk received:", event.data.size, "bytes")
this.audioChunks.push(event.data)
})
this.mediaRecorder.addEventListener("stop", async () => {
this.mediaStream?.getTracks().forEach((track) => track.stop())
this.isRecording = false
})
this.mediaRecorder.start(1000)
this.isRecording = true
} catch (error) {
this.mediaRecorder?.stop()
console.error("Error starting recording:", error)
this.onError(`Failed to start recording: ${getErrorMessage(error)}`)
}
}
async stop() {
return new Promise<ReadableStreamDefaultReader<Uint8Array>>((resolve, reject) => {
if (!this.mediaRecorder || !this.isRecording) {
reject("No media recorder is active")
return
}
this.mediaRecorder.addEventListener("stop", async () => {
try {
const audioBlob = new Blob(this.audioChunks, { type: "audio/webm" })
const stream = await this.streamResponse(audioBlob)
resolve(stream)
} catch (error) {
reject(`Failed to process audio stream: ${getErrorMessage(error)}`)
}
})
this.mediaRecorder.stop()
})
}
private async streamResponse(audioBlob: Blob) {
const response = await fetch("/api/streamResponse", { method: "POST", body: audioBlob })
if (!response.ok) {
const errorText = await response.text()
throw new Error(`Server error: ${response.status} - ${errorText}`)
}
return response.body!.getReader()
}
getIsRecording() {
return this.isRecording
}
}