it is working

Corey Johnson 2025-07-19 11:50:06 -07:00
parent fe247d7eb1
commit 0721f2ead1
13 changed files with 358 additions and 22 deletions

View File

@@ -115,7 +115,6 @@ export const Form = (props: FormProps) => {
} else if (res.ok) {
const { actionData, loaderData } = (await res.json()) as any
window._setLoaderData!(loaderData)
actionFns.setData(actionData)
} else {
const errorText = await res.text()

Binary file added (897 KiB)

Binary file added (892 KiB)

Binary file added (890 KiB)

Binary file changed (79 KiB → 897 KiB)

View File

@@ -0,0 +1,23 @@
import { tool, RealtimeAgent } from "@openai/agents/realtime"
import { run } from "@openai/agents"
// 1. Define a tool to fetch the latest whiteboard image
const fetchWhiteboard = tool({
name: "fetchWhiteboard",
description: "Fetch the latest whiteboard image and return its bytes",
parameters: undefined,
execute: async () => {
return await Bun.file("public/whiteboard.png").arrayBuffer()
},
})
async function main() {
const agent = new RealtimeAgent({
name: "Spike",
instructions: "When asked to analyze the whiteboard, call fetchWhiteboard",
tools: [fetchWhiteboard],
})
const result = await run(agent, "Hey Spike, analyze the whiteboard.")
console.log("Agent response:", result.finalOutput)
}
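
As rendered, this new file defines main() but never invokes it within the hunk. If the script is meant to be run directly with Bun (an assumption; the file name and entry point are not shown), a minimal invocation would look like:

// Sketch only: kick off the agent run and surface failures.
main().catch((error) => {
  console.error("Agent run failed:", error)
})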

View File

@@ -11,14 +11,14 @@ type Element = {
type StructuredResponse = { elements: Element[] }
export const detectShapes = async (
imgBuffer: ArrayBuffer,
minAreaPercent = 0.5,
maxAreaPercent = 15
imageBuffer: ArrayBuffer,
minAreaPercent = 5,
maxAreaPercent = 33
): Promise<StructuredResponse> => {
const cv = await cvReady
// 1. Decode PNG from ArrayBuffer → raw RGBA buffer
const buf = Buffer.from(imgBuffer)
// 1. Load & decode PNG → raw RGBA buffer
const buf = Buffer.from(imageBuffer)
const { width, height, data } = PNG.sync.read(buf)
// 2. Create a 4-ch Mat from RGBA pixels
@@ -31,6 +31,20 @@ export const detectShapes = async (
const thresh = new cv.Mat()
cv.adaptiveThreshold(gray, thresh, 255, cv.ADAPTIVE_THRESH_GAUSSIAN_C, cv.THRESH_BINARY_INV, 11, 2)
// Morphological opening to remove small noise
const removeNoise = (mat: cvReady.Mat, kSize = 3) => {
const kernel = cv.getStructuringElement(cv.MORPH_RECT, new cv.Size(kSize, kSize))
cv.morphologyEx(mat, mat, cv.MORPH_OPEN, kernel)
kernel.delete()
}
// Morphological closing to bridge gaps in contours
const closeGaps = (mat: cvReady.Mat, kSize = 7) => {
const kernel = cv.getStructuringElement(cv.MORPH_RECT, new cv.Size(kSize, kSize))
cv.morphologyEx(mat, mat, cv.MORPH_CLOSE, kernel)
kernel.delete()
}
removeNoise(thresh, 3)
closeGaps(thresh, 7)
// 4. Find contours
const contours = new cv.MatVector()
@@ -44,15 +58,31 @@ export const detectShapes = async (
for (let i = 0; i < contours.size(); i++) {
const cnt = contours.get(i)
const rect = cv.boundingRect(cnt)
const contourArea = cv.contourArea(cnt)
const areaPercent = (contourArea / totalImageArea) * 100
// Basic filtering
if (areaPercent < minAreaPercent || areaPercent > maxAreaPercent) {
// Skip shapes whose bounding box touches the image border
if (rect.x === 0 || rect.y === 0 || rect.x + rect.width === width || rect.y + rect.height === height) {
// console.log(
// `-- skip: boundingRect touches border rect=(${rect.x},${rect.y},${rect.width},${rect.height})`
// )
cnt.delete()
continue
}
// Calculate area based on bounding box
const rectArea = rect.width * rect.height
const areaPercent = (rectArea / totalImageArea) * 100
// Filter by bounding-box area percentage (both lower and upper bounds)
if (areaPercent < minAreaPercent) {
// cnt.delete()
continue
} else if (areaPercent > maxAreaPercent) {
// cnt.delete()
continue
}
/*
const margin = Math.min(width, height) * 0.05
if (
rect.x < margin ||
@@ -60,9 +90,10 @@ export const detectShapes = async (
rect.x + rect.width > width - margin ||
rect.y + rect.height > height - margin
) {
cnt.delete()
// cnt.delete()
continue
}
*/
// Simple shape classification
const peri = cv.arcLength(cnt, true)
@@ -84,6 +115,12 @@ export const detectShapes = async (
label,
})
console.log(
`-- accepted shape #${i}: ${label} (${rect.x},${rect.y},${rect.width},${
rect.height
}) area=${areaPercent.toFixed(2)}%`
)
cnt.delete()
approx.delete()
}
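
For reference, a minimal way to exercise detectShapes outside the route handler (a sketch; it assumes a Bun runtime, a relative import path like the one used in the route below, and that public/whiteboard.png exists):

// Sketch: run the detector against the saved whiteboard snapshot and list what it found.
// The path and the relative import specifier are assumptions based on the surrounding diff.
import { detectShapes } from "./opencv"

const imageBuffer = await Bun.file("public/whiteboard.png").arrayBuffer()
const { elements } = await detectShapes(imageBuffer) // defaults: minAreaPercent = 5, maxAreaPercent = 33
console.log(`detected ${elements.length} shapes`)
for (const element of elements) {
  console.log(element.label, element)
}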

View File

@@ -5,22 +5,26 @@ import { getGeminiResponse } from "../ai"
import result from "../result.json"
import { detectShapes } from "../opencv"
const categories = ["hand drawn circle", "hand drawn square", "hand drawn arrow"]
const categories = [
"hand drawn circle",
"hand drawn square",
"hand drawn arrow",
"hand drawn triangle",
"hand drawn rectangle",
"hand drawn polygon",
]
const prompts = {
default: `Detect all of the following objects: ${categories}. The box_2d should be an object with ymin, xmin, ymax, xmax properties normalized to 0-1000.`,
simple: `Detect the 2d bounding boxes of the following objects: ${categories}.`,
}
export const action = async (req: Request, params: {}) => {
const url = new URL(req.url)
const imageUrl = new URL("whiteboard.png", url.origin).toString()
const imageResponse = await fetch(imageUrl)
const imageBuffer = await imageResponse.arrayBuffer()
// const response = await getGeminiResponse(imageBuffer, prompts.default)
const imageBuffer = await Bun.file("public/whiteboard.png").arrayBuffer()
const response = await getGeminiResponse(imageBuffer, prompts.default)
// return { elements: response?.elements || [] }
const response = await detectShapes(imageBuffer)
return { elements: response.elements }
// const response = await detectShapes(imageBuffer)
return { elements: response!.elements }
}
export default function Index() {

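The default prompt above asks Gemini for box_2d values with ymin/xmin/ymax/xmax normalized to 0-1000, so a consumer has to scale them back to pixels. A small helper for that (hypothetical; not part of this commit) could look like:

// Sketch: convert a normalized box_2d (0-1000, per prompts.default) into pixel coordinates.
type Box2d = { ymin: number; xmin: number; ymax: number; xmax: number }

const toPixelRect = (box: Box2d, imageWidth: number, imageHeight: number) => ({
  x: (box.xmin / 1000) * imageWidth,
  y: (box.ymin / 1000) * imageHeight,
  width: ((box.xmax - box.xmin) / 1000) * imageWidth,
  height: ((box.ymax - box.ymin) / 1000) * imageHeight,
})
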
View File

@@ -0,0 +1,64 @@
import { Form, useAction } from "@workshop/nano-remix"
import { useEffect, useRef } from "hono/jsx"
import { RealtimeAgent, RealtimeSession } from "@openai/agents/realtime"
import { ensure } from "@workshop/shared/utils"
export const action = async (request: Request) => {
const response = await fetch("https://api.openai.com/v1/realtime/sessions", {
method: "POST",
headers: {
Authorization: `Bearer ${process.env.OPENAI_API_KEY}`,
"Content-Type": "application/json",
},
body: JSON.stringify({
model: "gpt-4o-realtime-preview-2025-06-03",
}),
})
const { client_secret } = await response.json()
return { secret: client_secret?.value }
}
export default function Voice() {
const { data, loading, error } = useAction<typeof action>()
const session = useRef<RealtimeSession | undefined>(undefined)
useEffect(() => {
if (!data?.secret) return
if (session.current) return
session.current = createSession()
session.current.connect({ apiKey: data.secret })
}, [data?.secret])
return (
<div>
{error && <p>Error: {error}</p>}
<p>Ephemeral Key: {loading ? "Loading..." : data?.secret}</p>
<Form name="voiceForm">
<button type="submit">Start Voice Session</button>
</Form>
</div>
)
}
const createSession = () => {
const agent = new RealtimeAgent({
name: "Assistant",
voice: "echo",
instructions: `
You are Spike, and you are helping Corey at the whiteboard. Every question he asks will include a screenshot of the whiteboard. Sometimes his questions will be about the whiteboard, sometimes they will be about other things.
# Voice Tone
You have a very quiet voice and a slight accent that is hard to place.
`,
})
const session = new RealtimeSession(agent)
session.on("error", (error) => {
console.error("Session error:", error)
})
return session
}

View File

@@ -43,9 +43,8 @@ export default function Camera() {
const canvas = canvasRef.current
const video = videoRef.current
// Downscale to max 320x240
const maxWidth = 320
const maxHeight = 240
const maxWidth = 1000
const maxHeight = 1000
const aspectRatio = video.videoWidth / video.videoHeight
let newWidth = maxWidth
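
The remainder of this hunk is not shown; one typical way the aspect-ratio clamp could continue from the lines above (an assumption, not the file's actual code):

// Sketch: keep the video's aspect ratio while fitting inside the new 1000x1000 bounds.
let newHeight = newWidth / aspectRatio
if (newHeight > maxHeight) {
  newHeight = maxHeight
  newWidth = maxHeight * aspectRatio
}
canvas.width = Math.round(newWidth)
canvas.height = Math.round(newHeight)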

View File

@@ -0,0 +1,62 @@
import { useEffect, useRef, useState } from "hono/jsx"
import { StreamingResponse } from "../streamingAI"
export default function Voice() {
const [audioError, setAudioError] = useState<string>("")
const [transcript, setTranscript] = useState<string>("")
const [isRecording, setIsRecording] = useState(false)
const streamingResponseRef = useRef<StreamingResponse>(null)
const startRecording = async () => {
setAudioError("")
setTranscript("")
streamingResponseRef.current = new StreamingResponse((error) => setAudioError(error))
await streamingResponseRef.current.start()
setIsRecording(true)
}
const endRecording = async () => {
setIsRecording(false)
try {
const reader = await streamingResponseRef.current!.stop()
const decoder = new TextDecoder()
while (true) {
const { done, value } = await reader.read()
if (done) break
const chunk = decoder.decode(value, { stream: true })
setTranscript((prev) => prev + chunk)
}
} catch (error) {
console.error("Error during streaming:", error)
setAudioError(`Streaming failed: ${error}`)
}
}
useEffect(() => {
// Only stop on unmount if a recording is actually in progress
return () => { if (streamingResponseRef.current?.getIsRecording()) void endRecording() }
}, [])
return (
<div>
{audioError && <p>Audio Error: {audioError}</p>}
<div>
<h3>Audio Recording</h3>
<button onClick={isRecording ? endRecording : startRecording}>
{isRecording ? "Stop Recording" : "Start Recording"}
</button>
{isRecording && <p>🎤 Recording...</p>}
</div>
{transcript && (
<div>
<h4>Transcript:</h4>
<p>{transcript}</p>
</div>
)}
</div>
)
}

View File

@ -1,4 +1,8 @@
import { nanoRemix } from "@workshop/nano-remix"
import { OpenAI } from "openai"
import { Agent, run, type AgentInputItem } from "@openai/agents"
import fs from "node:fs"
import { getErrorMessage } from "@workshop/shared/errors"
Bun.serve({
port: 3000,
@@ -8,8 +12,74 @@ Bun.serve({
cert: Bun.file("certs/cert.pem"),
},
routes: {
"/api/streamResponse": async (req) => {
try {
return await streamResponse(req) // await so errors from the async handler are caught by this try/catch
} catch (error) {
console.error("Transcription error:", error)
return new Response(`Transcription failed: ${getErrorMessage(error)}`, { status: 500 })
}
},
"/*": (req) => {
return nanoRemix(req)
},
},
})
const openai = new OpenAI({
apiKey: process.env.OPENAI_API_KEY,
})
const streamResponse = async (req: Request) => {
const transcript = await transcribeAudio(req)
const agent = new Agent({
name: "Whiteboard Assistant",
model: "gpt-4o",
instructions: "You are a helpful assistant that talks about a whiteboard.",
})
const imagePath = "public/whiteboard.png"
const base64Image = fs.readFileSync(imagePath, "base64")
const input: AgentInputItem[] = [
{
role: "user",
type: "message",
content: [
{ type: "input_image", image: `data:image/png;base64,${base64Image}` },
{ type: "input_text", text: transcript },
],
},
]
const result = await run(agent, input, { stream: true })
const readableStream = result.toTextStream() as any // This DOES work, but typescript is a little confused so I cast it to any
return new Response(readableStream, {
headers: {
"Content-Type": "text/plain",
"Cache-Control": "no-cache",
Connection: "keep-alive",
},
})
}
const transcribeAudio = async (req: Request) => {
if (req.method !== "POST") {
throw new Error("Method not allowed, only POST is supported")
} else if (!req.body) {
throw new Error("No audio data provided")
}
const response = new Response(req.body)
const audioBlob = await response.blob()
const audioFile = new File([audioBlob], "audio.webm", { type: "audio/webm" })
const transcript = await openai.audio.transcriptions.create({
file: audioFile,
model: "gpt-4o-mini-transcribe",
response_format: "text",
})
return transcript
}
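
Any client can consume this route the same way the StreamingResponse helper below does: POST the recorded audio and read the text/plain stream incrementally. A minimal sketch of that flow (the audio Blob is an assumption, e.g. collected from MediaRecorder):

// Sketch: send audio to the new endpoint and print the streamed reply as it arrives.
declare const audioBlob: Blob // assumed: audio/webm data from a recorder

const res = await fetch("/api/streamResponse", { method: "POST", body: audioBlob })
if (!res.ok) throw new Error(`Server error: ${res.status} - ${await res.text()}`)

const reader = res.body!.getReader()
const decoder = new TextDecoder()
while (true) {
  const { done, value } = await reader.read()
  if (done) break
  console.log(decoder.decode(value, { stream: true }))
}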

View File

@@ -0,0 +1,78 @@
import { getErrorMessage } from "@workshop/shared/errors"
export class StreamingResponse {
private mediaRecorder?: MediaRecorder
private mediaStream?: MediaStream
private audioChunks: Blob[] = []
private isRecording = false
constructor(private onError: (error: string) => void) {}
async start() {
try {
if (this.isRecording) return
this.mediaStream = await navigator.mediaDevices.getUserMedia({
audio: { echoCancellation: true, noiseSuppression: true, sampleRate: 16000 },
})
this.mediaRecorder = new MediaRecorder(this.mediaStream, { mimeType: "audio/webm" })
this.audioChunks = []
this.mediaRecorder.addEventListener("dataavailable", (event) => {
if (event.data.size === 0) return
console.log("Audio chunk received:", event.data.size, "bytes")
this.audioChunks.push(event.data)
})
this.mediaRecorder.addEventListener("stop", async () => {
this.mediaStream?.getTracks().forEach((track) => track.stop())
this.isRecording = false
})
this.mediaRecorder.start(1000)
this.isRecording = true
} catch (error) {
this.mediaRecorder?.stop()
console.error("Error starting recording:", error)
this.onError(`Failed to start recording: ${getErrorMessage(error)}`)
}
}
async stop() {
return new Promise<ReadableStreamDefaultReader<Uint8Array>>((resolve, reject) => {
if (!this.mediaRecorder || !this.isRecording) {
reject("No media recorder is active")
return
}
this.mediaRecorder.addEventListener("stop", async () => {
try {
const audioBlob = new Blob(this.audioChunks, { type: "audio/webm" })
const stream = await this.streamResponse(audioBlob)
resolve(stream)
} catch (error) {
reject(`Failed to process audio stream: ${getErrorMessage(error)}`)
}
})
this.mediaRecorder.stop()
})
}
private async streamResponse(audioBlob: Blob) {
const response = await fetch("/api/streamResponse", { method: "POST", body: audioBlob })
if (!response.ok) {
const errorText = await response.text()
throw new Error(`Server error: ${response.status} - ${errorText}`)
}
return response.body!.getReader()
}
getIsRecording() {
return this.isRecording
}
}