it is working
commit 0721f2ead1
parent fe247d7eb1
@@ -115,7 +115,6 @@ export const Form = (props: FormProps) => {
     } else if (res.ok) {
       const { actionData, loaderData } = (await res.json()) as any
       window._setLoaderData!(loaderData)
-
       actionFns.setData(actionData)
     } else {
       const errorText = await res.text()
BIN  packages/whiteboard/public/multi-color.png  (new file)
Binary file not shown.  After: 897 KiB
BIN  packages/whiteboard/public/red.png  (new file)
Binary file not shown.  After: 892 KiB
BIN  packages/whiteboard/public/rough.png  (new file)
Binary file not shown.  After: 890 KiB
BIN
Binary file not shown.  Before: 79 KiB  After: 897 KiB
23  packages/whiteboard/src/agent.ts  (new file)
@@ -0,0 +1,23 @@
+import { tool, RealtimeAgent } from "@openai/agents/realtime"
+import { run } from "@openai/agents"
+
+// 1. Define a tool to fetch the latest whiteboard image
+const fetchWhiteboard = tool({
+  name: "fetchWhiteboard",
+  description: "Fetch the latest whiteboard image and return its bytes",
+  parameters: undefined,
+  execute: async () => {
+    return await Bun.file("public/whiteboard.png").arrayBuffer()
+  },
+})
+
+async function main() {
+  const agent = new RealtimeAgent({
+    name: "Spike",
+    instructions: "When asked to analyze the whiteboard, call fetchWhiteboard",
+    tools: [fetchWhiteboard],
+  })
+
+  const result = await run(agent, "Hey Spike, analyze the whiteboard.")
+  console.log("Agent response:", result.finalOutput)
+}
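Note that agent.ts defines main() but never invokes it. To exercise the file directly under Bun, an entry call along these lines would be needed (a sketch, not part of the commit):

main().catch((error) => {
  // Surface agent failures instead of silently swallowing the rejection
  console.error("Agent run failed:", error)
  process.exit(1)
})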
@@ -11,14 +11,14 @@ type Element = {
 type StructuredResponse = { elements: Element[] }
 
 export const detectShapes = async (
-  imgBuffer: ArrayBuffer,
-  minAreaPercent = 0.5,
-  maxAreaPercent = 15
+  imageBuffer: ArrayBuffer,
+  minAreaPercent = 5,
+  maxAreaPercent = 33
 ): Promise<StructuredResponse> => {
   const cv = await cvReady
 
-  // 1. Decode PNG from ArrayBuffer → raw RGBA buffer
-  const buf = Buffer.from(imgBuffer)
+  // 1. Load & decode PNG → raw RGBA buffer
+  const buf = Buffer.from(imageBuffer)
   const { width, height, data } = PNG.sync.read(buf)
 
   // 2. Create a 4-ch Mat from RGBA pixels
@@ -31,6 +31,20 @@ export const detectShapes = async (
 
   const thresh = new cv.Mat()
   cv.adaptiveThreshold(gray, thresh, 255, cv.ADAPTIVE_THRESH_GAUSSIAN_C, cv.THRESH_BINARY_INV, 11, 2)
+  // Morphological opening to remove small noise
+  const removeNoise = (mat: cvReady.Mat, kSize = 3) => {
+    const kernel = cv.getStructuringElement(cv.MORPH_RECT, new cv.Size(kSize, kSize))
+    cv.morphologyEx(mat, mat, cv.MORPH_OPEN, kernel)
+    kernel.delete()
+  }
+  // Morphological closing to bridge gaps in contours
+  const closeGaps = (mat: cvReady.Mat, kSize = 7) => {
+    const kernel = cv.getStructuringElement(cv.MORPH_RECT, new cv.Size(kSize, kSize))
+    cv.morphologyEx(mat, mat, cv.MORPH_CLOSE, kernel)
+    kernel.delete()
+  }
+  removeNoise(thresh, 3)
+  closeGaps(thresh, 7)
 
   // 4. Find contours
   const contours = new cv.MatVector()
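The removeNoise/closeGaps pair added above is standard morphological opening and closing. As a minimal illustration of the idea on a tiny binary grid (plain TypeScript, not how OpenCV implements it):

type Grid = number[][] // 0 = background, 1 = foreground

const morph = (grid: Grid, op: "erode" | "dilate", k = 3): Grid => {
  const r = Math.floor(k / 2)
  return grid.map((row, y) =>
    row.map((_, x) => {
      const values: number[] = []
      for (let dy = -r; dy <= r; dy++)
        for (let dx = -r; dx <= r; dx++)
          values.push(grid[y + dy]?.[x + dx] ?? 0) // out-of-bounds treated as background
      // Erosion keeps a pixel only if the whole kernel window is foreground;
      // dilation keeps it if any pixel in the window is foreground.
      return op === "erode" ? Math.min(...values) : Math.max(...values)
    })
  )
}

// Opening (erode then dilate) removes specks smaller than the kernel;
// closing (dilate then erode) bridges small gaps in strokes.
const opening = (g: Grid, k = 3) => morph(morph(g, "erode", k), "dilate", k)
const closing = (g: Grid, k = 7) => morph(morph(g, "dilate", k), "erode", k)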
@@ -44,15 +58,31 @@ export const detectShapes = async (
   for (let i = 0; i < contours.size(); i++) {
     const cnt = contours.get(i)
     const rect = cv.boundingRect(cnt)
-    const contourArea = cv.contourArea(cnt)
-    const areaPercent = (contourArea / totalImageArea) * 100
-
-    // Basic filtering
-    if (areaPercent < minAreaPercent || areaPercent > maxAreaPercent) {
+    // Skip shapes whose bounding box touches the image border
+    if (rect.x === 0 || rect.y === 0 || rect.x + rect.width === width || rect.y + rect.height === height) {
+      // console.log(
+      //   `-- skip: boundingRect touches border rect=(${rect.x},${rect.y},${rect.width},${rect.height})`
+      // )
       cnt.delete()
       continue
     }
 
+    // Calculate area based on bounding box
+    const rectArea = rect.width * rect.height
+    const areaPercent = (rectArea / totalImageArea) * 100
+
+    // Basic filtering (lower bound only; upper bound filter disabled)
+    if (areaPercent < minAreaPercent) {
+      // cnt.delete()
+      continue
+    } else if (areaPercent > maxAreaPercent) {
+      // cnt.delete()
+      continue
+    }
+    // console.log(`-- upper bound filter disabled (areaPercent=${areaPercent.toFixed(2)} > maxAreaPercent=${maxAreaPercent})`)
+
+    /*
     const margin = Math.min(width, height) * 0.05
     if (
       rect.x < margin ||
@@ -60,9 +90,10 @@ export const detectShapes = async (
       rect.x + rect.width > width - margin ||
       rect.y + rect.height > height - margin
     ) {
-      cnt.delete()
+      // cnt.delete()
       continue
     }
+    */
 
     // Simple shape classification
     const peri = cv.arcLength(cnt, true)
@@ -84,6 +115,12 @@ export const detectShapes = async (
       label,
     })
 
+    console.log(
+      `-- accepted shape #${i}: ${label} (${rect.x},${rect.y},${rect.width},${
+        rect.height
+      }) area=${areaPercent.toFixed(2)}%`
+    )
+
     cnt.delete()
     approx.delete()
   }
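The net effect of the contour-loop changes: area is now measured by bounding box rather than cv.contourArea (an open hand-drawn stroke can have near-zero contour area while still covering a large region), and the gate widened from 0.5-15% to 5-33% of the image. A standalone sketch of that gate (helper name hypothetical):

const passesAreaGate = (
  rect: { width: number; height: number },
  imageWidth: number,
  imageHeight: number,
  minAreaPercent = 5,
  maxAreaPercent = 33
): boolean => {
  // Percentage of the image covered by the shape's bounding box
  const areaPercent = ((rect.width * rect.height) / (imageWidth * imageHeight)) * 100
  return areaPercent >= minAreaPercent && areaPercent <= maxAreaPercent
}

// A 300x200 box in a 1000x1000 image covers 6%, inside the 5-33% window:
console.log(passesAreaGate({ width: 300, height: 200 }, 1000, 1000)) // true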
@@ -5,22 +5,26 @@ import { getGeminiResponse } from "../ai"
 import result from "../result.json"
 import { detectShapes } from "../opencv"
 
-const categories = ["hand drawn circle", "hand drawn square", "hand drawn arrow"]
+const categories = [
+  "hand drawn circle",
+  "hand drawn square",
+  "hand drawn arrow",
+  "hand drawn triangle",
+  "hand drawn rectangle",
+  "hand drawn polygon",
+]
 const prompts = {
   default: `Detect all of the following objects: ${categories}. The box_2d should be an object with ymin, xmin, ymax, xmax properties normalized to 0-1000.`,
   simple: `Detect the 2d bounding boxes of the following objects: ${categories}.`,
 }
 
 export const action = async (req: Request, params: {}) => {
-  const url = new URL(req.url)
-  const imageUrl = new URL("whiteboard.png", url.origin).toString()
-  const imageResponse = await fetch(imageUrl)
-  const imageBuffer = await imageResponse.arrayBuffer()
-  // const response = await getGeminiResponse(imageBuffer, prompts.default)
+  const imageBuffer = await Bun.file("public/whiteboard.png").arrayBuffer()
+  const response = await getGeminiResponse(imageBuffer, prompts.default)
+  // return { elements: response?.elements || [] }
 
-  const response = await detectShapes(imageBuffer)
-  return { elements: response.elements }
+  // const response = await detectShapes(imageBuffer)
+  return { elements: response!.elements }
 }
 
 export default function Index() {
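Since the default prompt asks Gemini for box_2d coordinates normalized to 0-1000, they have to be scaled back to pixel space before they can be drawn over the whiteboard image. A small helper sketch (name hypothetical):

type Box2D = { ymin: number; xmin: number; ymax: number; xmax: number }

const toPixelRect = (box: Box2D, imageWidth: number, imageHeight: number) => ({
  // Scale each normalized coordinate by the real image dimensions
  x: (box.xmin / 1000) * imageWidth,
  y: (box.ymin / 1000) * imageHeight,
  width: ((box.xmax - box.xmin) / 1000) * imageWidth,
  height: ((box.ymax - box.ymin) / 1000) * imageHeight,
})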
64  packages/whiteboard/src/routes/realtime.tsx  (new file)
@@ -0,0 +1,64 @@
+import { Form, useAction } from "@workshop/nano-remix"
+import { useEffect, useRef } from "hono/jsx"
+import { RealtimeAgent, RealtimeSession } from "@openai/agents/realtime"
+import { ensure } from "@workshop/shared/utils"
+
+export const action = async (request: Request) => {
+  const response = await fetch("https://api.openai.com/v1/realtime/sessions", {
+    method: "POST",
+    headers: {
+      Authorization: `Bearer ${process.env.OPENAI_API_KEY}`,
+      "Content-Type": "application/json",
+    },
+    body: JSON.stringify({
+      model: "gpt-4o-realtime-preview-2025-06-03",
+    }),
+  })
+  const { client_secret } = await response.json()
+
+  return { secret: client_secret?.value }
+}
+
+export default function Voice() {
+  const { data, loading, error } = useAction<typeof action>()
+  const session = useRef<RealtimeSession | undefined>(undefined)
+
+  useEffect(() => {
+    if (!data?.secret) return
+    if (session.current) return
+
+    session.current = createSession()
+    session.current.connect({ apiKey: data.secret })
+  }, [data?.secret])
+
+  return (
+    <div>
+      {error && <p>Error: {error}</p>}
+      <p>Ephemeral Key: {loading ? "Loading..." : data?.secret}</p>
+      <Form name="voiceForm">
+        <button type="submit">Start Voice Session</button>
+      </Form>
+    </div>
+  )
+}
+
+const createSession = () => {
+  const agent = new RealtimeAgent({
+    name: "Assistant",
+    voice: "echo",
+    instructions: `
+      You are Spike, you are helping Corey at the whiteboard. Every question he asks will include a screenshot of the whiteboard. Sometimes his questions will be about the whiteboard, sometimes they will be about other things.
+
+      # Voice Tone
+
+      You have a very quiet voice and a slight accent that is hard to place.
+    `,
+  })
+  const session = new RealtimeSession(agent)
+  session.on("error", (error) => {
+    console.error("Session error:", error)
+  })
+
+  return session
+}
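One caveat with the effect in realtime.tsx: the session is connected but never torn down when the route unmounts. Assuming the realtime session object exposes close() (an assumption about the @openai/agents realtime SDK, not confirmed by this commit), a cleanup sketch could look like:

useEffect(() => {
  if (!data?.secret || session.current) return

  session.current = createSession()
  session.current.connect({ apiKey: data.secret })

  return () => {
    session.current?.close() // assumption: close() releases the realtime connection
    session.current = undefined
  }
}, [data?.secret])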
@@ -43,9 +43,8 @@ export default function Camera() {
     const canvas = canvasRef.current
     const video = videoRef.current
 
-    // Downscale to max 320x240
-    const maxWidth = 320
-    const maxHeight = 240
+    const maxWidth = 1000
+    const maxHeight = 1000
     const aspectRatio = video.videoWidth / video.videoHeight
 
     let newWidth = maxWidth
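The camera hunk only raises the size caps from 320x240 to 1000x1000; the tail of the aspect-preserving resize sits outside the hunk. Reconstructed as a standalone helper (a sketch of the likely shape, not the exact committed code):

const fitWithin = (
  videoWidth: number,
  videoHeight: number,
  maxWidth = 1000,
  maxHeight = 1000
) => {
  const aspectRatio = videoWidth / videoHeight
  // Start from the width cap, then clamp by height if needed
  let newWidth = maxWidth
  let newHeight = maxWidth / aspectRatio
  if (newHeight > maxHeight) {
    newHeight = maxHeight
    newWidth = maxHeight * aspectRatio
  }
  return { width: Math.round(newWidth), height: Math.round(newHeight) }
}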
62  packages/whiteboard/src/routes/voice.tsx  (new file)
@@ -0,0 +1,62 @@
+import { useEffect, useRef, useState } from "hono/jsx"
+import { StreamingResponse } from "../streamingAI"
+
+export default function Voice() {
+  const [audioError, setAudioError] = useState<string>("")
+  const [transcript, setTranscript] = useState<string>("")
+  const [isRecording, setIsRecording] = useState(false)
+  const streamingResponseRef = useRef<StreamingResponse>(null)
+
+  const startRecording = async () => {
+    setAudioError("")
+    setTranscript("")
+    streamingResponseRef.current = new StreamingResponse((error) => setAudioError(error))
+    await streamingResponseRef.current.start()
+    setIsRecording(true)
+  }
+
+  const endRecording = async () => {
+    setIsRecording(false)
+    try {
+      const reader = await streamingResponseRef.current!.stop()
+      const decoder = new TextDecoder()
+
+      while (true) {
+        const { done, value } = await reader.read()
+        if (done) break
+
+        const chunk = decoder.decode(value, { stream: true })
+        setTranscript((prev) => prev + chunk)
+      }
+    } catch (error) {
+      console.error("Error during streaming:", error)
+      setAudioError(`Streaming failed: ${error}`)
+    }
+  }
+
+  useEffect(() => {
+    return () => endRecording()
+  }, [])
+
+  return (
+    <div>
+      {audioError && <p>Audio Error: {audioError}</p>}
+
+      <div>
+        <h3>Audio Recording</h3>
+        <button onClick={isRecording ? endRecording : startRecording}>
+          {isRecording ? "Stop Recording" : "Start Recording"}
+        </button>
+
+        {isRecording && <p>🎤 Recording...</p>}
+      </div>
+
+      {transcript && (
+        <div>
+          <h4>Transcript:</h4>
+          <p>{transcript}</p>
+        </div>
+      )}
+    </div>
+  )
+}
@@ -1,4 +1,8 @@
 import { nanoRemix } from "@workshop/nano-remix"
+import { OpenAI } from "openai"
+import { Agent, run, type AgentInputItem } from "@openai/agents"
+import fs from "node:fs"
+import { getErrorMessage } from "@workshop/shared/errors"
 
 Bun.serve({
   port: 3000,
@@ -8,8 +12,74 @@ Bun.serve({
     cert: Bun.file("certs/cert.pem"),
   },
   routes: {
+    "/api/streamResponse": async (req) => {
+      try {
+        return streamResponse(req)
+      } catch (error) {
+        console.error("Transcription error:", error)
+        return new Response(`Transcription failed: ${getErrorMessage(error)}`, { status: 500 })
+      }
+    },
     "/*": (req) => {
       return nanoRemix(req)
     },
   },
 })
+
+const openai = new OpenAI({
+  apiKey: process.env.OPENAI_API_KEY,
+})
+
+const streamResponse = async (req: Request) => {
+  const transcript = await transcribeAudio(req)
+
+  const agent = new Agent({
+    name: "Whiteboard Assistant",
+    model: "gpt-4o",
+    instructions: "You are a helpful assistant that talks about a whiteboard.",
+  })
+
+  const imagePath = "public/whiteboard.png"
+  const base64Image = fs.readFileSync(imagePath, "base64")
+  const input: AgentInputItem[] = [
+    {
+      role: "user",
+      type: "message",
+      content: [
+        { type: "input_image", image: `data:image/png;base64,${base64Image}` },
+        { type: "input_text", text: transcript },
+      ],
+    },
+  ]
+
+  const result = await run(agent, input, { stream: true })
+  const readableStream = result.toTextStream() as any // This DOES work, but TypeScript is a little confused, so I cast it to any
+
+  return new Response(readableStream, {
+    headers: {
+      "Content-Type": "text/plain",
+      "Cache-Control": "no-cache",
+      Connection: "keep-alive",
+    },
+  })
+}
+
+const transcribeAudio = async (req: Request) => {
+  if (req.method !== "POST") {
+    throw new Error("Method not allowed, only POST is supported")
+  } else if (!req.body) {
+    throw new Error("No audio data provided")
+  }
+
+  const response = new Response(req.body)
+  const audioBlob = await response.blob()
+  const audioFile = new File([audioBlob], "audio.webm", { type: "audio/webm" })
+
+  const transcript = await openai.audio.transcriptions.create({
+    file: audioFile,
+    model: "gpt-4o-mini-transcribe",
+    response_format: "text",
+  })
+
+  return transcript
+}
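A subtlety in the new /api/streamResponse route: `return streamResponse(req)` inside the try block returns the promise without awaiting it, so a rejection from streamResponse bypasses the catch and the 500 fallback never fires. Awaiting the call keeps the error path working (this sketch reuses the names from the diff above):

"/api/streamResponse": async (req) => {
  try {
    // `await` matters here: returning the bare promise would let
    // rejections escape this try/catch entirely.
    return await streamResponse(req)
  } catch (error) {
    console.error("Transcription error:", error)
    return new Response(`Transcription failed: ${getErrorMessage(error)}`, { status: 500 })
  }
},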
78  packages/whiteboard/src/streamingAI.ts  (new file)
@@ -0,0 +1,78 @@
+import { getErrorMessage } from "@workshop/shared/errors"
+
+export class StreamingResponse {
+  private mediaRecorder?: MediaRecorder
+  private mediaStream?: MediaStream
+  private audioChunks: Blob[] = []
+  private isRecording = false
+
+  constructor(private onError: (error: string) => void) {}
+
+  async start() {
+    try {
+      if (this.isRecording) return
+
+      this.mediaStream = await navigator.mediaDevices.getUserMedia({
+        audio: { echoCancellation: true, noiseSuppression: true, sampleRate: 16000 },
+      })
+
+      this.mediaRecorder = new MediaRecorder(this.mediaStream, { mimeType: "audio/webm" })
+      this.audioChunks = []
+
+      this.mediaRecorder.addEventListener("dataavailable", (event) => {
+        if (event.data.size === 0) return
+        console.log("Audio chunk received:", event.data.size, "bytes")
+        this.audioChunks.push(event.data)
+      })
+
+      this.mediaRecorder.addEventListener("stop", async () => {
+        this.mediaStream?.getTracks().forEach((track) => track.stop())
+        this.isRecording = false
+      })
+
+      this.mediaRecorder.start(1000)
+      this.isRecording = true
+    } catch (error) {
+      this.mediaRecorder?.stop()
+
+      console.error("Error starting recording:", error)
+      this.onError(`Failed to start recording: ${getErrorMessage(error)}`)
+    }
+  }
+
+  async stop() {
+    return new Promise<ReadableStreamDefaultReader<Uint8Array>>((resolve, reject) => {
+      if (!this.mediaRecorder || !this.isRecording) {
+        reject("No media recorder is active")
+        return
+      }
+
+      this.mediaRecorder.addEventListener("stop", async () => {
+        try {
+          const audioBlob = new Blob(this.audioChunks, { type: "audio/webm" })
+          const stream = await this.streamResponse(audioBlob)
+          resolve(stream)
+        } catch (error) {
+          reject(`Failed to process audio stream: ${getErrorMessage(error)}`)
+        }
+      })
+
+      this.mediaRecorder.stop()
+    })
+  }
+
+  private async streamResponse(audioBlob: Blob) {
+    const response = await fetch("/api/streamResponse", { method: "POST", body: audioBlob })
+
+    if (!response.ok) {
+      const errorText = await response.text()
+      throw new Error(`Server error: ${response.status} - ${errorText}`)
+    }
+
+    return response.body!.getReader()
+  }
+
+  getIsRecording() {
+    return this.isRecording
+  }
+}
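For reference, StreamingResponse is consumed in voice.tsx above; distilled to its essentials, the usage pattern is:

const recorder = new StreamingResponse((error) => console.error("Audio error:", error))
await recorder.start() // requests the mic and starts buffering 1s webm chunks

// ...later, when the user stops talking...
const reader = await recorder.stop() // uploads the audio, returns the reply stream
const decoder = new TextDecoder()
while (true) {
  const { done, value } = await reader.read()
  if (done) break
  console.log(decoder.decode(value, { stream: true })) // streamed agent text
}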