Commit 0704e01 (verified) by freddyaboulton · 1 Parent(s): 3838cf4

Upload folder using huggingface_hub

Files changed (4):
  1. README.md +8 -5
  2. app.py +146 -0
  3. index.html +222 -0
  4. requirements.txt +4 -0
README.md CHANGED
@@ -1,12 +1,15 @@
 ---
-title: Talk To Openai
-emoji: 📉
-colorFrom: blue
-colorTo: gray
+title: Talk to OpenAI
+emoji: 🗣️
+colorFrom: purple
+colorTo: red
 sdk: gradio
 sdk_version: 5.16.0
 app_file: app.py
 pinned: false
+license: mit
+short_description: Talk to OpenAI using their multimodal API
+tags: [webrtc, websocket, gradio, secret|TWILIO_ACCOUNT_SID, secret|TWILIO_AUTH_TOKEN, secret|OPENAI_API_KEY]
 ---
 
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
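
Note: the `secret|…` entries in `tags` reference the secrets this Space expects (the values live in the Space settings, not in the README). For local runs, `app.py` calls `load_dotenv()`, so the same values can be supplied in a `.env` file — a sketch, assuming the variable names match the tags above (placeholders, not real credentials):

# .env (local development only — never commit real keys)
OPENAI_API_KEY=sk-...          # read implicitly by openai.AsyncOpenAI()
TWILIO_ACCOUNT_SID=AC...       # read by get_twilio_turn_credentials()
TWILIO_AUTH_TOKEN=...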
app.py ADDED
@@ -0,0 +1,146 @@
+import asyncio
+import base64
+from pathlib import Path
+
+import gradio as gr
+from gradio.utils import get_space
+import numpy as np
+import json
+import openai
+from dotenv import load_dotenv
+from fastrtc import (
+    AdditionalOutputs,
+    AsyncStreamHandler,
+    Stream,
+    get_twilio_turn_credentials,
+)
+from fastapi.responses import HTMLResponse, StreamingResponse
+
+from openai.types.beta.realtime import ResponseAudioTranscriptDoneEvent
+
+load_dotenv()
+
+cur_dir = Path(__file__).parent
+
+SAMPLE_RATE = 24000
+
+
+class OpenAIHandler(AsyncStreamHandler):
+    def __init__(
+        self,
+    ) -> None:
+        super().__init__(
+            expected_layout="mono",
+            output_sample_rate=SAMPLE_RATE,
+            output_frame_size=480,
+            input_sample_rate=SAMPLE_RATE,
+        )
+        self.connection = None
+        self.connected = asyncio.Event()
+        self.output_queue = asyncio.Queue()
+
+    def copy(self):
+        return OpenAIHandler()
+
+    async def _initialize_connection(
+        self,
+    ):
+        """Connect to realtime API. Run forever in separate thread to keep connection open."""
+        self.client = openai.AsyncOpenAI()
+        async with self.client.beta.realtime.connect(
+            model="gpt-4o-mini-realtime-preview-2024-12-17"
+        ) as conn:
+            await conn.session.update(
+                session={"turn_detection": {"type": "server_vad"}}
+            )
+            self.connection = conn
+            self.connected.set()
+            async for event in self.connection:
+                if event.type == "response.audio_transcript.done":
+                    await self.output_queue.put(AdditionalOutputs(event))
+                if event.type == "response.audio.delta":
+                    await self.output_queue.put(
+                        (
+                            self.output_sample_rate,
+                            np.frombuffer(
+                                base64.b64decode(event.delta), dtype=np.int16
+                            ).reshape(1, -1),
+                        ),
+                    )
+
+    async def receive(self, frame: tuple[int, np.ndarray]) -> None:
+        if not self.connection:
+            await self.fetch_args()
+            asyncio.create_task(self._initialize_connection())
+            await self.connected.wait()
+        try:
+            _, array = frame
+            array = array.squeeze()
+            audio_message = base64.b64encode(array.tobytes()).decode("utf-8")
+            await self.connection.input_audio_buffer.append(audio=audio_message)  # type: ignore
+        except Exception as e:
+            # print traceback
+            print(f"Error in receive: {str(e)}")
+            import traceback
+
+            traceback.print_exc()
+
+    async def emit(self) -> tuple[int, np.ndarray] | AdditionalOutputs | None:
+        if not self.connection:
+            return None
+        return await self.output_queue.get()
+
+    def reset_state(self):
+        """Reset connection state for new recording session"""
+        self.connection = None
+        self.args_set.clear()
+        self.connected.clear()
+
+    async def shutdown(self) -> None:
+        if self.connection:
+            await self.connection.close()
+            self.reset_state()
+
+
+def update_chatbot(chatbot: list[dict], response: ResponseAudioTranscriptDoneEvent):
+    chatbot.append({"role": "assistant", "content": response.transcript})
+    return chatbot
+
+
+chatbot = gr.Chatbot(type="messages")
+latest_message = gr.Textbox(type="text", visible=False)
+stream = Stream(
+    OpenAIHandler(),
+    mode="send-receive",
+    modality="audio",
+    additional_inputs=[chatbot],
+    additional_outputs=[chatbot],
+    additional_outputs_handler=update_chatbot,
+    rtc_configuration=get_twilio_turn_credentials() if get_space() else None,
+)
+
+
+@stream.get("/")
+async def _():
+    rtc_config = get_twilio_turn_credentials() if get_space() else None
+    html_content = (cur_dir / "index.html").read_text()
+    html_content = html_content.replace("__RTC_CONFIGURATION__", json.dumps(rtc_config))
+    return HTMLResponse(content=html_content)
+
+
+@stream.get("/outputs")
+def _(webrtc_id: str):
+    async def output_stream():
+        import json
+
+        async for output in stream.output_stream(webrtc_id):
+            s = json.dumps({"role": "assistant", "content": output.args[0].transcript})
+            yield f"event: output\ndata: {s}\n\n"
+
+    return StreamingResponse(output_stream(), media_type="text/event-stream")
+
+
+if __name__ == "__main__":
+    import uvicorn
+
+    uvicorn.run(stream, host="0.0.0.0", port=7860)
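
Note: the handler above moves audio in both directions as base64-encoded 16-bit mono PCM at 24 kHz. A minimal standalone sketch of that round-trip (synthetic frame, no API connection), mirroring the tobytes/np.frombuffer pattern in receive() and _initialize_connection():

import base64

import numpy as np

SAMPLE_RATE = 24000

# One second of silence, shaped (1, n) like the frames fastrtc delivers.
frame = np.zeros((1, SAMPLE_RATE), dtype=np.int16)

# Outbound (receive): flatten the frame and base64-encode its raw bytes,
# the format input_audio_buffer.append(audio=...) expects.
payload = base64.b64encode(frame.squeeze().tobytes()).decode("utf-8")

# Inbound (_initialize_connection): decode a response.audio.delta payload
# back into an int16 array shaped (1, n) for playback.
decoded = np.frombuffer(base64.b64decode(payload), dtype=np.int16).reshape(1, -1)

assert np.array_equal(frame, decoded)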
index.html ADDED
@@ -0,0 +1,222 @@
+<!DOCTYPE html>
+<html lang="en">
+
+<head>
+    <meta charset="UTF-8">
+    <meta name="viewport" content="width=device-width, initial-scale=1.0">
+    <title>OpenAI Real-Time Chat</title>
+    <style>
+        body {
+            font-family: "SF Pro Display", -apple-system, BlinkMacSystemFont, sans-serif;
+            background-color: #0a0a0a;
+            color: #ffffff;
+            margin: 0;
+            padding: 20px;
+            height: 100vh;
+            box-sizing: border-box;
+        }
+
+        .container {
+            max-width: 800px;
+            margin: 0 auto;
+            height: calc(100% - 100px);
+        }
+
+        .logo {
+            text-align: center;
+            margin-bottom: 40px;
+        }
+
+        .chat-container {
+            border: 1px solid #333;
+            padding: 20px;
+            height: 90%;
+            box-sizing: border-box;
+            display: flex;
+            flex-direction: column;
+        }
+
+        .chat-messages {
+            flex-grow: 1;
+            overflow-y: auto;
+            margin-bottom: 20px;
+            padding: 10px;
+        }
+
+        .message {
+            margin-bottom: 20px;
+            padding: 12px;
+            border-radius: 4px;
+            font-size: 16px;
+            line-height: 1.5;
+        }
+
+        .message.user {
+            background-color: #1a1a1a;
+            margin-left: 20%;
+        }
+
+        .message.assistant {
+            background-color: #262626;
+            margin-right: 20%;
+        }
+
+        .controls {
+            text-align: center;
+            margin-top: 20px;
+        }
+
+        button {
+            background-color: transparent;
+            color: #ffffff;
+            border: 1px solid #ffffff;
+            padding: 12px 24px;
+            font-family: inherit;
+            font-size: 16px;
+            cursor: pointer;
+            transition: all 0.3s;
+            text-transform: uppercase;
+            letter-spacing: 1px;
+        }
+
+        button:hover {
+            background-color: #ffffff;
+            color: #0a0a0a;
+        }
+
+        #audio-output {
+            display: none;
+        }
+    </style>
+</head>
+
+<body>
+    <div class="container">
+        <div class="logo">
+            <h1>OpenAI Real-Time Chat</h1>
+        </div>
+        <div class="chat-container">
+            <div class="chat-messages" id="chat-messages"></div>
+        </div>
+        <div class="controls">
+            <button id="start-button">Start Conversation</button>
+        </div>
+    </div>
+    <audio id="audio-output"></audio>
+
+    <script>
+        let peerConnection;
+        let webrtc_id;
+        const audioOutput = document.getElementById('audio-output');
+        const startButton = document.getElementById('start-button');
+        const chatMessages = document.getElementById('chat-messages');
+
+        async function setupWebRTC() {
+            const config = __RTC_CONFIGURATION__;
+
+            peerConnection = new RTCPeerConnection(config);
+
+            try {
+                const stream = await navigator.mediaDevices.getUserMedia({
+                    audio: true
+                });
+
+                stream.getTracks().forEach(track => {
+                    peerConnection.addTrack(track, stream);
+                });
+
+                peerConnection.addEventListener('track', (evt) => {
+                    if (audioOutput.srcObject !== evt.streams[0]) {
+                        audioOutput.srcObject = evt.streams[0];
+                        audioOutput.play();
+                    }
+                });
+
+                const dataChannel = peerConnection.createDataChannel('text');
+
+                const offer = await peerConnection.createOffer();
+                await peerConnection.setLocalDescription(offer);
+
+                await new Promise((resolve) => {
+                    if (peerConnection.iceGatheringState === "complete") {
+                        resolve();
+                    } else {
+                        const checkState = () => {
+                            if (peerConnection.iceGatheringState === "complete") {
+                                peerConnection.removeEventListener("icegatheringstatechange", checkState);
+                                resolve();
+                            }
+                        };
+                        peerConnection.addEventListener("icegatheringstatechange", checkState);
+                    }
+                });
+
+                webrtc_id = Math.random().toString(36).substring(7);
+
+                const response = await fetch('/webrtc/offer', {
+                    method: 'POST',
+                    headers: { 'Content-Type': 'application/json' },
+                    body: JSON.stringify({
+                        sdp: offer.sdp,
+                        type: offer.type,
+                        webrtc_id: webrtc_id
+                    })
+                });
+
+                const serverResponse = await response.json();
+                await peerConnection.setRemoteDescription(serverResponse);
+
+                const eventSource = new EventSource('/outputs?webrtc_id=' + webrtc_id);
+                eventSource.addEventListener("output", (event) => {
+                    const eventJson = JSON.parse(event.data);
+                    addMessage("assistant", eventJson.content);
+                });
+            } catch (err) {
+                console.error('Error setting up WebRTC:', err);
+            }
+        }
+
+        function addMessage(role, content) {
+            const messageDiv = document.createElement('div');
+            messageDiv.classList.add('message', role);
+            messageDiv.textContent = content;
+            chatMessages.appendChild(messageDiv);
+            chatMessages.scrollTop = chatMessages.scrollHeight;
+        }
+
+        function stop() {
+            if (peerConnection) {
+                if (peerConnection.getTransceivers) {
+                    peerConnection.getTransceivers().forEach(transceiver => {
+                        if (transceiver.stop) {
+                            transceiver.stop();
+                        }
+                    });
+                }
+
+                if (peerConnection.getSenders) {
+                    peerConnection.getSenders().forEach(sender => {
+                        if (sender.track && sender.track.stop) sender.track.stop();
+                    });
+                }
+
+                setTimeout(() => {
+                    peerConnection.close();
+                }, 500);
+            }
+        }
+
+        startButton.addEventListener('click', () => {
+            if (startButton.textContent === 'Start Conversation') {
+                setupWebRTC();
+                startButton.textContent = 'Stop Conversation';
+            } else {
+                stop();
+                startButton.textContent = 'Start Conversation';
+            }
+        });
+    </script>
+</body>
+
+</html>
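
Note: the page consumes assistant transcripts from the /outputs route via EventSource. Any SSE-capable client works; a hypothetical Python sketch using httpx (not in requirements.txt), assuming a webrtc_id matching the one posted to /webrtc/offer:

import json

import httpx

webrtc_id = "abc123"  # hypothetical: must match the id sent in the offer

# Read the event stream and print each transcript as it arrives.
url = f"http://localhost:7860/outputs?webrtc_id={webrtc_id}"
with httpx.stream("GET", url, timeout=None) as resp:
    for line in resp.iter_lines():
        if line.startswith("data: "):
            message = json.loads(line[len("data: "):])
            print(f"{message['role']}: {message['content']}")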
requirements.txt ADDED
@@ -0,0 +1,4 @@
+fastrtc[vad]==0.0.32rc1
+openai
+twilio
+python-dotenv
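
Note: to reproduce the Space locally (with the secrets above exported or placed in .env), something like:

pip install -r requirements.txt
python app.py   # app.py starts uvicorn on http://localhost:7860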