Voice Assistant
Build voice-enabled applications on the Jetson Orin Nano Field Kit
Introduction
The Field Kit's audio capabilities make it well suited to building voice assistants, speech recognition systems, and audio processing applications. This guide covers everything from basic audio capture to building a complete voice interface.
Audio Hardware Setup
Waveshare Audio Card
If your Field Kit includes the Waveshare audio expansion card:
Verify audio devices:
# List audio playback devices
aplay -l
# List audio capture devices
arecord -l
# Test audio output
speaker-test -t wav -c 2
# Test audio input
arecord -d 5 -f cd /tmp/test.wav
aplay /tmp/test.wav
USB Audio Devices
For USB microphones or audio interfaces:
# List USB audio devices
lsusb | grep -i audio
# Check if device is recognized
arecord -l
Basic Audio Capture
Using sounddevice (ALSA backend)
import sounddevice as sd
import numpy as np
# Record audio
duration = 5 # seconds
sample_rate = 44100
print("Recording...")
audio = sd.rec(int(duration * sample_rate), samplerate=sample_rate, channels=2)
sd.wait()
print("Recording complete")
# Save to file
import scipy.io.wavfile as wavfile
wavfile.write('/tmp/recording.wav', sample_rate, audio)
Using PyAudio
import pyaudio
import wave
CHUNK = 1024
FORMAT = pyaudio.paInt16
CHANNELS = 2
RATE = 44100
RECORD_SECONDS = 5
OUTPUT_FILENAME = "/tmp/output.wav"
p = pyaudio.PyAudio()
stream = p.open(format=FORMAT,
                channels=CHANNELS,
                rate=RATE,
                input=True,
                frames_per_buffer=CHUNK)
print("Recording...")
frames = []
for i in range(0, int(RATE / CHUNK * RECORD_SECONDS)):
    data = stream.read(CHUNK)
    frames.append(data)
print("Recording complete")
stream.stop_stream()
stream.close()
p.terminate()
# Save to file
wf = wave.open(OUTPUT_FILENAME, 'wb')
wf.setnchannels(CHANNELS)
wf.setsampwidth(p.get_sample_size(FORMAT))
wf.setframerate(RATE)
wf.writeframes(b''.join(frames))
wf.close()
Speech Recognition
Using whisper.cpp (Pre-installed)
The Field Kit includes whisper.cpp with CUDA support pre-installed:
# Check if whisper.cpp is installed
which whisper-cli
# Download a model (if not already downloaded)
cd ~/Workspace/whisper.cpp
bash ./models/download-ggml-model.sh base.en
# Run transcription
whisper-cli -m models/ggml-base.en.bin -f /path/to/audio.wav
Using Faster Whisper (Python)
The voice assistant uses faster-whisper for real-time transcription:
pip3 install faster-whisper
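A minimal transcription sketch with faster-whisper (the model size, device, and audio path here are illustrative assumptions, not the Field Kit's configuration):
# Sketch only: model size, device, and file path are assumptions
from faster_whisper import WhisperModel

# "base.en" is a reasonable starting point on the Orin Nano; use device="cpu" if CUDA is unavailable
model = WhisperModel("base.en", device="cuda", compute_type="float16")

segments, info = model.transcribe("/tmp/recording.wav", beam_size=5)
print(f"Detected language: {info.language} (p={info.language_probability:.2f})")
for segment in segments:
    print(f"[{segment.start:.2f}s -> {segment.end:.2f}s] {segment.text}")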
Using OpenAI Whisper (Python)
Installation:
pip3 install openai-whisper
Basic usage:
import whisper
# Load model (choose based on your needs: tiny, base, small, medium, large)
model = whisper.load_model("base")
# Transcribe audio file
result = model.transcribe("/tmp/recording.wav")
print(result["text"])
# Transcribe with options
result = model.transcribe(
    "/tmp/recording.wav",
    language="en",
    task="transcribe",
    fp16=False  # Use fp32 on Jetson for better compatibility
)
Real-time transcription:
import whisper
import sounddevice as sd
import numpy as np
import queue
import threading
model = whisper.load_model("base")
audio_queue = queue.Queue()
def audio_callback(indata, frames, time, status):
    audio_queue.put(indata.copy())

def transcribe_audio():
    while True:
        if not audio_queue.empty():
            # Collect audio chunks
            audio_data = []
            for _ in range(10):  # Collect ~1 second of audio
                try:
                    chunk = audio_queue.get_nowait()
                    audio_data.append(chunk)
                except queue.Empty:
                    break
            if audio_data:
                # Convert to a 1-D float32 array, which Whisper expects
                audio_array = np.concatenate(audio_data).flatten().astype(np.float32)
                # Transcribe
                result = model.transcribe(audio_array, fp16=False)
                print(f"Transcription: {result['text']}")

# Start transcription thread
transcribe_thread = threading.Thread(target=transcribe_audio, daemon=True)
transcribe_thread.start()

# Start audio stream
with sd.InputStream(callback=audio_callback, channels=1, samplerate=16000):
    print("Listening... Press Ctrl+C to stop")
    try:
        while True:
            sd.sleep(100)  # avoid a busy-wait loop
    except KeyboardInterrupt:
        print("Stopped")
Using Vosk (Offline)
Installation:
pip3 install vosk
Usage:
import json
import vosk
import sounddevice as sd
import queue
# Load model (download from https://alphacephei.com/vosk/models)
model = vosk.Model("path/to/vosk-model-en-us-0.22")
recognizer = vosk.KaldiRecognizer(model, 16000)
audio_queue = queue.Queue()
def audio_callback(indata, frames, time, status):
    audio_queue.put(bytes(indata))
with sd.RawInputStream(samplerate=16000, blocksize=8000, dtype='int16', channels=1, callback=audio_callback):
print("Listening...")
while True:
data = audio_queue.get()
if recognizer.AcceptWaveform(data):
result = json.loads(recognizer.Result())
if result['text']:
print(f"Recognized: {result['text']}")Text-to-Speech
Using Piper TTS (Pre-installed)
The Field Kit voice assistant uses Piper TTS for high-quality offline text-to-speech:
# Piper TTS is included in the voice assistant
# Models are automatically downloaded on first use
Python Usage:
from piper import PiperVoice
from piper.download import ensure_voice_exists, find_voice
# Download voice model (one-time)
ensure_voice_exists("en_GB-alba-medium", ["./models"])
# Initialize voice
voice = PiperVoice.load(find_voice("en_GB-alba-medium", ["./models"]))
# Generate speech
with open("output.wav", "wb") as f:
    voice.synthesize("Hello, this is the Jetson Field Kit speaking", f)
Using pyttsx3
Installation:
pip3 install pyttsx3
Usage:
import pyttsx3
engine = pyttsx3.init()
# Set properties
engine.setProperty('rate', 150) # Speed
engine.setProperty('volume', 0.9) # Volume
# Speak
engine.say("Hello, this is the Jetson Field Kit speaking")
engine.runAndWait()
Using gTTS (Google Text-to-Speech)
Installation:
pip3 install gtts
Usage:
from gtts import gTTS
import os
text = "Hello, this is the Jetson Field Kit"
tts = gTTS(text=text, lang='en', slow=False)
tts.save("/tmp/tts_output.mp3")
os.system("mpg123 /tmp/tts_output.mp3") # Requires mpg123: sudo apt install mpg123Using Coqui TTS (High Quality)
Installation:
pip3 install TTS
Usage:
from TTS.api import TTS
# Initialize TTS
tts = TTS(model_name="tts_models/en/ljspeech/tacotron2-DDC", gpu=True)
# Generate speech
tts.tts_to_file(text="Hello, this is the Jetson Field Kit", file_path="/tmp/output.wav")
Building a Voice Assistant
Field Kit Voice Assistant
The Field Kit includes a pre-built voice assistant using LiveKit:
# Navigate to voice assistant directory
cd jetson-orin-nano-field-kit/apps/voice-assistant
# Set up virtual environment
python3 -m venv venv
source venv/bin/activate
# Install dependencies
pip install -r requirements.txt
# Configure environment
cp .env.example .env
nano .env # Add your API keys and configuration
# Run the voice assistant
python main.py dev
Features:
- Wake word detection ("nano")
- Tool calling for system commands
- Safe Linux command execution with whitelisting (see the sketch after this list)
- Integration with Kiwix offline knowledge base
- Vision plugin support for camera queries
- LLM plugin support (local Ollama or cloud APIs)
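To illustrate the whitelisting idea mentioned above (this is only a sketch of the pattern, not the Field Kit's actual implementation; the allowed commands and helper name are assumptions):
# Illustrative only: a minimal command whitelist, not the Field Kit's implementation
import shlex
import subprocess

ALLOWED_COMMANDS = {"uptime", "date", "df", "free"}  # hypothetical whitelist

def run_safe_command(command_line: str) -> str:
    """Run a shell command only if its executable is on the whitelist."""
    args = shlex.split(command_line)
    if not args or args[0] not in ALLOWED_COMMANDS:
        return f"Command '{args[0] if args else ''}' is not allowed."
    result = subprocess.run(args, capture_output=True, text=True, timeout=10)
    return result.stdout or result.stderr

print(run_safe_command("df -h"))
print(run_safe_command("rm -rf /tmp/foo"))  # rejected by the whitelist
A real implementation would also restrict arguments and run commands with reduced privileges.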
Simple Voice Assistant Framework
import whisper
import pyttsx3
import sounddevice as sd
import numpy as np
import queue
import threading
class SimpleVoiceAssistant:
    def __init__(self):
        print("Loading Whisper model...")
        self.whisper_model = whisper.load_model("base")
        self.tts_engine = pyttsx3.init()
        self.tts_engine.setProperty('rate', 150)
        self.audio_queue = queue.Queue()
        self.running = False

    def audio_callback(self, indata, frames, time, status):
        if self.running:
            self.audio_queue.put(indata.copy())

    def listen(self, duration=3):
        """Listen for specified duration and return transcription"""
        audio_data = []
        sample_rate = 16000
        print("Listening...")
        with sd.InputStream(callback=self.audio_callback,
                            channels=1,
                            samplerate=sample_rate):
            sd.sleep(int(duration * 1000))
        # Collect all queued audio
        while not self.audio_queue.empty():
            audio_data.append(self.audio_queue.get())
        if not audio_data:
            return None
        # Convert to a 1-D float32 array for Whisper
        audio_array = np.concatenate(audio_data).flatten().astype(np.float32)
        # Transcribe
        result = self.whisper_model.transcribe(audio_array, fp16=False)
        return result['text']

    def speak(self, text):
        """Speak the given text"""
        print(f"Speaking: {text}")
        self.tts_engine.say(text)
        self.tts_engine.runAndWait()

    def process_command(self, command):
        """Process voice commands"""
        command_lower = command.lower()
        if "hello" in command_lower or "hi" in command_lower:
            return "Hello! How can I help you?"
        elif "time" in command_lower:
            from datetime import datetime
            return f"The current time is {datetime.now().strftime('%H:%M')}"
        elif "weather" in command_lower:
            return "I don't have weather information configured yet."
        else:
            return f"I heard you say: {command}"

    def run(self):
        """Main loop"""
        self.running = True
        print("Voice Assistant started. Say 'quit' to exit.")
        try:
            while True:
                # Listen for command
                command = self.listen(duration=3)
                if command:
                    print(f"You said: {command}")
                    if "quit" in command.lower() or "exit" in command.lower():
                        self.speak("Goodbye!")
                        break
                    # Process command
                    response = self.process_command(command)
                    self.speak(response)
        except KeyboardInterrupt:
            print("\nStopping...")
        finally:
            self.running = False

# Usage
assistant = SimpleVoiceAssistant()
assistant.run()
Wake Word Detection
Using Porcupine (PicoVoice)
Installation:
pip3 install pvporcupine
Usage:
import pvporcupine
import pyaudio
import struct
# Initialize Porcupine (requires access key from Picovoice)
keywords = ['picovoice', 'hey pico']
porcupine = pvporcupine.create(
    access_key='YOUR_ACCESS_KEY',
    keywords=keywords
)
pa = pyaudio.PyAudio()
audio_stream = pa.open(
    rate=porcupine.sample_rate,
    channels=1,
    format=pyaudio.paInt16,
    input=True,
    frames_per_buffer=porcupine.frame_length
)
print("Listening for wake word...")
try:
    while True:
        pcm = audio_stream.read(porcupine.frame_length)
        pcm = struct.unpack_from("h" * porcupine.frame_length, pcm)
        keyword_index = porcupine.process(pcm)
        if keyword_index >= 0:
            print(f"Wake word detected: {keywords[keyword_index]}")
            # Trigger your voice assistant here
except KeyboardInterrupt:
    print("Stopping...")
finally:
    audio_stream.close()
    pa.terminate()
    porcupine.delete()
Audio Processing
Noise Reduction
import numpy as np
import sounddevice as sd
from scipy import signal
def reduce_noise(audio, sample_rate=16000):
    # Simple high-pass filter to remove low-frequency noise
    sos = signal.butter(10, 300, 'hp', fs=sample_rate, output='sos')
    filtered = signal.sosfilt(sos, audio)
    return filtered
# Usage
duration = 5
sample_rate = 16000
print("Recording...")
audio = sd.rec(int(duration * sample_rate), samplerate=sample_rate, channels=1)
sd.wait()
# Reduce noise
clean_audio = reduce_noise(audio.flatten(), sample_rate)
# Play cleaned audio
sd.play(clean_audio, sample_rate)
sd.wait()
Voice Activity Detection
import numpy as np
import sounddevice as sd
def detect_voice_activity(audio, threshold=0.01):
"""Simple energy-based VAD"""
energy = np.mean(audio ** 2)
return energy > threshold
# Usage
sample_rate = 16000
chunk_duration = 0.5 # seconds
chunk_size = int(sample_rate * chunk_duration)
print("Listening for voice...")
while True:
    chunk = sd.rec(chunk_size, samplerate=sample_rate, channels=1)
    sd.wait()
    if detect_voice_activity(chunk):
        print("Voice detected!")
        # Process the audio chunk
Integration with LLMs
See the Working with LLMs guide for integrating voice assistants with language models for more sophisticated conversations.
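As a minimal illustration, a transcription can be forwarded to a local Ollama server over its REST API (the endpoint, model name, and helper below are assumptions; adjust to your setup):
# Illustrative sketch: send a transcription to a local Ollama server
# (assumes Ollama is running on localhost:11434 and the model has been pulled)
import requests

def ask_llm(prompt, model="llama3.2"):
    response = requests.post(
        "http://localhost:11434/api/generate",
        json={"model": model, "prompt": prompt, "stream": False},
        timeout=120,
    )
    response.raise_for_status()
    return response.json()["response"]

transcription = "What is the capital of France?"  # output of your speech-to-text step
print(ask_llm(transcription))
The returned text can then be passed to any of the text-to-speech options above.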
Performance Tips
- Use appropriate model sizes: Smaller Whisper models (tiny, base) are faster but less accurate
- Batch processing: Process multiple audio chunks together when possible
- GPU acceleration: Use CUDA-enabled models when available (see the sketch after this list)
- Audio buffering: Use queues to prevent audio dropouts
- Power mode: Ensure Jetson is in appropriate power mode (MAXN for best performance)
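For the model-size and GPU points above, a brief sketch of the relevant options (the specific models and settings are illustrative, not prescribed by the Field Kit):
# Illustrative model-size / device choices; tune for your accuracy and latency needs
import whisper
from faster_whisper import WhisperModel

# openai-whisper: smaller models ("tiny", "base") are faster; device="cuda" uses the Orin GPU
model = whisper.load_model("base", device="cuda")

# faster-whisper: float16 on CUDA is usually a good speed/accuracy trade-off on Jetson
fast_model = WhisperModel("base.en", device="cuda", compute_type="float16")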
Next Steps
- Integrate with LLMs for smarter conversations
- Add Computer Vision for multimodal interactions
- Check Troubleshooting for audio issues
- Review Good Guidance for optimization