-
Bring advanced speech-to-text to your app with SpeechAnalyzer
Discover the new SpeechAnalyzer API for speech to text. We'll learn about the Swift API and its capabilities, which power features in Notes, Voice Memos, Journal, and more. We'll dive into details about how speech to text works and how SpeechAnalyzer and SpeechTranscriber can enable you to create exciting, performant features. And you'll learn how to incorporate SpeechAnalyzer and live transcription into your app with a code-along.
Chapters
- 0:00 - Introduction
- 2:41 - SpeechAnalyzer API
- 7:03 - SpeechTranscriber model
- 9:06 - Build a speech-to-text feature
Resources
Related Videos
WWDC23
-
Search this video…
-
-
5:21 - Transcribe a file
// Set up the transcriber and read its results asynchronously,
// concatenating them into a single transcript.
let transcriber = SpeechTranscriber(locale: locale, preset: .offlineTranscription)
async let transcriptionFuture = try transcriber.results
    .reduce("") { partial, result in partial + result.text }

let analyzer = SpeechAnalyzer(modules: [transcriber])

// Analyze the file; finalize through the last processed sample on success,
// otherwise cancel the session immediately.
if let lastSample = try await analyzer.analyzeSequence(from: file) {
    try await analyzer.finalizeAndFinish(through: lastSample)
} else {
    await analyzer.cancelAndFinishNow()
}

return try await transcriptionFuture
11:02 - Speech Transcriber setup (volatile results + timestamps)
/// Configures the transcriber module for live use: volatile (in-progress)
/// results plus audio time-range attributes on the returned attributed text.
func setUpTranscriber() async throws {
    transcriber = SpeechTranscriber(locale: Locale.current,
                                    transcriptionOptions: [],
                                    reportingOptions: [.volatileResults],
                                    attributeOptions: [.audioTimeRange])
}
11:47 - Speech Transcriber setup (volatile results, no timestamps)
// transcriber = SpeechTranscriber(locale: Locale.current, preset: .progressiveLiveTranscription) -
11:54 - Set up SpeechAnalyzer
/// Creates the transcriber module and attaches it to a new SpeechAnalyzer.
func setUpTranscriber() async throws {
    transcriber = SpeechTranscriber(locale: Locale.current,
                                    transcriptionOptions: [],
                                    reportingOptions: [.volatileResults],
                                    attributeOptions: [.audioTimeRange])
    // The stored property is optional; bail out if construction failed.
    guard let transcriber else {
        throw TranscriptionError.failedToSetupRecognitionStream
    }
    analyzer = SpeechAnalyzer(modules: [transcriber])
}
12:00 - Get audio format
/// Creates the transcriber and analyzer, then queries the best audio format
/// the analyzer can accept for this module set.
func setUpTranscriber() async throws {
    transcriber = SpeechTranscriber(locale: Locale.current,
                                    transcriptionOptions: [],
                                    reportingOptions: [.volatileResults],
                                    attributeOptions: [.audioTimeRange])
    // The stored property is optional; bail out if construction failed.
    guard let transcriber else {
        throw TranscriptionError.failedToSetupRecognitionStream
    }
    analyzer = SpeechAnalyzer(modules: [transcriber])
    // Incoming microphone buffers are later converted to this format
    // before being fed to the analyzer.
    self.analyzerFormat = await SpeechAnalyzer.bestAvailableAudioFormat(compatibleWith: [transcriber])
}
12:06 - Ensure models
/// Creates the transcriber and analyzer, queries the best audio format, and
/// makes sure the speech-to-text model for the current locale is available.
func setUpTranscriber() async throws {
    transcriber = SpeechTranscriber(locale: Locale.current,
                                    transcriptionOptions: [],
                                    reportingOptions: [.volatileResults],
                                    attributeOptions: [.audioTimeRange])
    // The stored property is optional; bail out if construction failed.
    guard let transcriber else {
        throw TranscriptionError.failedToSetupRecognitionStream
    }
    analyzer = SpeechAnalyzer(modules: [transcriber])
    self.analyzerFormat = await SpeechAnalyzer.bestAvailableAudioFormat(compatibleWith: [transcriber])
    do {
        // Downloads/installs the model assets if they aren't on device yet.
        try await ensureModel(transcriber: transcriber, locale: Locale.current)
    } catch let error as TranscriptionError {
        // Model unavailable (e.g. unsupported locale): log and abort setup.
        print(error)
        return
    }
}
12:15 - Finish SpeechAnalyzer setup
/// Fully configures the live-transcription pipeline: transcriber module,
/// analyzer, preferred audio format, model assets, and finally the
/// analyzer's input stream.
func setUpTranscriber() async throws {
    transcriber = SpeechTranscriber(
        locale: Locale.current,
        transcriptionOptions: [],
        reportingOptions: [.volatileResults],
        attributeOptions: [.audioTimeRange]
    )

    // The stored property is optional; bail out if construction failed.
    guard let transcriber else {
        throw TranscriptionError.failedToSetupRecognitionStream
    }

    analyzer = SpeechAnalyzer(modules: [transcriber])

    // Incoming audio is converted to this format before analysis.
    self.analyzerFormat = await SpeechAnalyzer.bestAvailableAudioFormat(compatibleWith: [transcriber])

    do {
        try await ensureModel(transcriber: transcriber, locale: Locale.current)
    } catch let error as TranscriptionError {
        // Model unavailable (e.g. unsupported locale): log and abort setup.
        print(error)
        return
    }

    // Hand the analyzer an async input sequence; audio buffers are pushed
    // through `inputBuilder` as they arrive.
    (inputSequence, inputBuilder) = AsyncStream<AnalyzerInput>.makeStream()
    guard let inputSequence else { return }
    try await analyzer?.start(inputSequence: inputSequence)
}
12:30 - Check for language support
/// Throws if `locale` isn't supported by SpeechTranscriber.
/// - Throws: `TranscriptionError.localeNotSupported` when unsupported.
public func ensureModel(transcriber: SpeechTranscriber, locale: Locale) async throws {
    guard await supported(locale: locale) else {
        throw TranscriptionError.localeNotSupported
    }
}

/// Whether SpeechTranscriber supports `locale`, compared by BCP-47 identifier.
func supported(locale: Locale) async -> Bool {
    let target = locale.identifier(.bcp47)
    // contains(where:) avoids materializing an intermediate identifier array.
    return await SpeechTranscriber.supportedLocales
        .contains { $0.identifier(.bcp47) == target }
}

/// Whether the model assets for `locale` are already installed on device.
func installed(locale: Locale) async -> Bool {
    let target = locale.identifier(.bcp47)
    // A linear membership test; the original built a Set and then mapped it
    // back to an Array, which gained nothing.
    return await SpeechTranscriber.installedLocales
        .contains { $0.identifier(.bcp47) == target }
}
12:39 - Check for model installation
/// Ensures the speech-to-text model for `locale` is usable: verifies the
/// locale is supported, then downloads the model assets if not yet installed.
/// - Throws: `TranscriptionError.localeNotSupported`, or any download error.
public func ensureModel(transcriber: SpeechTranscriber, locale: Locale) async throws {
    guard await supported(locale: locale) else {
        throw TranscriptionError.localeNotSupported
    }
    // Already on device — nothing to do.
    if await installed(locale: locale) { return }
    try await downloadIfNeeded(for: transcriber)
}

/// Whether SpeechTranscriber supports `locale`, compared by BCP-47 identifier.
func supported(locale: Locale) async -> Bool {
    let target = locale.identifier(.bcp47)
    // contains(where:) avoids materializing an intermediate identifier array.
    return await SpeechTranscriber.supportedLocales
        .contains { $0.identifier(.bcp47) == target }
}

/// Whether the model assets for `locale` are already installed on device.
func installed(locale: Locale) async -> Bool {
    let target = locale.identifier(.bcp47)
    // A linear membership test; the original built a Set and then mapped it
    // back to an Array, which gained nothing.
    return await SpeechTranscriber.installedLocales
        .contains { $0.identifier(.bcp47) == target }
}
12:52 - Download the model
/// Fetches and installs the speech-to-text model assets for `module`.
/// Publishes the installer's progress so the UI can display it; returns
/// without doing anything when no installation is required.
func downloadIfNeeded(for module: SpeechTranscriber) async throws {
    guard let downloader = try await AssetInventory.assetInstallationRequest(supporting: [module]) else {
        return
    }
    self.downloadProgress = downloader.progress
    try await downloader.downloadAndInstall()
}
13:19 - Deallocate an asset
/// Releases every locale this app currently has reserved with AssetInventory,
/// freeing its allocation slots for other locales.
func deallocate() async {
    for reserved in await AssetInventory.allocatedLocales {
        await AssetInventory.deallocate(locale: reserved)
    }
}
13:31 - Speech result handling
// Consume transcriber results as they arrive. Volatile results are shown
// dimmed; finalized results replace them and are persisted to the story.
recognizerTask = Task {
    do {
        for try await case let result in transcriber.results {
            let text = result.text
            guard result.isFinal else {
                // In-progress guess: display it until a final result lands.
                volatileTranscript = text
                volatileTranscript.foregroundColor = .purple.opacity(0.4)
                continue
            }
            // Finalized: clear the volatile buffer to avoid duplicate text.
            finalizedTranscript += text
            volatileTranscript = ""
            updateStoryWithNewText(withFinal: text)
            print(text.audioTimeRange)
        }
    } catch {
        print("speech recognition failed")
    }
}
15:13 - Set up audio recording
/// Starts a recording session: records the output URL, checks microphone
/// authorization, configures the audio session (iOS only), sets up the
/// transcriber, then forwards every captured buffer to it.
func record() async throws {
    self.story.url.wrappedValue = url
    guard await isAuthorized() else {
        print("user denied mic permission")
        return
    }
#if os(iOS)
    try setUpAudioSession()
#endif
    try await transcriber.setUpTranscriber()
    // Pump microphone buffers into the transcriber until the stream ends.
    for await input in try await audioStream() {
        try await self.transcriber.streamAudioToTranscriber(input)
    }
}
15:37 - Set up audio recording via AVAudioEngine
#if os(iOS)
/// Configures the shared audio session for simultaneous playback and
/// recording of spoken audio.
func setUpAudioSession() throws {
    let audioSession = AVAudioSession.sharedInstance()
    try audioSession.setCategory(.playAndRecord, mode: .spokenAudio)
    try audioSession.setActive(true, options: .notifyOthersOnDeactivation)
}
#endif

/// Starts the audio engine and returns a stream of captured PCM buffers.
///
/// The stream's continuation is created and stored *before* the input tap is
/// installed and the engine starts. The original ordering started the engine
/// first, so buffers produced during startup hit `outputContinuation?.yield`
/// while the continuation was still nil and were silently dropped.
private func audioStream() async throws -> AsyncStream<AVAudioPCMBuffer> {
    try setupAudioEngine()

    let (stream, continuation) = AsyncStream.makeStream(
        of: AVAudioPCMBuffer.self,
        bufferingPolicy: .unbounded
    )
    outputContinuation = continuation

    audioEngine.inputNode.installTap(onBus: 0,
                                     bufferSize: 4096,
                                     format: audioEngine.inputNode.outputFormat(forBus: 0)) { [weak self] (buffer, time) in
        guard let self else { return }
        // Persist the raw audio for later playback, then hand it to the stream.
        writeBufferToDisk(buffer: buffer)
        self.outputContinuation?.yield(buffer)
    }

    audioEngine.prepare()
    try audioEngine.start()
    return stream
}
16:01 - Stream audio to SpeechAnalyzer and SpeechTranscriber
/// Converts an incoming audio buffer to the analyzer's preferred format and
/// feeds it into the analyzer's input sequence.
/// - Throws: `TranscriptionError.invalidAudioDataType` when the pipeline
///   isn't set up (no input builder or no analyzer format), or any
///   conversion error.
func streamAudioToTranscriber(_ buffer: AVAudioPCMBuffer) async throws {
    guard let inputBuilder, let analyzerFormat else {
        throw TranscriptionError.invalidAudioDataType
    }
    let convertedBuffer = try self.converter.convertBuffer(buffer, to: analyzerFormat)
    inputBuilder.yield(AnalyzerInput(buffer: convertedBuffer))
}
16:29 - Finalize the transcript stream
// Finalize any remaining volatile results and end the analysis session
// once everything received so far has been processed.
try await analyzer?.finalizeAndFinishThroughEndOfInput()
-
-
- 0:00 - Introduction
Apple is introducing SpeechAnalyzer, a new speech-to-text API and technology in iOS 26 that replaces SFSpeechRecognizer (introduced in iOS 10). SpeechAnalyzer, built with Swift, is faster, more flexible, and supports long-form and distant audio, making it suitable for various use cases such as lectures, meetings, and conversations. The new API enables you to create live transcription features and is already powering system apps like Notes, Voice Memos, and Journal. When combined with Apple Intelligence, it facilitates powerful features like Call Summarization.
- 2:41 - SpeechAnalyzer API
The API design centers around the SpeechAnalyzer class, which manages analysis sessions. By adding a transcriber module, the session becomes a transcription session capable of performing speech-to-text processing. Audio buffers are passed to the analyzer instance, which routes them through the transcriber's speech-to-text model. The model predicts text and metadata, which are returned asynchronously to the application using Swift's async sequences. All API operations are scheduled using timecodes on the audio timeline, ensuring predictable order and independence. The transcriber delivers results in sequence, covering specific audio ranges. An optional feature allows iterative transcription within a range, providing immediate, though less accurate, "volatile results" for faster UI feedback, which are later refined into finalized results. Looking forward in this presentation, a use case is discussed that demonstrates how to create a transcriber module, set the locale, read audio from a file, concatenate results using async sequences, and return the final transcription as an attributed string. The API enables concurrent and asynchronous processing, decoupling audio input from results, and can be expanded to handle more complex needs across different views, models, and view models, which is demonstrated later.
- 7:03 - SpeechTranscriber model
Apple developed a new speech-to-text model for the SpeechTranscriber class, designed to handle various scenarios such as long-form recordings, meetings, and live transcriptions with low latency and high accuracy. The model operates entirely on-device, ensuring privacy and efficiency. It does not increase the app's size or memory usage and automatically updates. You can easily integrate the model into your applications using the AssetInventory API. The SpeechTranscriber class currently supports several languages and is available across most Apple platforms, with a fallback option, DictationTranscriber, provided for unsupported languages or devices.
- 9:06 - Build a speech-to-text feature
In iOS 18, the Notes app has been enhanced with new features that allow people to record and transcribe phone calls, live audio, and recorded audio. These features are integrated with Apple Intelligence to generate summaries. The Speech team developed SpeechAnalyzer and SpeechTranscriber, enabling high-quality, on-device transcription that is fast and accurate, even at distances. You can now use these tools to create your own customized transcription features. An example app is designed for kids; it records and transcribes bedtime stories. The app displays real-time transcription results and highlights the corresponding text segment during audio playback. To implement live transcription in an app, follow three main steps: configure the SpeechTranscriber with the appropriate locale and options, ensure the necessary speech-to-text model is downloaded and installed on the device, and then handle the transcription results as they are received via an AsyncStream. The results include both volatile (real-time guesses) and finalized text, allowing for smooth syncing between text and audio playback. When a finalized result is obtained, the 'volatileTranscript' is cleared, and the result is added to the 'finalizedTranscript' to prevent duplicates. The finalized result is also written to the Story model for later use and visualized with conditional formatting using SwiftUI AttributedString APIs. Set up audio input by requesting permission, starting the 'AVAudioSession', and configuring the AVAudioEngine to return an AsyncStream. The audio is written to disk and passed to the transcriber after being converted to the best available audio format. Upon stopping recording, the audio engine and transcriber are stopped, and any volatile results are finalized. The 'TranscriptView' displays the concatenation of finalized and volatile transcripts during recording and the final transcript from the data model during playback, with words highlighted in time with the audio.
In the example app, Apple Intelligence is utilized to generate a title for the story using the FoundationModels API showing how you can use Apple Intelligence to perform useful transformations on the speech-to-text output. The Speech Framework enables the development of this app with minimal startup time, and further details can be found in its documentation.