-
Conoce el framework Evaluations
Obtén información sobre cómo evaluar experiencias basadas en modelos utilizando el framework Evaluations. En un mundo probabilístico, las pruebas unitarias por sí solas no son suficientes. Descubre cómo definir métricas, calificar automáticamente los resultados y recopilar estadísticas para garantizar que tus funcionalidades basadas en IA funcionen de manera confiable en todas las plataformas de Apple.
Capítulos
- 0:00 - Introducción
- 3:10 - La app de demostración Book Tracker: una evaluación manual
- 4:31 - Cómo crear tu primera evaluación
- 8:06 - Ejecución de la evaluación y lectura del informe
- 10:57 - Cómo crear conjuntos de datos robustos
- 14:20 - Perfeccionamiento de métricas y evaluadores
- 15:41 - Desarrollo por evaluaciones y el método hill-climbing
- 16:12 - Modelos jueces: métricas cualitativas
- 18:42 - Cómo desarrollar un modelo juez
- 21:19 - Perfeccionamiento con dimensiones de puntuación
- 23:45 - Revisión de los resultados de las dimensiones
- 24:20 - Mejores prácticas
- 25:38 - Próximos pasos
Recursos
-
Buscar este video…
-
-
4:54 - Define an Evaluation
// Evaluations
import Evaluations

/// Evaluation type for the book-tagging feature, declared with an empty body
/// as a starting point.
///
/// NOTE(review): the requirements of `Evaluation` are not visible in this
/// snippet — presumably a dataset, evaluators, and metric aggregation are
/// added to this type next; confirm against the framework documentation.
struct BookTaggingEvaluation: Evaluation {
}
8:02 - Run with Swift Testing and an optimization target
// Optimization Target

/// Checks the book-tagging evaluation result against an optimization target.
///
/// The test carries the `.evaluates(evaluation, info: evaluationInfo)` trait
/// and reads the evaluation result from `EvaluationContext.current`.
@Test("Book Tag Evaluations", .evaluates(evaluation, info: evaluationInfo))
func evaluateBookTagging() async throws {
    let result = EvaluationContext.current.result
    let rangeMetric = BookTagEvaluationTests.evaluation.tagCount

    // Target: the mean of the tag-count metric must be at least 0.8.
    #expect(result.aggregateValue(.mean(of: rangeMetric)) >= 0.8)
}
10:09 - Constrain output with a Generable @Guide
// BookTags.swift

/// Generable output type for the tagging feature: the tags produced for one book.
@Generable
struct BookTags: Codable {
    /// Tags for the book; the guide constrains the array to 3...8 entries.
    @Guide(description: "Descriptive tags capturing themes, genres, moods, and topics from the summary", .count(3...8))
    var tags: [String]
}
11:15 - Define the dataset with ModelSample
// BookTaggingEvaluation

// Hand-written dataset: each sample pairs a reader's review (the prompt)
// with the tags expected for that book.
var dataset = ArrayLoader(samples: [
    ModelSample(
        prompt: "okay I am OBSESSED and I need everyone to read this RIGHT NOW...",
        expected: BookTags(tags: ["classic", "romance", "wit", "regency"])
    ),
    ModelSample(
        prompt: "Read this in one sitting between midnight and 4am and I cannot...",
        expected: BookTags(tags: ["classic", "gothic", "horror", "vampire", "suspense"])
    ),
])

// Or load your whole library:
// NOTE(review): this is an alternative to the declaration above. Both declare
// `dataset`, so keep only one of them in the same scope.
var dataset = ArrayLoader(
    samples: Book.sampleBooks.map { book in
        ModelSample(prompt: book.review, expected: BookTags(tags: book.tags))
    }
)
12:53 - Synthesize more samples with a SampleGenerator
// Synthesizing more inputs

// Seed samples: a sentence prefix as the prompt and its completion as the
// expected value. Declared `var` because generated samples are appended below.
var samples: [ModelSample<String>] = [
    ModelSample(prompt: "The largest planet in our solar system...", expected: "Jupiter."),
    ModelSample(prompt: "The capital of Thailand...", expected: "Bangkok."),
    ModelSample(prompt: "Swift is...", expected: "a powerful programming language."),
    ModelSample(prompt: "All those moments will be lost in time...", expected: "Like tears in rain.")
]

// Request generated samples from the seed set and append each one as it arrives.
// NOTE(review): presumably `targetCount` is the number of samples to generate —
// confirm against the `makeSamples` documentation.
for try await sample in samples.makeSamples(
    """
    Generate diverse sentence completions about the listed topics:
    - The Solar System
    - World Capitals
    """,
    targetCount: 1000
) {
    samples.append(sample)
}
14:02 - More evaluators: word count and genre
// Passes only when every tag is a single word (contains no space).
let wordCount = Metric("WordCount")
Evaluator { _, subject in
    // Report the first offending tag in the failure rationale.
    if let offender = subject.value.tags.first(where: { $0.contains(" ") }) {
        return wordCount.failing(rationale: "Tag \(offender) contains multiple words")
    }
    return wordCount.passing()
}

// Passes when at least one tag, compared in lowercase, is a known genre.
let hasGenreTag = Metric("HasGenreTag")
Evaluator { _, subject in
    let tags = subject.value.tags.map { $0.lowercased() }
    let knownGenres = await BookTaggingService.knownGenres
    // Report the first matching genre in the passing rationale.
    if let match = tags.first(where: { knownGenres.contains($0) }) {
        return hasGenreTag.passing(rationale: "Matched \(match)")
    }
    return hasGenreTag.failing()
}
14:03 - Define a Metric and Evaluator
let tagCount = Metric("TagCount")

/// Evaluators applied to each generated result.
var evaluators: Evaluators {
    // Tag count is within the required 3–8 range
    Evaluator { _, subject in
        let count = subject.value.tags.count
        guard (3...8).contains(count) else {
            return tagCount.failing(rationale: "Got \(count) tags, expected 3–8")
        }
        return tagCount.passing(rationale: "\(count) tags")
    }
}
14:27 - Aggregate metrics across samples
let tagCount = Metric("TagCount")
let tagTotal = Metric("TagTotal")

/// Registers the statistics to compute over the recorded metric values.
///
/// - Parameter aggregator: The aggregator that receives the requested statistics.
func aggregateMetrics(using aggregator: inout MetricsAggregator) {
    // Mean of the tag-count metric.
    aggregator.computeMean(of: tagCount)

    // Statistics for the tag total, collected under one named group.
    aggregator.group("Distribution of Tag Totals") { aggregator in
        aggregator.computeStandardDeviation(of: tagTotal)
        aggregator.computeMean(of: tagTotal)
        aggregator.computeVariance(of: tagTotal)
    }
}
15:33 - Iterate the feature's instructions (hill-climbing)
// BookTaggingService.swift

// Instructions for the tagging feature: the role, the tagging rules, and the
// known genres interpolated from `Self.knownGenres`.
// NOTE(review): the line breaks inside the literal were lost in the flattened
// source and are restored here at the visible list structure — confirm against
// the original sample.
let instructions = Instructions {
    """
    You are a librarian and literary analyst. Given a reader's freeform summary of a book they read — describing their thoughts, feelings, and what stood out — generate a set of descriptive tags reflected in the summary.

    Rules:
    - Return between 3 and 8 tags.
    - Tags should be lowercase, concise (single word or hyphenated), and descriptive.
    - Tags should include the book's genre, chosen from the included list of known genres.

    Known Genres:
    - \(Self.knownGenres.joined(separator: ", "))
    """
}
18:53 - Build a model judge
// Model judge named "TagQuality": scores on a numeric 1–4 scale using the
// rubric below, judged by a `PrivateCloudComputeLanguageModel`.
ModelJudgeEvaluator(
    "TagQuality",
    scale: .numeric([
        4: "Tags are relevant and helpful for browsing",
        3: "Mostly relevant, one tag too vague or generic",
        2: "Several tags are wrong or generic",
        1: "Unhelpful or irrelevant"
    ]),
    judge: PrivateCloudComputeLanguageModel()
)
22:17 - Split into score dimensions
// BookTaggingEvaluation.swift

// "Relevance" score dimension with its own description and 1–4 rubric.
// NOTE(review): a later snippet passes `relevance` and `usefulness` to the
// judge — presumably this value is bound to a constant named `relevance`; confirm.
ScoreDimension(
    "Relevance",
    description: """
        Whether each tag describes a quality, theme, or tone of the book itself rather than incidental details or the reader's personal reactions.
        """,
    scale: .numeric([
        4: "Every tag describes the book itself",
        3: "Most tags describe the book",
        2: "Some tags describe personal reactions",
        1: "Tags don't meaningfully describe the book"
    ])
)

// Define `usefulness` the same way as a second ScoreDimension.
22:32 - Add dimensions to the judge
// BookTaggingEvaluation.swift

/// Evaluators for the evaluation: three code-based evaluators followed by a
/// model judge that scores the `relevance` and `usefulness` dimensions.
var evaluators: Evaluators {
    // Bodies are elided in this snippet.
    // NOTE(review): presumably the tag-count, word-count, and genre evaluators — confirm.
    Evaluator { }
    Evaluator { }
    Evaluator { }

    ModelJudgeEvaluator(
        judge: PrivateCloudComputeLanguageModel(),
        dimensions: [relevance, usefulness]
    )
}
23:17 - Add app context with a ModelJudgePrompt
// BookTaggingEvaluation.swift

// Model judge with a custom prompt that adds app context, a formatted
// evaluation target, and the expected tags as reference material.
ModelJudgeEvaluator(
    judge: PrivateCloudComputeLanguageModel(),
    dimensions: [relevance, usefulness],
    prompt: ModelJudgePrompt(
        instructions: """
            You are evaluating tags generated for a personal book-tracking app where users organize their library by browsing and filtering tags.
            """,
        // Presents the generated tags as "<count> Generated tags: a, b, c".
        evaluationTarget: { value in
            "\(value.tags.count) Generated tags: " + value.tags.joined(separator: ", ")
        },
        // Supplies the expected tags, or a fallback message when the sample has none.
        reference: { input, _ in
            let expectedTags = input.expected?.tags.joined(separator: ", ")
            return ["Expected Tags": expectedTags ?? "No expected tags defined"]
        }
    )
)
-