-
Améliorez vos prompts grâce à l’optimisation par hill-climbing avec Evaluations
Découvrez des techniques d'évaluation comparative pour guider votre ingénierie des prompts et sélectionner le modèle adapté à votre app. Découvrez comment définir une référence de performance, élargir votre stratégie d'évaluation et convertir les résultats en JSON pour les intégrer à d'autres outils. Découvrez quand appliquer différentes stratégies de prompt et comment affiner les prompts de manière itérative pour obtenir les meilleurs résultats.
Chapitres
- 0:00 - Introduction
- 2:42 - Problème de balisage de BookTracker
- 5:27 - Analyse des résultats de l’évaluation
- 8:26 - Écart entre le juge et l’humain
- 9:37 - Mesure de la dérive à l’aide du coefficient kappa de Cohen
- 12:26 - Élaboration d’une évaluation de l’alignement du juge
- 15:16 - Analyse des échecs d’alignement
- 17:16 - Évaluation comparative : contrôle ou expérimental
- 19:12 - Affiner les dimensions d’évaluation
- 21:23 - Ajout d’exemples few-shot au juge
- 23:38 - Au-delà des prompts : ajouter un outil
- 27:17 - Étapes suivantes
Ressources
-
Rechercher dans cette vidéo…
-
-
3:54 - The BookTaggingEvaluation
// MARK: - Evaluation

/// Scores the tags that `BookTaggingService` produces for each sample book review.
///
/// Three heuristic evaluators check tag count, presence of a known genre, and
/// duplicates; a model judge then rates the tag set on the `relevance` and
/// `usefulness` dimensions.
struct BookTaggingEvaluation: Evaluation {
    /// Generates tags for the sample's prompt and wraps them as the evaluated subject.
    func subject(from sample: ModelSample<BookTags>) async throws -> ModelSubject<BookTags> {
        let generated = try await BookTaggingService.generateTags(for: sample.promptDescription)
        return ModelSubject(value: generated)
    }

    // MARK: - Dataset

    /// One sample per entry of `Book.sampleBooks`: the review is the prompt and
    /// the book's own tags are the expected value.
    var dataset = ArrayLoader(
        samples: Book.sampleBooks.map { book in
            ModelSample(prompt: book.review, expected: BookTags(tags: book.tags))
        }
    )

    // MARK: - Evaluators & Metrics

    var tagCount = Metric("Tag Count")
    let hasGenreTag = Metric("Has Genre Tag")
    let noDuplicates = Metric("No Duplicates")

    /// Judge dimension: do the tags describe the book rather than the reader?
    let relevance = ScoreDimension(
        "Relevance",
        description: """
            Whether each tag describes a quality, theme, or tone of the book itself
            rather than incidental details or the reader's personal reactions.
            """,
        scale: .numeric([
            4: "Every tag describes the book itself",
            3: "Most tags describe the book, one picks up a reader reaction or minor detail",
            2: "Most tags are surface details or personal reactions, not book descriptors",
            1: "Tags don't meaningfully describe the book"
        ])
    )

    /// Judge dimension: are the tags at a granularity that helps browsing?
    let usefulness = ScoreDimension(
        "Usefulness",
        description: """
            Whether tags are at the right granularity for browsing — broad enough
            that multiple books could share the tag, specific enough to help filter.
            """,
        scale: .numeric([
            4: "Every tag could group multiple books while still narrowing a search",
            3: "Most tags are at the right level, one is either too broad or too narrow",
            2: "Most tags are too broad to filter or too narrow to group",
            1: "Tags would not help with browsing"
        ])
    )

    var evaluators: Evaluators {
        // 1. Tag count is within the required 3–8 range
        Evaluator { _, subject in
            let count = subject.value.tags.count
            guard (3...8).contains(count) else {
                return tagCount.failing(rationale: "Got \(count) tags, expected 3–8")
            }
            return tagCount.passing(rationale: "\(count) tags")
        }

        // 2. At least one tag identifies the genre or literary form
        Evaluator { _, subject in
            let lowercasedTags = subject.value.tags.map { $0.lowercased() }
            let knownGenres = await BookTaggingService.knownGenres
            if let genre = lowercasedTags.first(where: { knownGenres.contains($0) }) {
                return hasGenreTag.passing(rationale: "Matched \(genre)")
            }
            return hasGenreTag.failing()
        }

        // 3. No duplicate tags (compared case-insensitively)
        Evaluator { _, subject in
            let tags = subject.value.tags
            let duplicateCount = tags.count - Set(tags.map { $0.lowercased() }).count
            guard duplicateCount > 0 else {
                return noDuplicates.passing()
            }
            return noDuplicates.failing(rationale: "Found \(duplicateCount) duplicates")
        }

        // 4. Overall tag quality — a model judge scores the relevance and usefulness dimensions
        // NOTE(review): this prompt names the app "Shelf", while the other judge prompts
        // say "Book Tracker" — confirm which name is intended.
        ModelJudgeEvaluator(
            judge: .default,
            dimensions: [relevance, usefulness],
            prompt: ModelJudgePrompt(
                instructions: """
                    You are evaluating automatically generated tags for Shelf, a personal
                    book tracking app. Users write a short summary of their reading
                    experience, and the app generates tags to make their library browsable.
                    A good tag describes the book itself — its genre, themes, tone, or
                    setting. A bad tag picks up incidental details or the reader's personal
                    reactions that don't describe the book.
                    """,
                evaluationTarget: { $0.tags.joined(separator: ", ") },
                reference: { input, _ in
                    ["Expected Tags": input.expected?.tags.joined(separator: ", ") ?? ""]
                }
            )
        )
    }

    // MARK: - Analysis

    /// Reports the mean of every metric, grouped into heuristic checks and judged quality.
    func aggregateMetrics(using aggregator: inout MetricsAggregator) {
        aggregator.group("Heuristics") { group in
            group.computeMean(of: tagCount)
            group.computeMean(of: hasGenreTag)
            group.computeMean(of: noDuplicates)
        }

        aggregator.group("Quality") { group in
            group.computeMean(of: relevance.metric)
            group.computeMean(of: usefulness.metric)
        }
    }
}
4:05 - Refined Relevance & Usefulness score dimensions
/// Refined relevance dimension: spells out what counts as describing the book
/// (genre, themes, tone, setting) versus reader reactions, meta-commentary, or
/// author facts, and calls out genre mislabeling as a serious failure.
let relevance = ScoreDimension(
    "Relevance",
    description: """
        Whether each tag describes the book itself — its genre, themes,
        tone, or setting — rather than the reader's reactions, meta-
        commentary about the review, or facts about the author. A book
        can be "suspenseful" (a property of the text); a reader is
        "exhausted" (a reaction). Mis-labeling the genre is a serious
        failure.
        """,
    scale: .numeric([
        4: "Every tag describes the book itself",
        3: "Most tags describe the book, one picks up a reader reaction or minor detail",
        2: "Most tags are surface details or personal reactions, not book descriptors",
        1: "Tags don't meaningfully describe the book"
    ])
)

/// Refined usefulness dimension: frames tags as library shelf labels and lists
/// the kinds of tags that fail (made-up phrases, character names, hyper-specific
/// descriptors, overly generic words).
let usefulness = ScoreDimension(
    "Usefulness",
    description: """
        Whether tags work as library shelf labels — broad enough that
        several books could plausibly share the tag, specific enough to
        meaningfully narrow a search. Standard genre and theme tags work;
        made-up phrases, character names, hyper-specific descriptors, and
        overly generic words like "interesting" don't.
        """,
    scale: .numeric([
        4: "Every tag could group multiple books while still narrowing a search",
        3: "Most tags are at the right level, one is either too broad or too narrow",
        2: "Most tags are too broad to filter or too narrow to group",
        1: "Tags would not help with browsing"
    ])
)
11:56 - The alignment dataset, extracted to JSON
// Model judge alignment dataset
[
  {
    "input": "I have read this book more times than I can count…",
    "response": "[\"literary-fiction\", \"historical-fiction\", \"family-drama\", \"romantic-drama\", \"character-driven\", \"emotional-intensity\", \"multigenerational-narrative\", \"penned-by-a-woman\"]"
  }
  // ... add your expert ratings to each entry
]
12:31 - The judge alignment evaluation: dataset, subject, evaluator
// Model judge alignment evaluation

/// Runs the model judge over previously extracted review/tag pairs that carry
/// expert relevance and usefulness scores.
///
/// NOTE(review): `relevance` and `usefulness` are referenced below but not declared
/// in this excerpt — presumably they are declared elsewhere on this type; confirm.
struct BookTagJudgmentCalibration: Evaluation {
    // MARK: Dataset — load the extracted summary/tag pairs

    /// Samples built from the bundled "BookTaggingEvaluation-extracted.json" resource.
    ///
    /// NOTE(review): when the resource is missing or unreadable this yields an empty
    /// array instead of reporting an error, so the evaluation would run on no samples.
    static let samples: [ModelSample<BookTagJudgmentValue>] = {
        guard let url = Bundle(for: BundleToken.self).url(
            forResource: "BookTaggingEvaluation-extracted", withExtension: "json"),
            let data = try? Data(contentsOf: url)
        else { return [] }
        // Build ModelSample array (adding expert ratings)
        // ...
    }()

    /// Serves the statically loaded samples.
    var dataset: some Loader { ArrayLoader(samples: Self.samples) }

    // MARK: Capture Subject — tags are already generated, so just return them

    /// Returns the sample's expected value as the subject; a sample without one
    /// falls back to empty tags with both expert scores set to 0.
    func subject(from sample: ModelSample<BookTagJudgmentValue>) async throws -> ModelSubject<BookTagJudgmentValue> {
        ModelSubject(value: sample.expected ?? BookTagJudgmentValue(
            tags: [], expertRelevanceScore: 0, expertUsefulnessScore: 0))
    }

    // MARK: Evaluators — the same model judge as the book-tags evaluation

    /// A single model judge scoring the tag list on the relevance and usefulness dimensions.
    var evaluators: Evaluators {
        ModelJudgeEvaluator(
            judge: .default,
            dimensions: [relevance, usefulness],
            prompt: ModelJudgePrompt(
                instructions: "You are evaluating automatically generated tags for Book Tracker…",
                // The judge sees the tags as one comma-separated string.
                evaluationTarget: { output in output.tags.joined(separator: ", ") },
                reference: { input, _ in ["Expected Tags": input.expected?.tags.joined(separator: ", ") ?? ""] }
            )
        )
    }
}
13:00 - Cohen's kappa aggregation
/// Aggregates the judge's scores per dimension: mean, standard deviation, and a
/// custom alignment score computed with `cohensKappa` against the expert ratings.
///
/// - Parameter aggregator: The aggregator that receives the "Relevance" and
///   "Usefulness" metric groups.
func aggregateMetrics(using aggregator: inout MetricsAggregator) {
    // Expert ratings in sample order; a sample without an expected value contributes 0.
    let expertRelevance = Self.samples.map { sample in
        Double(sample.expected?.expertRelevanceScore ?? 0)
    }
    let expertUsefulness = Self.samples.map { sample in
        Double(sample.expected?.expertUsefulnessScore ?? 0)
    }

    aggregator.group("Relevance") { group in
        group.computeMean(of: relevance.metric)
        group.computeStandardDeviation(of: relevance.metric)
        group.custom(of: relevance.metric, label: "Relevance Alignment Score") { judgeScores in
            // A nil kappa is reported as 0.
            cohensKappa(ratings1: expertRelevance, ratings2: judgeScores) ?? 0
        }
    }

    aggregator.group("Usefulness") { group in
        group.computeMean(of: usefulness.metric)
        group.computeStandardDeviation(of: usefulness.metric)
        group.custom(of: usefulness.metric, label: "Usefulness Alignment Score") { judgeScores in
            // A nil kappa is reported as 0.
            cohensKappa(ratings1: expertUsefulness, ratings2: judgeScores) ?? 0
        }
    }
}
13:24 - The judge calibration test
// Model judge alignment tests

/// Checks that the model judge agrees with the expert ratings closely enough.
@Suite("Book Tag Judge Calibration")
struct BookTagJudgmentCalibrationTests {
    static let evaluation = BookTagJudgmentCalibration()

    /// Requires the alignment score of each judged dimension to exceed 0.6.
    @Test("Judge Calibration", .evaluates(evaluation))
    func evaluateJudgeCalibration() async throws {
        let result = EvaluationContext.current.result

        // The labels must match the ones registered for the custom aggregates in
        // `aggregateMetrics(using:)`: "Relevance Alignment Score" and
        // "Usefulness Alignment Score".
        #expect(result.aggregateValue(.custom(label: "Relevance Alignment Score")) > 0.6)
        #expect(result.aggregateValue(.custom(label: "Usefulness Alignment Score")) > 0.6)
    }
}
16:33 - The experimental judge prompt
// Experimental evaluation

/// Calibration variant with a richer judge prompt: it describes good tags and
/// common failure modes, and passes the book review to the judge as reference.
struct BookTagJudgmentCalibrationExperimental: Evaluation {
    var evaluators: Evaluators {
        ModelJudgeEvaluator(
            judge: .default,
            dimensions: [relevance, usefulness],
            prompt: ModelJudgePrompt(
                instructions: """
                    You are an experienced reader and librarian evaluating tags
                    automatically generated for Book Tracker...

                    Score the tag set on two independent dimensions: Relevance and Usefulness.

                    ## What a good tag looks like
                    - Genre/form, theme/subject, tone/atmosphere, setting/era

                    ## Common failure modes
                    - Reader reactions, meta-commentary, author facts, genre contradictions
                    """, // ← full prompt is ~40 lines; abbreviated here
                evaluationTarget: { $0.tags.joined(separator: ", ") },
                // The judge receives both the original review and the generated tags.
                reference: { input, _ in
                    [
                        "Book Review": input.promptDescription,
                        "Tags Generated for the Review": input.expected?.tags.joined(separator: ", ") ?? ""
                    ]
                }
            )
        )
    }
}
20:12 - Few-shot worked examples in the judge prompt
/// Calibration variant whose judge prompt includes few-shot worked examples
/// scored by the expert librarian.
struct ExperimentalBookTagJudgmentCalibration: Evaluation {
    var evaluators: Evaluators {
        ModelJudgeEvaluator(
            judge: SystemLanguageModel(),
            dimensions: [relevance, usefulness],
            prompt: ModelJudgePrompt(
                instructions: """
                    You are calibrating with an expert librarian who scores automatically
                    generated tags for Book Tracker... Your goal is to match how the
                    librarian scores. Use the worked examples to calibrate.

                    ## Worked examples

                    ### Example A — clean fit (Pride and Prejudice)
                    Tags: romance, historical-fiction, love, redemption, passion
                    Librarian: Relevance 4, Usefulness 4

                    ### Example E — flat genre contradiction (Frankenstein)
                    Tags: horror, science-fiction, ... self-help, self-improvement
                    Librarian: Relevance 2, Usefulness 3

                    ... (6 examples A–F; keep the set small to avoid overfitting)
                    """, // ← full prompt is ~60 lines; abbreviated here
                evaluationTarget: { $0.tags.joined(separator: ", ") },
                // The judge receives both the original review and the generated tags.
                reference: { input, _ in
                    [
                        "Book Review": input.promptDescription,
                        "Tags Generated for the Review": input.expected?.tags.joined(separator: ", ") ?? ""
                    ]
                }
            )
        )
    }
}
22:03 - The BookLookupTool
// Book Information Lookup Tool

/// A tool that identifies a book from details quoted in a reader's review by
/// keyword-matching those details against the reviews in `Book.sampleBooks`.
struct BookLookupTool: Tool {
    let name = "lookupBook"
    let description = "Looks up the title and author of a book given distinguishing details — such as character names, settings, quoted lines, or notable plot points — extracted from a reader's review."

    @Generable
    struct Arguments {
        @Guide(description: "Distinguishing details from the review that identify the book, such as character names, settings, quoted lines, or notable plot points.")
        var details: String
    }

    @Generable
    struct Output {
        @Guide(description: "The title of the identified book, or an empty string if no match was found.")
        var title: String

        @Guide(description: "The author of the identified book, or an empty string if no match was found.")
        var author: String
    }

    /// Returns the sample book whose review contains the most keywords from
    /// `arguments.details`, or empty strings when no keyword matches any review.
    func call(arguments: Arguments) async throws -> Output {
        // Keywords are lowercased runs of letters/digits that are at least four characters long.
        let keywords = arguments.details
            .lowercased()
            .split { !($0.isLetter || $0.isNumber) }
            .filter { $0.count >= 4 }
            .map(String.init)

        // Keep the first book with the highest score; later ties do not replace it.
        var best: (book: Book, score: Int)?
        for book in Book.sampleBooks {
            let review = book.review.lowercased()
            let score = keywords.filter { review.contains($0) }.count
            if let current = best, current.score >= score {
                continue
            }
            best = (book, score)
        }

        guard let best, best.score > 0 else {
            return Output(title: "", author: "")
        }
        return Output(title: best.book.title, author: best.book.author)
    }
}
22:36 - BookTaggingService with a tools parameter
// Book Tagging Service

/// Generates `BookTags` for a reader's review using a language model session.
struct BookTaggingService {
    /// Generates tags for a review.
    ///
    /// - Parameters:
    ///   - review: The reader's review text, used to build the prompt.
    ///   - tools: Tools made available to the session; none by default.
    /// - Returns: The `BookTags` content of the model's response.
    /// - Throws: Any error thrown while the session produces its response.
    static func generateTags(for review: String, tools: [any Tool] = []) async throws -> BookTags {
        let prompt = tagsPrompt(review: review)
        let model = SystemLanguageModel(guardrails: .permissiveContentTransformations)
        let session = LanguageModelSession(model: model, tools: tools, instructions: instructions)
        return try await session.respond(to: prompt, generating: BookTags.self).content
    }
}
22:57 - Evaluation with the lookup tool
// Evaluation of tags with tool

/// Variant of the book-tagging evaluation that gives the tagging service a `BookLookupTool`.
struct BookTaggingWithLookupEvaluation: Evaluation {
    /// Generates tags for the sample's prompt with the lookup tool available.
    func subject(from sample: ModelSample<BookTags>) async throws -> ModelSubject<BookTags> {
        let tags = try await BookTaggingService.generateTags(
            for: sample.promptDescription,
            tools: [BookLookupTool()]
        )
        return ModelSubject(value: tags)
    }

    // ... same dataset, evaluators, and aggregation as BookTaggingEvaluation
}
23:09 - Compare with/without the tool in one suite
/// Runs the book-tagging evaluation with and without `BookLookupTool` in one suite
/// and applies the same thresholds to both runs.
@Suite("Book Tag Evaluations")
struct BookTagEvaluationTests {
    static let evaluation = BookTaggingEvaluation()
    static let lookupEvaluation = BookTaggingWithLookupEvaluation()

    /// Baseline: mean tag-count pass rate of at least 0.8 and a no-duplicates mean of exactly 1.
    @Test("Book Tag Evaluations", .evaluates(evaluation, info: evaluationInfo))
    func evaluateBookTagging() async throws {
        let result = EvaluationContext.current.result
        let rangeMetric = Self.evaluation.tagCount
        let dupeMetric = Self.evaluation.noDuplicates

        #expect(result.aggregateValue(.mean(of: rangeMetric)) >= 0.8)
        #expect(result.aggregateValue(.mean(of: dupeMetric)) == 1)
    }

    /// Same thresholds, applied to the run that has the lookup tool.
    @Test("Book Tag Evaluations (with BookLookupTool)", .evaluates(lookupEvaluation, info: lookupEvaluationInfo))
    func evaluateBookTaggingWithLookup() async throws {
        let result = EvaluationContext.current.result
        let rangeMetric = Self.lookupEvaluation.tagCount
        let dupeMetric = Self.lookupEvaluation.noDuplicates

        #expect(result.aggregateValue(.mean(of: rangeMetric)) >= 0.8)
        #expect(result.aggregateValue(.mean(of: dupeMetric)) == 1)
    }
}
-