Découvrez le framework Evaluations

Découvrez le framework Evaluations

Apprenez à évaluer des expériences fondées sur des modèles à l'aide du framework Evaluations. Dans un monde probabiliste, les tests unitaires ne suffisent pas. Découvrez comment définir des métriques, évaluer automatiquement les résultats et agréger des statistiques pour garantir que vos fonctionnalités d'IA fonctionnent de manière fiable sur les plateformes Apple.

Chapitres
- 0:00 - Introduction
- 3:10 - App de démonstration Book Tracker : une évaluation manuelle
- 4:31 - Créer votre première évaluation
- 8:06 - Exécuter l’évaluation et analyser le rapport
- 10:57 - Créer des jeux de données robustes
- 14:20 - Affiner les métriques et les évaluateurs
- 15:41 - Développement piloté par l’évaluation et hill-climbing
- 16:12 - Juges de modèles : métriques qualitatives
- 18:42 - Créer un juge de modèle
- 21:19 - Affiner avec des dimensions de score
- 23:45 - Analyser les résultats par dimension
- 24:20 - Bonnes pratiques
- 25:38 - Étapes suivantes
Ressources

// Evaluations
  import Evaluations

  /// Evaluation type for the book-tagging feature, declared as conforming to
  /// `Evaluation`. The body is empty in this snippet.
  // NOTE(review): members such as `dataset` and `evaluators` are presumably
  // added to this type in the later snippets — confirm.
  struct BookTaggingEvaluation: Evaluation {
  
  }

8:02 - Run with Swift Testing and an optimization target

// Optimization Target
  /// Swift Testing entry point for the book-tagging evaluation.
  ///
  /// Reads the result from the current evaluation context and expects the mean
  /// of the `tagCount` metric to be at least 0.8.
  @Test("Book Tag Evaluations", .evaluates(evaluation, info: evaluationInfo))
  func evaluateBookTagging() async throws {
      // Result exposed by the current evaluation context.
      let outcome = EvaluationContext.current.result

      // Metric whose mean is checked against the target.
      let tagCountMetric = BookTagEvaluationTests.evaluation.tagCount

      #expect(outcome.aggregateValue(.mean(of: tagCountMetric)) >= 0.8)
  }

10:09 - Constrain output with a Generable @Guide

// BookTags.swift
  /// Generable, codable container for the tags produced from a book summary.
  @Generable
  struct BookTags: Codable {
      /// The generated tags. The guide describes what they should capture and
      /// constrains the array to between 3 and 8 entries via `.count(3...8)`.
      @Guide(description: "Descriptive tags capturing themes, genres, moods, and topics from the summary", .count(3...8))
      var tags: [String]
  }

11:15 - Define the dataset with ModelSample

// BookTaggingEvaluation
  // Inline dataset: each ModelSample pairs a reader-review prompt with the
  // BookTags expected for that review.
  var dataset = ArrayLoader(samples: [
      ModelSample(prompt: "okay I am OBSESSED and I need everyone to read this RIGHT NOW...",
                  expected: BookTags(tags: ["classic", "romance", "wit", "regency"])),

      ModelSample(prompt: "Read this in one sitting between midnight and 4am and I cannot...",
                  expected: BookTags(tags: ["classic", "gothic", "horror", "vampire", "suspense"])),
  ])
  
  // Alternative: build the dataset from every sample book, using each book's
  // review as the prompt and its tags as the expected output.
  var dataset = ArrayLoader(samples:
      Book.sampleBooks.map {
          ModelSample(prompt: $0.review, expected: BookTags(tags: $0.tags))
      }
  )

12:53 - Synthesize more samples with a SampleGenerator

// Synthesizing more inputs
  // Hand-written seed samples: each pairs a prompt with its expected completion.
  // Declared with `var` because the loop below appends to this array; a `let`
  // constant cannot be mutated and would not compile.
  var samples: [ModelSample<String>] = [
      ModelSample(prompt: "The largest planet in our solar system...", expected: "Jupiter."),
      ModelSample(prompt: "The capital of Thailand...", expected: "Bangkok."),
      ModelSample(prompt: "Swift is...", expected: "a powerful programming language."),
      ModelSample(prompt: "All those moments will be lost in time...", expected: "Like tears in rain.")
  ]

  // Request additional samples from the seeds plus the guidance text, and
  // collect each one as the asynchronous sequence yields it.
  for try await sample in samples.makeSamples(
      """
      Generate diverse sentence completions about the listed topics:
        - The Solar System
        - World Capitals 
      """,
      targetCount: 1000) {
      samples.append(sample)
  }

14:02 - More evaluators: word count and genre

// Metric reporting whether every generated tag is a single word.
  let wordCount = Metric("WordCount")

  Evaluator { _, subject in
      // Fail on the first tag that contains a space; otherwise pass.
      if let offendingTag = subject.value.tags.first(where: { $0.contains(" ") }) {
          return wordCount.failing(rationale: "Tag \(offendingTag) contains multiple words")
      }
      return wordCount.passing()
  }

  // Metric reporting whether at least one generated tag is a known genre.
  let hasGenreTag = Metric("HasGenreTag")

  Evaluator { _, subject in
      // Compare case-insensitively by lowercasing the generated tags first.
      let loweredTags = subject.value.tags.map { $0.lowercased() }
      let knownGenres = await BookTaggingService.knownGenres

      // Pass on the first tag found among the known genres; fail if none match.
      guard let matchedGenre = loweredTags.first(where: { knownGenres.contains($0) }) else {
          return hasGenreTag.failing()
      }
      return hasGenreTag.passing(rationale: "Matched \(matchedGenre)")
  }

14:03 - Define a Metric and Evaluator

// Metric reporting whether the number of generated tags is within range.
  let tagCount = Metric("TagCount")

  var evaluators: Evaluators {

      // Passes when the subject has between 3 and 8 tags (inclusive).
      Evaluator { _, subject in
          let numberOfTags = subject.value.tags.count
          guard (3...8).contains(numberOfTags) else {
              return tagCount.failing(rationale: "Got \(numberOfTags) tags, expected 3–8")
          }
          return tagCount.passing(rationale: "\(numberOfTags) tags")
      }
  }

14:27 - Aggregate metrics across samples

// Metrics that are aggregated across samples.
  let tagCount = Metric("TagCount")
  let tagTotal = Metric("TagTotal")

  /// Asks the aggregator for the mean of `tagCount`, then, inside a group
  /// titled "Distribution of Tag Totals", for the standard deviation, mean and
  /// variance of `tagTotal`.
  ///
  /// - Parameter aggregator: The aggregator that receives the requests.
  func aggregateMetrics(using aggregator: inout MetricsAggregator) {
      aggregator.computeMean(of: tagCount)

      // The closure parameter is the aggregator scoped to this group.
      aggregator.group("Distribution of Tag Totals") { distribution in
          distribution.computeStandardDeviation(of: tagTotal)
          distribution.computeMean(of: tagTotal)
          distribution.computeVariance(of: tagTotal)
      }
  }

15:33 - Iterate the feature's instructions (hill-climbing)

// BookTaggingService.swift
  // Instructions for the tagging model: a librarian / literary-analyst role,
  // the tagging rules (3 to 8 lowercase, concise tags including a genre), and
  // the known-genre list interpolated from `Self.knownGenres`.
  let instructions = Instructions {
      """
      You are a librarian and literary analyst. Given a reader's
      freeform summary of a book they read — describing their
      thoughts, feelings, and what stood out — generate a set of
      descriptive tags reflected in the summary.

      Rules:
       - Return between 3 and 8 tags.
       - Tags should be lowercase, concise (single word or hyphenated), and descriptive.
       - Tags should include the book's genre, chosen from the included list of known genres.
  
      Known Genres:
       - \(Self.knownGenres.joined(separator: ", "))
      """
  }

18:53 - Build a model judge

// Model judge named "TagQuality", scored on a numeric scale from 1 to 4 where
// each level carries the description given below. The judge model is a
// `PrivateCloudComputeLanguageModel` instance.
ModelJudgeEvaluator(
      "TagQuality",
      scale: .numeric([
          4: "Tags are relevant and helpful for browsing",
          3: "Mostly relevant, one tag too vague or generic",
          2: "Several tags are wrong or generic",
          1: "Unhelpful or irrelevant"
      ]),   
      judge: PrivateCloudComputeLanguageModel()
  )

22:17 - Split into score dimensions

// BookTaggingEvaluation.swift
  // "Relevance" score dimension: a description of what is being judged plus a
  // numeric scale from 1 to 4 with a description for each level.
  // NOTE(review): later snippets pass `relevance` in a `dimensions:` array, so
  // this value is presumably bound to that name where it is declared — confirm.
  ScoreDimension(
      "Relevance",
      description: """
          Whether each tag describes a quality, theme, or tone
          of the book itself rather than incidental details or
          the reader's personal reactions.
          """,
      scale: .numeric([
          4: "Every tag describes the book itself",
          3: "Most tags describe the book",
          2: "Some tags describe personal reactions",
          1: "Tags don't meaningfully describe the book"
      ])    
  )
  // Define `usefulness` the same way, as a second ScoreDimension.

22:32 - Add dimensions to the judge

// BookTaggingEvaluation.swift
  // Evaluators for the evaluation: three `Evaluator` entries whose bodies are
  // empty in this snippet, followed by a model judge that scores the
  // `relevance` and `usefulness` dimensions.
  // NOTE(review): the empty closures look like placeholders for evaluators
  // shown in other snippets — confirm.
  var evaluators: Evaluators {

      Evaluator {  }  

      Evaluator {  }

      Evaluator {  }
  
      // Model judge scoring each listed dimension.
      ModelJudgeEvaluator(
          judge: PrivateCloudComputeLanguageModel(),
          dimensions: [relevance, usefulness]
      )
  }

23:17 - Add app context with a ModelJudgePrompt

// BookTaggingEvaluation.swift
  // Model judge over the `relevance` and `usefulness` dimensions, with a custom
  // prompt that describes the app and how each sample is presented.
  ModelJudgeEvaluator(
      judge: PrivateCloudComputeLanguageModel(),
      dimensions: [relevance, usefulness],
      prompt: ModelJudgePrompt(
          instructions: """
              You are evaluating tags generated for a personal book-tracking app where users
              organize their library by browsing and filtering tags.
              """,
          // Text for the value under evaluation: the tag count followed by the tags.
          evaluationTarget: { generated in
              "\(generated.tags.count) Generated tags: " + generated.tags.joined(separator: ", ")
          },
          // Reference entry: the sample's expected tags, or a fallback message
          // when the sample has no expected value.
          reference: { sample, _ in
              ["Expected Tags": sample.expected?.tags.joined(separator: ", ") ?? "No expected tags defined"]
          }
      )
  )

Explore Get Started

Stay Updated

Explore Platforms

Featured

Explore Technologies

Featured

Explore Community

Featured

Explore Documentation

Release Notes

Explore Downloads

Featured

Explore Support

Featured

Quick Links

Chapitres

Ressources