-
Optimisez vos opérations d’apprentissage automatique personnalisées avec les tenseurs Metal
Libérez toute la puissance de l'apprentissage automatique avec l'API Metal Tensor et la bibliothèque Tensor Ops de Metal Performance Primitives (MPP). Découvrez comment créer des opérations portables qui exploitent les accélérateurs neuronaux intégrés aux GPU des puces Apple M5 et A19. Apprenez à développer des noyaux d'apprentissage automatique personnalisés pour vos applications Core AI, et découvrez comment optimiser l'utilisation de formats de données quantifiés et de la mémoire GPU.
Chapitres
- 0:00 - Introduction
- 0:21 - La pile logicielle d’apprentissage automatique d’Apple
- 2:25 - Gestion des données quantifiées
- 4:23 - Tenseurs multiplans
- 5:17 - Multiplication matricielle quantifiée
- 9:31 - Création d’opérations avancées
- 13:35 - Intégration d’opérations personnalisées dans Core AI
- 15:25 - Étapes suivantes
Ressources
- Running inline ML operations in a shader with Metal 4
- Machine learning passes
- Download the Metal Performance Primitives (MPP) Programming Guide
- Metal Performance Shaders
Vidéos connexes
Tech Talks
WWDC25
-
Rechercher dans cette vidéo…
-
-
3:53 - Create a quantized MTLTensor
// Creating a tensor with a quantized data type from device #define RANK 2 MTLTensorDescriptor *tensorDesc = [MTLTensorDescriptor new]; tensorDesc.dataType = MTLTensorDataTypeMetalFloat8E4M3; tensorDesc.usage = MTLTensorUsageCompute; NSInteger dimensions[RANK] = {NumCols, NumRows}; tensorDesc.dimensions = [[MTLTensorExtents alloc] initWithRank:RANK values:dimensions]; NSError *err = nil; id <MTLTensor> tensor = [device newTensorWithDescriptor:tensorDesc error:&err]; -
4:48 - Declare a multi-plane tensor with scale factors
// Creating a tensor with a scales auxiliary plane from device #define RANK 2 MTLTensorAuxiliaryPlaneDescriptor *planeDesc = [MTLTensorAuxiliaryPlaneDescriptor new]; planeDesc.dataType = MTLTensorDataTypeMetalFloat8UE8M0; NSInteger blockFactors[RANK] = {32, 1}; planeDesc.blockFactors = [[MTLTensorExtents alloc] initWithRank:RANK values:blockFactors]; MTLTensorAuxiliaryPlaneDescriptorMap *auxiliaryPlanes = [MTLTensorAuxiliaryPlaneDescriptorMap new]; [auxiliaryPlanes setDescriptor:planeDesc forPlane:MTLTensorPlaneTypeScales]; MTLTensorDescriptor *tensorDesc = [MTLTensorDescriptor new]; tensorDesc.dataType = MTLTensorDataTypeMetalFloat8E4M3; tensorDesc.usage = MTLTensorUsageCompute; NSInteger dimensions[RANK] = {NumCols, NumRows}; tensorDesc.dimensions = [[MTLTensorExtents alloc] initWithRank:RANK values:dimensions]; tensorDesc.auxiliaryPlanes = auxiliaryPlanes; NSError *err = nil; id <MTLTensor> tensor = [device newTensorWithDescriptor:tensorDesc error:&err]; -
6:07 - MSL type aliases for an MXFP8 tensor handle
// Type aliases for a MXFP8 multi-plane tensor handle #include <metal_tensor> using namespace metal; using scales_plane = tensor_blockwise<tensor_plane_scales, device metal_fp8_ue8m0_format, 32, 1>; using mxfp8_tensor = tensor<device metal_fp8_e4m3_format, dextents<int, 2>, tensor_handle, scales_plane>; kernel void matmul(mxfp8_tensor matrixA [[buffer(0)]], mxfp8_tensor matrixB [[buffer(1)]], tensor<device half, dextents<int, 2>> matrixC [[buffer(2)]]) { // ... } -
6:51 - Declare an inline MXFP8 tensor on the stack
// Type aliases for a MXFP8 multi-plane tensor inline #include <metal_tensor> using namespace metal; using scales_plane = tensor_blockwise<tensor_plane_scales, device metal_fp8_ue8m0_format, 32, 1>; using mxfp8_tensor_inline = tensor<device metal_fp8_e4m3_format, dextents<int, 2>, tensor_inline, scales_plane>; // Construct tensor on the stack from buffer pointers mxfp8_tensor_inline matrixA(dataBufferA, dextents<int, 2>(K, M), array<int, 2>({ 1, K }), scales_plane(scalesBufferA)); -
7:19 - Slice tensors and run a quantized matmul
// Slice the tensors to extract the relevant tile auto tA = matrixA.slice(0, tgid.y * TILEM); auto tB = matrixB.slice(tgid.x * TILEN, 0); auto tC = matrixC.slice(tgid.x * TILEN, tgid.y * TILEM); // Set up the matmul descriptor constexpr auto descriptor = matmul2d_descriptor(TILEM, // M TILEN, // N dynamic_length_v<int>, // K false, // Left matrix transposed false); // Right matrix transposed matmul2d<descriptor, execution_simdgroups<4>> op; // Run the op — TensorOps handles dequantization automatically op.run(tA, tB, tC); -
10:27 - Set up simdgroup-scoped QxK multiplication
// Setup QxK matrix multiplication op constexpr auto mul_qk_op_desc = matmul2d_descriptor(/* ... */); matmul2d<mul_qk_op_desc, execution_simdgroups> mul_qk_op; // Slice Q, K, V auto tQSlice = tQ.slice<D, ROWS_PER_SIMD>(0, sgid * ROWS_PER_SIMD); auto tKSlice = tK.slice<D, BK>(0, k); auto tVSlice = tV.slice<D, BK>(0, k); // Create cooperative tensor to store tile of QxK auto ctQK = mul_qk_op.get_destination_cooperative_tensor<decltype(tQSlice), decltype(tKSlice), float>(); // Multiply QxK mul_qk_op.run(tQSlice, tKSlice, ctQK); -
11:18 - Compute row-wise reduction for SoftMax
// Create a cooperative tensor to store row reduction output auto ctTileRowMax = mul_qk_op.get_row_reduction_destination_cooperative_tensor< decltype(tQSlice), decltype(tKSlice), float>(); // Compute max over each row of QxK tile reduce_rows(ctQK, ctTileRowMax, reduction_operation::max, -INFINITY); -
11:56 - Compute element-wise SoftMax with map_iterator
// Iterate over elements of QxK tile #pragma clang loop unroll(full) for (auto it = ctQK.begin(); it != ctQK.end(); it++) { // Fetch row max corresponding to this element auto row_it = ctRowMax.map_iterator(it); // Subtract row max from each element and compute exponent *it = exp(*it - *row_it); } -
12:33 - Reuse cooperative tensor as matmul input
constexpr auto mul_sv_op_desc = matmul2d_descriptor(/* ... */); matmul2d<mul_sv_op_desc, metal::execution_simdgroup> mul_sv_op; if (mul_sv_op.is_compatible_as_left_input<float, half, float>(ctQK)) { // Directly reuse cooperative tensor as input auto ctQKIn = mul_sv_op.get_left_input_cooperative_tensor<float, half, float>(ctQK); mul_sv_op.run(ctQKIn, tVSlice, ctO); } else { // Store and reload through threadgroup memory if layout is not compatible ctQK.store(tgTensor); simdgroup_barrier(mem_flags::mem_threadgroup); auto ctQKIn = mul_sv_op.get_left_input_cooperative_tensor<float, half, float>(); ctQKIn.load(tgTensor); mul_sv_op.run(ctQKIn, tVSlice, ctO); }
-