I would say RayXC's theory that the problem is related to backpropagation is correct, based on the call stack from my crash:
Thread 25 Crashed:: Dispatch queue: metal gpu stream
4 Metal 0x1a8ca67a4 MTLReportFailure.cold.1 + 48
5 Metal 0x1a8c83348 MTLReportFailure + 464
19 MetalPerformanceShadersGraph 0x2081ab750 0x2080d5000 + 878416
20 MetalPerformanceShadersGraph 0x2081aac40 0x2080d5000 + 875584
21 libtorch_cpu.dylib 0x17304f6f0 invocation function for block in at::mps::MPSStream::executeMPSGraph(MPSGraph*, NSDictionary*, NSDictionary*, at::mps::SyncType) + 100
22 libdispatch.dylib 0x19f56c400 _dispatch_client_callout + 20
23 libdispatch.dylib 0x19f57b97c _dispatch_lane_barrier_sync_invoke_and_complete + 56
24 libtorch_cpu.dylib 0x17304f680 at::mps::MPSStream::executeMPSGraph(MPSGraph*, NSDictionary*, NSDictionary*, at::mps::SyncType) + 84
25 libtorch_cpu.dylib 0x17307ec34 at::native::mps_convolution_backward_weights(c10::ArrayRef<long long>, at::Tensor const&, at::Tensor const&, c10::ArrayRef<long long>, c10::ArrayRef<long long>, c10::ArrayRef<long long>, long long, bool) + 2752
26 libtorch_cpu.dylib 0x17307fd14 at::native::mps_convolution_backward(at::Tensor const&, at::Tensor const&, at::Tensor const&, c10::ArrayRef<long long>, c10::ArrayRef<long long>, c10::ArrayRef<long long>, long long, std::__1::array<bool, 3ul>) + 348
27 libtorch_cpu.dylib 0x16f9c7bd0 at::_ops::mps_convolution_backward::call(at::Tensor const&, at::Tensor const&, at::Tensor const&, c10::ArrayRef<long long>, c10::ArrayRef<long long>, c10::ArrayRef<long long>, long long, std::__1::array<bool, 3ul>) + 356
28 libtorch_cpu.dylib 0x16edd1ebc at::native::convolution_backward(at::Tensor const&, at::Tensor const&, at::Tensor const&, c10::OptionalArrayRef<long long>, c10::ArrayRef<long long>, c10::ArrayRef<long long>, c10::ArrayRef<long long>, bool, c10::ArrayRef<long long>, long long, std::__1::array<bool, 3ul>) + 5748
29 libtorch_cpu.dylib 0x16fc6853c at::_ops::convolution_backward::redispatch(c10::DispatchKeySet, at::Tensor const&, at::Tensor const&, at::Tensor const&, c10::OptionalArrayRef<c10::SymInt>, c10::ArrayRef<long long>, c10::ArrayRef<c10::SymInt>, c10::ArrayRef<long long>, bool, c10::ArrayRef<c10::SymInt>, long long, std::__1::array<bool, 3ul>) + 196
30 libtorch_cpu.dylib 0x17198445c torch::autograd::VariableType::(anonymous namespace)::convolution_backward(c10::DispatchKeySet, at::Tensor const&, at::Tensor const&, at::Tensor const&, c10::OptionalArrayRef<c10::SymInt>, c10::ArrayRef<long long>, c10::ArrayRef<c10::SymInt>, c10::ArrayRef<long long>, bool, c10::ArrayRef<c10::SymInt>, long long, std::__1::array<bool, 3ul>) + 3192
31 libtorch_cpu.dylib 0x16fc67fa4 at::_ops::convolution_backward::call(at::Tensor const&, at::Tensor const&, at::Tensor const&, c10::OptionalArrayRef<c10::SymInt>, c10::ArrayRef<long long>, c10::ArrayRef<c10::SymInt>, c10::ArrayRef<long long>, bool, c10::ArrayRef<c10::SymInt>, long long, std::__1::array<bool, 3ul>) + 440
32 libtorch_cpu.dylib 0x17130735c torch::autograd::generated::ConvolutionBackward0::apply(std::__1::vector<at::Tensor, std::__1::allocator<at::Tensor>>&&) + 508
33 libtorch_cpu.dylib 0x17229a62c torch::autograd::Node::operator()(std::__1::vector<at::Tensor, std::__1::allocator<at::Tensor>>&&) + 120
34 libtorch_cpu.dylib 0x172291418 torch::autograd::Engine::evaluate_function(std::__1::shared_ptr<torch::autograd::GraphTask>&, torch::autograd::Node*, torch::autograd::InputBuffer&, std::__1::shared_ptr<torch::autograd::ReadyQueue> const&) + 2932
35 libtorch_cpu.dylib 0x1722902bc torch::autograd::Engine::thread_main(std::__1::shared_ptr<torch::autograd::GraphTask> const&) + 640
36 libtorch_cpu.dylib 0x17228efa0 torch::autograd::Engine::thread_init(int, std::__1::shared_ptr<torch::autograd::ReadyQueue> const&, bool) + 336
37 libtorch_python.dylib 0x112b29120 torch::autograd::python::PythonEngine::thread_init(int, std::__1::shared_ptr<torch::autograd::ReadyQueue> const&, bool) + 112
38 libtorch_cpu.dylib 0x17229d78c void* std::__1::__thread_proxy<std::__1::tuple<std::__1::unique_ptr<std::__1::__thread_struct, std::__1::default_delete<std::__1::__thread_struct>>, void (torch::autograd::Engine::*)(int, std::__1::shared_ptr<torch::autograd::ReadyQueue> const&, bool), torch::autograd::Engine*, signed char, std::__1::shared_ptr<torch::autograd::ReadyQueue>, bool>>(void*) + 76
39 libsystem_pthread.dylib 0x19f71bfa8 _pthread_start + 148
40 libsystem_pthread.dylib 0x19f716da0 thread_start + 8