Implement model recognition based on the iPhone 14 Max camera, with LiDAR calculating the width and height.

Based on the iPhone 14 Max camera, implement model recognition and draw a rectangular box around each recognized object. The width and height are calculated with LiDAR and displayed in centimeters on the continuously updated live image.

import UIKit
import AVFoundation
import CoreML
import Vision
import ARKit
import Photos
import CoreMotion

class DoorsByYolov8mlcore: UIViewController, ARSCNViewDelegate, AVCaptureVideoDataOutputSampleBufferDelegate {
    
    // MARK: - Properties
    var sceneView: ARSCNView!
    var captureSession: AVCaptureSession!
    var videoPreviewLayer: AVCaptureVideoPreviewLayer!
    var model: VNCoreMLModel?
    let iphoneStatus = GetIPhoneStatusClass()
    
    // MARK: - Lifecycle Methods
    override func viewDidLoad() {
        super.viewDidLoad()
        
        loadModel()
        
        if model == nil {
            print("WARNING: Model failed to load")
        }
        
        // The capture session is set up from checkPermissions() once camera access is granted
        checkPermissions()
        
        sceneView = ARSCNView(frame: self.view.frame)
        sceneView.delegate = self
        self.view.addSubview(sceneView)
        
        let configuration = ARWorldTrackingConfiguration()
        // LiDAR depth is only delivered when the sceneDepth frame semantic is enabled
        if ARWorldTrackingConfiguration.supportsFrameSemantics(.sceneDepth) {
            configuration.frameSemantics.insert(.sceneDepth)
        }
        if ARWorldTrackingConfiguration.supportsSceneReconstruction(.meshWithClassification) {
            configuration.sceneReconstruction = .meshWithClassification
        }
        sceneView.session.run(configuration)
    }
    
    override func viewWillDisappear(_ animated: Bool) {
        super.viewWillDisappear(animated)
        captureSession?.stopRunning()
        sceneView.session.pause()
    }
    
    deinit {
        NotificationCenter.default.removeObserver(self)
    }
    
    // MARK: - Setup Methods
    private func checkPermissions() {
        switch AVCaptureDevice.authorizationStatus(for: .video) {
        case .authorized:
            setupCamera()
        case .notDetermined:
            AVCaptureDevice.requestAccess(for: .video) { granted in
                if granted {
                    DispatchQueue.main.async {
                        self.setupCamera()
                    }
                }
            }
        default:
            showPermissionAlert()
        }
    }
    
    private func showPermissionAlert() {
        let alert = UIAlertController(
            title: "需要相机权限",
            message: "此应用需要访问您的相机来检测门。请在设置中允许相机访问权限。",
            preferredStyle: .alert
        )
        
        alert.addAction(UIAlertAction(
            title: "取消",
            style: .cancel
        ))
        
        alert.addAction(UIAlertAction(
            title: "前往设置",
            style: .default,
            handler: { _ in
                if let settingsURL = URL(string: UIApplication.openSettingsURLString) {
                    UIApplication.shared.open(settingsURL)
                }
            }
        ))
        
        DispatchQueue.main.async { [weak self] in
            self?.present(alert, animated: true)
        }
    }
    
    func setupCamera() {
        captureSession = AVCaptureSession()
        
        if captureSession.canSetSessionPreset(.hd1280x720) {
            captureSession.sessionPreset = .hd1280x720
        }
        
        guard let videoCaptureDevice = AVCaptureDevice.default(for: .video) else {
            print("No video device found")
            return
        }
        
        let videoDeviceInput: AVCaptureDeviceInput
        do {
            videoDeviceInput = try AVCaptureDeviceInput(device: videoCaptureDevice)
        } catch {
            print("Error accessing camera: \(error)")
            return
        }
        
        if captureSession.canAddInput(videoDeviceInput) {
            captureSession.addInput(videoDeviceInput)
        } else {
            print("Could not add video input")
            return
        }
        
        videoPreviewLayer = AVCaptureVideoPreviewLayer(session: captureSession)
        videoPreviewLayer.frame = view.bounds
        videoPreviewLayer.videoGravity = .resizeAspectFill
        view.layer.addSublayer(videoPreviewLayer)
        
        let videoDataOutput = AVCaptureVideoDataOutput()
        videoDataOutput.videoSettings = [
            kCVPixelBufferPixelFormatTypeKey as String: Int(kCVPixelFormatType_32BGRA)
        ]
        
        if captureSession.canAddOutput(videoDataOutput) {
            captureSession.addOutput(videoDataOutput)
            videoDataOutput.setSampleBufferDelegate(self, queue: DispatchQueue(label: "videoQueue"))
        } else {
            print("Could not add video output")
            return
        }
        
        DispatchQueue.global(qos: .background).async {
            self.captureSession.startRunning()
        }
    }
    
    func loadModel() {
        guard let modelURL = Bundle.main.url(forResource: "doorsjj", withExtension: "mlmodelc") else {
            print("Error: doors.mlmodelc not found in bundle")
            return
        }
        
        do {
            let coremlModel = try MLModel(contentsOf: modelURL)
            self.model = try VNCoreMLModel(for: coremlModel)
            
            print("Model loaded successfully")
            
        } catch {
            print("Error loading CoreML model: \(error.localizedDescription)")
        }
    }
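    
    // MARK: - AVCaptureVideoDataOutputSampleBufferDelegate
    // Receives frames from the video data output and forwards their pixel buffers to the detector
    func captureOutput(_ output: AVCaptureOutput, didOutput sampleBuffer: CMSampleBuffer, from connection: AVCaptureConnection) {
        guard let pixelBuffer = CMSampleBufferGetImageBuffer(sampleBuffer) else { return }
        processVideoFrame(pixelBuffer: pixelBuffer)
    }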
    
    func processVideoFrame(pixelBuffer: CVPixelBuffer) {
        let ciImage = CIImage(cvPixelBuffer: pixelBuffer)
        
        let targetSize = CGSize(width: 736, height: 1280)
        let extent = ciImage.extent
        let scaleX = targetSize.width / extent.width
        let scaleY = targetSize.height / extent.height
        let scale = min(scaleX, scaleY)
        
        let scaledImage = ciImage.transformed(by: CGAffineTransform(scaleX: scale, y: scale))
        
        let croppedImage = scaledImage.cropped(to: CGRect(x: 0, y: 0, width: targetSize.width, height: targetSize.height))
        let orientation = iphoneStatus.getCurrentDeviceOrientation()
        
        let rotatedImage: CIImage
        switch orientation {
        case .portrait:
            rotatedImage = croppedImage
        case .landscapeLeft:
            rotatedImage = croppedImage.transformed(by: CGAffineTransform(rotationAngle: -.pi / 2))
        case .landscapeRight:
            rotatedImage = croppedImage.transformed(by: CGAffineTransform(rotationAngle: .pi / 2))
        case .portraitUpsideDown:
            rotatedImage = croppedImage.transformed(by: CGAffineTransform(rotationAngle: .pi))
        default:
            rotatedImage = croppedImage.transformed(by: CGAffineTransform(rotationAngle: .pi / 2))
        }
        
        recognizeImage(image: rotatedImage)
    }
    
    func recognizeImage(image: CIImage) {
        guard let model = self.model else {
            print("Model not loaded")
            return
        }
        
        let request = VNCoreMLRequest(model: model) { [weak self] request, error in
            guard let self = self else { return }
            
            if let error = error {
                print("Detection error: \(error)")
                return
            }
            
            guard let results = request.results as? [VNRecognizedObjectObservation] else {
                print("No results or invalid type")
                return
            }
            
            if results.isEmpty {
                print("No objects detected in this frame")
            } else {
                // Keep only the observations whose top label is "door"
                let doorObservations = results.filter { $0.labels.first?.identifier == "door" }
                DispatchQueue.main.async {
                    self.drawDetections(doorObservations)
                }
            }
        }
        
        request.imageCropAndScaleOption = .scaleFit
        
        do {
            let handler = VNImageRequestHandler(ciImage: image, orientation: .up, options: [:])
            try handler.perform([request])
        } catch {
            print("Failed to perform detection: \(error)")
        }
    }
    
    func drawDetections(_ observations: [VNRecognizedObjectObservation]) {
        // Clear the boxes and dimension labels drawn for the previous frame
        view.layer.sublayers?.removeAll(where: { $0.name == "detectionLayer" })
        view.subviews.filter { $0 is UILabel && $0.tag == 999 }.forEach { $0.removeFromSuperview() }
        
        guard let frame = sceneView.session.currentFrame else { return }
        
        for observation in observations {
            let boundingBox = observation.boundingBox
            let viewBounds = view.bounds
            // Vision's normalized rect has its origin at the bottom-left; flip Y for UIKit
            let imageRect = VNImageRectForNormalizedRect(
                boundingBox,
                Int(viewBounds.width),
                Int(viewBounds.height)
            )
            let boxRect = CGRect(
                x: imageRect.minX,
                y: viewBounds.height - imageRect.maxY,
                width: imageRect.width,
                height: imageRect.height
            )
            
            let boxLayer = CALayer()
            boxLayer.name = "detectionLayer"
            boxLayer.frame = boxRect
            boxLayer.borderColor = UIColor.green.cgColor
            boxLayer.borderWidth = 2
            boxLayer.cornerRadius = 4
            
            view.layer.addSublayer(boxLayer)
            
            let dimensions = calculateDimensions(at: boundingBox, frame: frame)
            
            let label = UILabel(frame: CGRect(x: boxRect.minX, y: boxRect.minY - 20, width: 160, height: 20))
            label.tag = 999
            label.text = String(format: "W: %.2fcm, H: %.2fcm", dimensions.width, dimensions.height)
            label.textColor = .white
            label.backgroundColor = .black.withAlphaComponent(0.5)
            label.font = UIFont.systemFont(ofSize: 12)
            view.addSubview(label)
        }
    }
    
    
    func calculateDimensions(at boundingBox: CGRect, frame: ARFrame) -> (width: Float, height: Float) {
        // boundingBox is in Vision's normalized coordinates; sceneDepth is only present
        // when the session runs with the .sceneDepth frame semantic on a LiDAR device
        if let depthData = frame.sceneDepth {
            let depthMap = depthData.depthMap
            let topLeft = CGPoint(x: boundingBox.minX, y: boundingBox.minY)
            let topRight = CGPoint(x: boundingBox.maxX, y: boundingBox.minY)
            let bottomLeft = CGPoint(x: boundingBox.minX, y: boundingBox.maxY)
            
            // Unproject three corners of the box into world space and measure the edges
            let topLeftPosition = unprojectPoint(topLeft, depthMap: depthMap, frame: frame)
            let topRightPosition = unprojectPoint(topRight, depthMap: depthMap, frame: frame)
            let bottomLeftPosition = unprojectPoint(bottomLeft, depthMap: depthMap, frame: frame)
            
            let width = distanceBetweenPoints(topLeftPosition, topRightPosition) * 100   // metres -> cm
            let height = distanceBetweenPoints(topLeftPosition, bottomLeftPosition) * 100 // metres -> cm
            
            return (width, height)
        }
        return (0, 0)
    }

    func unprojectPoint(_ point: CGPoint, depthMap: CVPixelBuffer, frame: ARFrame) -> simd_float4 {
        // Map the normalized Vision coordinate (origin at bottom-left) onto depth-map pixels;
        // this assumes the depth map shares the camera image's orientation and aspect ratio
        let mapWidth = CGFloat(CVPixelBufferGetWidth(depthMap))
        let mapHeight = CGFloat(CVPixelBufferGetHeight(depthMap))
        let pixel = CGPoint(x: point.x * mapWidth, y: (1 - point.y) * mapHeight)
        
        // LiDAR depth (in metres) at that pixel
        let depth = getDepthValue(at: pixel, from: depthMap)
        
        // Back-project the pixel into camera space using the intrinsics, scaled to the depth-map resolution
        let K = frame.camera.intrinsics
        let scale = Float(mapWidth) / Float(frame.camera.imageResolution.width)
        let x = (Float(pixel.x) - K[2][0] * scale) * depth / (K[0][0] * scale)
        let y = (Float(pixel.y) - K[2][1] * scale) * depth / (K[1][1] * scale)
        
        // ARKit's camera space looks down -Z with +Y up; convert to world coordinates
        return frame.camera.transform * simd_float4(x, -y, -depth, 1)
    }

    func getDepthValue(at point: CGPoint, from depthMap: CVPixelBuffer) -> Float {
        // Depth map dimensions (much smaller than the camera image, typically 256 x 192)
        let width = CVPixelBufferGetWidth(depthMap)
        let height = CVPixelBufferGetHeight(depthMap)
        
        // Make sure the point lies inside the depth map
        guard point.x >= 0 && point.x < CGFloat(width) && point.y >= 0 && point.y < CGFloat(height) else {
            return 0.0
        }
        
        // The buffer must be locked before its base address is read
        CVPixelBufferLockBaseAddress(depthMap, .readOnly)
        defer { CVPixelBufferUnlockBaseAddress(depthMap, .readOnly) }
        
        guard let baseAddress = CVPixelBufferGetBaseAddress(depthMap) else { return 0.0 }
        let bytesPerRow = CVPixelBufferGetBytesPerRow(depthMap)
        
        // sceneDepth.depthMap holds one Float32 (metres) per pixel
        let rowPointer = baseAddress.advanced(by: Int(point.y) * bytesPerRow)
        return rowPointer.assumingMemoryBound(to: Float32.self)[Int(point.x)]
    }
    
    func distanceBetweenPoints(_ p1: simd_float4, _ p2: simd_float4) -> Float {
        let dx = p1.x - p2.x
        let dy = p1.y - p2.y
        let dz = p1.z - p2.z
        return sqrt(dx * dx + dy * dy + dz * dz)
    }
}

Answered by DTS Engineer in 823622022

We recommend using ARKit's object scanning API for what you seek.

See Scanning and Detecting 3D Objects.
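
For reference, here is a minimal sketch of that approach, assuming a door has already been scanned into an AR resource group in the asset catalog (the group name "ScannedDoors" is only a placeholder). Once ARKit recognizes a scanned ARReferenceObject, the anchor's referenceObject.extent already gives the physical size in metres, so no manual depth sampling is needed.

import UIKit
import ARKit

final class ObjectDetectionViewController: UIViewController, ARSCNViewDelegate {
    let sceneView = ARSCNView()

    override func viewDidLoad() {
        super.viewDidLoad()
        sceneView.frame = view.bounds
        sceneView.delegate = self
        view.addSubview(sceneView)

        // Reference objects produced by an earlier ARObjectScanningConfiguration session,
        // stored in an AR resource group ("ScannedDoors" is a placeholder name)
        guard let referenceObjects = ARReferenceObject.referenceObjects(inGroupNamed: "ScannedDoors", bundle: nil) else {
            print("Missing AR resource group")
            return
        }

        let configuration = ARWorldTrackingConfiguration()
        configuration.detectionObjects = referenceObjects
        sceneView.session.run(configuration)
    }

    // Called when ARKit recognizes one of the scanned objects in the scene
    func renderer(_ renderer: SCNSceneRenderer, didAdd node: SCNNode, for anchor: ARAnchor) {
        guard let objectAnchor = anchor as? ARObjectAnchor else { return }
        // extent is the detected object's bounding-box size in metres; convert to centimetres
        let extent = objectAnchor.referenceObject.extent
        print(String(format: "Detected object - W: %.1f cm, H: %.1f cm", extent.x * 100, extent.y * 100))
    }
}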
