DockKit .track() has no effect using VNDetectFaceRectanglesRequest

Question

andreasteich OP

Created 3d

Replies 0

Boosts 0

Participants 1

Hi,

I'm testing DockKit with a very simple setup: I use VNDetectFaceRectanglesRequest to detect a face and then call dockAccessory.track(...) using the detected bounding box.

The stand is correctly docked (state == .docked) and dockAccessory is valid. I'm calling .track(...) with a single observation and valid CameraInformation (including size, device, orientation, etc.). No errors are thrown.

To monitor this, I added a logging utility – track(...) is being called 10–30 times per second, as recommended in the documentation.

However: the stand does not move at all. There is no visible reaction to the tracking calls.

Is there anything I'm missing or doing wrong? Is VNDetectFaceRectanglesRequest supported for DockKit tracking, or are there hidden requirements?

Would really appreciate any help or pointers – thanks!

That's my complete code:

 extension VideoFeedViewController: AVCaptureVideoDataOutputSampleBufferDelegate {
    func captureOutput(_ output: AVCaptureOutput, didOutput sampleBuffer: CMSampleBuffer, from connection: AVCaptureConnection) {
        guard let frame = CMSampleBufferGetImageBuffer(sampleBuffer) else {
            return
        }
        
        detectFace(image: frame)
        
        func detectFace(image: CVPixelBuffer) {
            let faceDetectionRequest = VNDetectFaceRectanglesRequest() { vnRequest, error in
                guard let results = vnRequest.results as? [VNFaceObservation] else {
                    return
                }
                
                guard let observation = results.first else {
                    return
                }
                    
                let boundingBoxHeight = observation.boundingBox.size.height * 100
                
                #if canImport(DockKit)
                if let dockAccessory = self.dockAccessory {
                    Task {
                        try? await trackRider(
                            observation.boundingBox,
                            dockAccessory,
                            frame,
                            sampleBuffer
                        )
                    }
                }
                #endif
            }
            
            let imageResultHandler = VNImageRequestHandler(cvPixelBuffer: image, orientation: .up)
            try? imageResultHandler.perform([faceDetectionRequest])
            
            func combineBoundingBoxes(_ box1: CGRect, _ box2: CGRect) -> CGRect {
                let minX = min(box1.minX, box2.minX)
                let minY = min(box1.minY, box2.minY)
                
                let maxX = max(box1.maxX, box2.maxX)
                let maxY = max(box1.maxY, box2.maxY)
                
                let combinedWidth = maxX - minX
                let combinedHeight = maxY - minY
                
                return CGRect(x: minX, y: minY, width: combinedWidth, height: combinedHeight)
            }
            
            #if canImport(DockKit)
            func trackObservation(_ boundingBox: CGRect, _ dockAccessory: DockAccessory, _ pixelBuffer: CVPixelBuffer, _ cmSampelBuffer: CMSampleBuffer) throws {
                // Zähle den Aufruf
                TrackMonitor.shared.trackCalled()
                
                let invertedBoundingBox = CGRect(
                    x: boundingBox.origin.x,
                    y: 1.0 - boundingBox.origin.y - boundingBox.height,
                    width: boundingBox.width,
                    height: boundingBox.height
                )
                
                guard let device = captureDevice else {
                    fatalError("Kamera nicht verfügbar")
                }
                
                let size = CGSize(width: Double(CVPixelBufferGetWidth(pixelBuffer)),
                                  height: Double(CVPixelBufferGetHeight(pixelBuffer)))
                
                var cameraIntrinsics: matrix_float3x3? = nil
                if let cameraIntrinsicsUnwrapped = CMGetAttachment(
                    sampleBuffer,
                    key: kCMSampleBufferAttachmentKey_CameraIntrinsicMatrix,
                    attachmentModeOut: nil
                ) as? Data {
                    cameraIntrinsics = cameraIntrinsicsUnwrapped.withUnsafeBytes { $0.load(as: matrix_float3x3.self) }
                }
                
                Task {
                    let orientation = getCameraOrientation()
                    
                    let cameraInfo = DockAccessory.CameraInformation(
                        captureDevice: device.deviceType,
                        cameraPosition: device.position,
                        orientation: orientation,
                        cameraIntrinsics: cameraIntrinsics,
                        referenceDimensions: size
                    )
                    
                    let observation = DockAccessory.Observation(
                        identifier: 0,
                        type: .object,
                        rect: invertedBoundingBox
                    )
                    
                    let observations = [observation]
                    
                    guard let image = CMSampleBufferGetImageBuffer(sampleBuffer) else {
                        print("no image")
                        return
                    }
                
                    do {
                        try await dockAccessory.track(observations, cameraInformation: cameraInfo)
                    } catch {
                        print(error)
                    }
                }
            }
            #endif
            
            func clearDrawings() {
                boundingBoxLayer?.removeFromSuperlayer()
                boundingBoxSizeLayer?.removeFromSuperlayer()
            }
        }
    }
}
 
@MainActor
private func getCameraOrientation() -> DockAccessory.CameraOrientation {
    switch UIDevice.current.orientation {
    case .portrait:
        return .portrait
    case .portraitUpsideDown:
        return .portraitUpsideDown
    case .landscapeRight:
        return .landscapeRight
    case .landscapeLeft:
        return .landscapeLeft
    case .faceDown:
        return .faceDown
    case .faceUp:
        return .faceUp
    default:
        return .corrected
    }
}

Boost

	extension VideoFeedViewController: AVCaptureVideoDataOutputSampleBufferDelegate {
	func captureOutput(_ output: AVCaptureOutput, didOutput sampleBuffer: CMSampleBuffer, from connection: AVCaptureConnection) {
	guard let frame = CMSampleBufferGetImageBuffer(sampleBuffer) else {
	return
	}

	detectFace(image: frame)

	func detectFace(image: CVPixelBuffer) {
	let faceDetectionRequest = VNDetectFaceRectanglesRequest() { vnRequest, error in
	guard let results = vnRequest.results as? [VNFaceObservation] else {
	return
	}

	guard let observation = results.first else {
	return
	}

	let boundingBoxHeight = observation.boundingBox.size.height * 100

	#if canImport(DockKit)
	if let dockAccessory = self.dockAccessory {
	Task {
	try? await trackRider(
	observation.boundingBox,
	dockAccessory,
	frame,
	sampleBuffer
	)
	}
	}
	#endif
	}

	let imageResultHandler = VNImageRequestHandler(cvPixelBuffer: image, orientation: .up)
	try? imageResultHandler.perform([faceDetectionRequest])

	func combineBoundingBoxes(_ box1: CGRect, _ box2: CGRect) -> CGRect {
	let minX = min(box1.minX, box2.minX)
	let minY = min(box1.minY, box2.minY)

	let maxX = max(box1.maxX, box2.maxX)
	let maxY = max(box1.maxY, box2.maxY)

	let combinedWidth = maxX - minX
	let combinedHeight = maxY - minY

	return CGRect(x: minX, y: minY, width: combinedWidth, height: combinedHeight)
	}

	#if canImport(DockKit)
	func trackObservation(_ boundingBox: CGRect, _ dockAccessory: DockAccessory, _ pixelBuffer: CVPixelBuffer, _ cmSampelBuffer: CMSampleBuffer) throws {
	// Zähle den Aufruf
	TrackMonitor.shared.trackCalled()

	let invertedBoundingBox = CGRect(
	x: boundingBox.origin.x,
	y: 1.0 - boundingBox.origin.y - boundingBox.height,
	width: boundingBox.width,
	height: boundingBox.height
	)

	guard let device = captureDevice else {
	fatalError("Kamera nicht verfügbar")
	}

	let size = CGSize(width: Double(CVPixelBufferGetWidth(pixelBuffer)),
	height: Double(CVPixelBufferGetHeight(pixelBuffer)))

	var cameraIntrinsics: matrix_float3x3? = nil
	if let cameraIntrinsicsUnwrapped = CMGetAttachment(
	sampleBuffer,
	key: kCMSampleBufferAttachmentKey_CameraIntrinsicMatrix,
	attachmentModeOut: nil
	) as? Data {
	cameraIntrinsics = cameraIntrinsicsUnwrapped.withUnsafeBytes { $0.load(as: matrix_float3x3.self) }
	}

	Task {
	let orientation = getCameraOrientation()

	let cameraInfo = DockAccessory.CameraInformation(
	captureDevice: device.deviceType,
	cameraPosition: device.position,
	orientation: orientation,
	cameraIntrinsics: cameraIntrinsics,
	referenceDimensions: size
	)

	let observation = DockAccessory.Observation(
	identifier: 0,
	type: .object,
	rect: invertedBoundingBox
	)

	let observations = [observation]

	guard let image = CMSampleBufferGetImageBuffer(sampleBuffer) else {
	print("no image")
	return
	}

	do {
	try await dockAccessory.track(observations, cameraInformation: cameraInfo)
	} catch {
	print(error)
	}
	}
	}
	#endif

	func clearDrawings() {
	boundingBoxLayer?.removeFromSuperlayer()
	boundingBoxSizeLayer?.removeFromSuperlayer()
	}
	}
	}
	}

	@MainActor
	private func getCameraOrientation() -> DockAccessory.CameraOrientation {
	switch UIDevice.current.orientation {
	case .portrait:
	return .portrait
	case .portraitUpsideDown:
	return .portraitUpsideDown
	case .landscapeRight:
	return .landscapeRight
	case .landscapeLeft:
	return .landscapeLeft
	case .faceDown:
	return .faceDown
	case .faceUp:
	return .faceUp
	default:
	return .corrected
	}
	}