Can string ranges obtained from an NLTokenizer be used outside of the enumerateTokens loop?

With the following function I sometimes get "Thread 1: Fatal error: String index is out of bounds" errors at the substring.shuffled() call.

 func transform(_ text: String) -> String {
        var transformed = text
        if #available(macOS 10.14, iOS 12.0, *) {
            print("text: \"\(text)\"")
            var wordRanges: [Range<String.Index>] = []
            let tokenizer = NLTokenizer(unit: .word)
            tokenizer.string = transformed
            tokenizer.enumerateTokens(in: transformed.startIndex..<transformed.endIndex) { range, _ in
                if transformed[range].count > 3 {
                    let subrange = transformed.index(range.lowerBound, offsetBy: 1)..<transformed.index(range.upperBound, offsetBy: -1)
                    wordRanges.append(subrange)
                }
                return true
            }
            wordRanges.reversed().forEach { range in
                let substring = String(transformed[range])
                print("substring: \"\(substring)\"")
                transformed.replaceSubrange(range, with: substring.shuffled())
            }
        } else {
            let range = NSRange(transformed.startIndex..., in: transformed)
            let regex = try! NSRegularExpression(pattern: "\\W?\\w(\\w+)\\w\\W?", options: [])
            let matches = regex.matches(in: transformed, options: [], range: range)
            matches.reversed().forEach { match in
                guard let subrange = Range(match.range(at: 1), in: transformed) else {
                    return
                }
                let substring = transformed[subrange]
                transformed.replaceSubrange(subrange, with: substring.shuffled())
            }
        }
        return transformed
    }

The error appears in a macOS app with some strings but not all. I can't reproduce the error in a playground with the string that triggered the error.

When the error occurs, the printed substrings are not correct.

Using Xcode 13.3.

Replies

With a minimal macOS SwiftUI app whose ContentView.swift contains the following code, I can reproduce the crash with specific strings.

//
//  ContentView.swift
//  NLTokenizerTest
//
//  Created by repoleved on 24/03/2022.
//

import SwiftUI
import NaturalLanguage

/// A two-pane form: an editable "Input" text editor and a read-only
/// "Scrambled" editor showing the result of `scramble(_:)` for the input.
struct ContentView: View {
    @FocusState private var textIsFocused
    @State private var text = ""
    @State private var transformed = ""

    var body: some View {
        Form {
            Section(header: Text("Input")) {
                TextEditor(text: $text)
                    .font(.body)
                    .multilineTextAlignment(.leading)
                    .disableAutocorrection(true)
                    .frame(minHeight: 100)
                    // Use the value handed to the closure rather than reading `text`
                    // from inside it, so the scrambled text always reflects the new input.
                    .onChange(of: text, perform: { newText in
                        transformed = scramble(newText)
                    })
                    .focused($textIsFocused)
            }
            Section(header: Text("Scrambled")) {
                TextEditor(text: $transformed)
                    .disabled(true)
                    .font(.body)
                    .multilineTextAlignment(.leading)
                    .frame(maxWidth: .infinity, minHeight: 100)
                    .allowsHitTesting(false)
            }
        }
        .frame(minWidth: 400)
        .padding()
        .onAppear(perform: { textIsFocused = true })
    }

    /// Returns `text` with the interior characters of every word longer than
    /// three characters shuffled; the first and last character of each word
    /// and everything between words are left untouched.
    ///
    /// - Parameter text: The string to scramble.
    /// - Returns: The scrambled copy of `text`.
    func scramble(_ text: String) -> String {
        print("text: \"\(text)\"")
        var wordRanges: [Range<String.Index>] = []
        let tokenizer = NLTokenizer(unit: .word)
        tokenizer.string = text
        tokenizer.enumerateTokens(in: text.startIndex..<text.endIndex) { range, _ in
            if text[range].count > 3 {
                // Drop the first and last character so only the interior is shuffled.
                let subrange = text.index(after: range.lowerBound)..<text.index(before: range.upperBound)
                wordRanges.append(subrange)
            }
            return true
        }

        // The collected ranges are only ever applied to the unmodified `text`.
        // The previous version called `replaceSubrange` on a mutable copy and then
        // kept using the stored ranges on it, but mutating a String invalidates its
        // existing indices. That is the likely cause of the intermittent
        // "String index is out of bounds" crash — TODO confirm with the failing input.
        var result = ""
        var cursor = text.startIndex
        for range in wordRanges {
            // Assumes tokens are reported in ascending order; skip anything that
            // would move the cursor backwards instead of trapping.
            guard cursor <= range.lowerBound else { continue }
            let substring = String(text[range])
            print("substring: \"\(substring)\"")
            result.append(contentsOf: text[cursor..<range.lowerBound])
            result.append(contentsOf: substring.shuffled())
            cursor = range.upperBound
        }
        result.append(contentsOf: text[cursor...])
        return result
    }

}

/// Supplies a `ContentView` instance as the preview content.
struct ContentView_Previews: PreviewProvider {
    static var previews: some View { ContentView() }
}