I think you were tired : )
Since you are comparing this:
func vectorCalc(t: Float, pts: float4) -> Float { ... }
to this:
func naiveCalc(t: Float, p: [Float]) -> Float { ... }
What I mean is: In naiveCalc, the type of parameter p is Array<Float>, which is very different from e.g. the MyFloat4 struct above.
Your test uses Array<Float> instead of MyFloat4 and MyFloat4x4 throughout. An Array's only stored property is a reference to its storage, e.g.
sizeof(Array<Whatever>) is always 8 (at least on 64-bit platforms), since it's just a reference and therefore doesn't depend on the number of elements,
while
sizeof(float4) == 4 * 4 == 16 == sizeof(MyFloat4).
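To see it in code, here's a quick check (just a sketch; sizeof(...) is the spelling used in this thread, later Swift spells it MemoryLayout<T>.size, and the MyFloat4 here is only a minimal stand-in for the full struct further down):
import simd
// Minimal stand-in for the MyFloat4 struct defined further down:
struct MyFloat4 { var x, y, z, w: Float }
print(sizeof(Array<Float>.self)) // 8 on 64-bit platforms: just a reference to the elements
print(sizeof(float4.self))       // 16: four Floats stored inline
print(sizeof(MyFloat4.self))     // 16: same inline layout as float4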
Using Array<Float> instead of structs like MyFloat4 and MyFloat4x4 has known performance overhead (which is surprisingly low, but still).
In short, I think that although using arrays instead of rolling your own structs (corresponding to the simd types) could be called a naive version, it's too naive, and it's not what I meant in the original post.
So I edited your example to use MyFloat4 and MyFloat4x4 instead of [Float]:
import simd
import QuartzCore
let numLoops = 1_000_000 // Added a zero here compared to yours (and changed from maxLoops to numLoops).
// This is what I want to compare to simd's float4:
struct MyFloat4 {
    var x: Float
    var y: Float
    var z: Float
    var w: Float
    init(_ x: Float, _ y: Float, _ z: Float, _ w: Float) { (self.x, self.y, self.z, self.w) = (x, y, z, w) }
    init(_ elements: [Float]) {
        guard elements.count == 4 else { fatalError("Illegal argument, must be exactly 4 elements") }
        (self.x, self.y, self.z, self.w) = (elements[0], elements[1], elements[2], elements[3])
    }
}
// This is what I want to compare to simd's float4x4:
struct MyFloat4x4 {
    var row0: MyFloat4
    var row1: MyFloat4
    var row2: MyFloat4
    var row3: MyFloat4
    init(rows: [MyFloat4]) {
        guard rows.count == 4 else { fatalError("Illegal argument, must be exactly 4 rows") }
        (self.row0, self.row1, self.row2, self.row3) = (rows[0], rows[1], rows[2], rows[3])
    }
}
// Scalar-Vector product:
func *(lhs: Float, rhs: MyFloat4) -> MyFloat4 { return MyFloat4(lhs * rhs.x, lhs * rhs.y, lhs * rhs.z, lhs * rhs.w) }
// Elementwise product of `lhs` and `rhs`. A.k.a. the Hadamard or Schur product of the two vectors:
func *(lhs: MyFloat4, rhs: MyFloat4) -> MyFloat4 {
    return MyFloat4(
        lhs.x * rhs.x,
        lhs.y * rhs.y,
        lhs.z * rhs.z,
        lhs.w * rhs.w
    )
}
// Dot product:
func myDot(a: MyFloat4, _ b: MyFloat4) -> Float { return a.x * b.x + a.y * b.y + a.z * b.z + a.w * b.w }
// Vector-Matrix multiplication:
func *(lhs: MyFloat4, rhs: MyFloat4x4) -> MyFloat4 {
    return MyFloat4(
        myDot(lhs, MyFloat4(rhs.row0.x, rhs.row1.x, rhs.row2.x, rhs.row3.x)),
        myDot(lhs, MyFloat4(rhs.row0.y, rhs.row1.y, rhs.row2.y, rhs.row3.y)),
        myDot(lhs, MyFloat4(rhs.row0.z, rhs.row1.z, rhs.row2.z, rhs.row3.z)),
        myDot(lhs, MyFloat4(rhs.row0.w, rhs.row1.w, rhs.row2.w, rhs.row3.w))
    )
}
// Test functions:
let params = [float4(-1, 3, -3, 1), float4(2, -5, 4, -1), float4(-1, 0, 1, 0), float4(0, 2, 0, 0)]
let cm = float4x4(rows: params)
func vectorCalc(t: Float, pts: float4) -> Float {
    let t2 = t * t
    let t3 = t2 * t
    let ts = float4(t3, t2, t, 1)
    let res = 0.5 * ts * cm * pts
    return res.w + res.x + res.y + res.z
}
let myParams = [MyFloat4(-1, 3, -3, 1), MyFloat4(2, -5, 4, -1), MyFloat4(-1, 0, 1, 0), MyFloat4(0, 2, 0, 0)]
let myCm = MyFloat4x4(rows: myParams)
func naiveCalc(t: Float, pts: MyFloat4) -> Float {
    let t2 = t * t
    let t3 = t2 * t
    let ts = MyFloat4(t3, t2, t, 1)
    let res = 0.5 * ts * myCm * pts
    return res.w + res.x + res.y + res.z
}
// Test data:
var simdPointsArray = [float4]()
simdPointsArray.reserveCapacity(numLoops)
var myPointsArray = [MyFloat4]()
myPointsArray.reserveCapacity(numLoops)
for _ in 1...numLoops {
    let testPoints = [
        Float(arc4random_uniform(30)),
        Float(arc4random_uniform(30)),
        Float(arc4random_uniform(30)),
        Float(arc4random_uniform(30)),
    ].sort()
    simdPointsArray.append(float4(testPoints))
    myPointsArray.append(MyFloat4(testPoints))
}
// Results:
var simdRes = Float(0)
var myRes = Float(0)
// Testing:
for _ in 0 ..< 10 {
    let t1 = CACurrentMediaTime()
    for simdVector in simdPointsArray { simdRes += vectorCalc(0, pts: simdVector) }
    let t2 = CACurrentMediaTime()
    for myVector in myPointsArray { myRes += naiveCalc(0, pts: myVector) }
    let t3 = CACurrentMediaTime()
    print(String(format: "SIMD: %f (%e)", arguments: [t2 - t1, simdRes]))
    print(String(format: "Naive: %f (%e)", arguments: [t3 - t2, myRes]))
    print("")
}
And here are the results, showing that the two versions are ~equally fast:
SIMD: 0.001965 (1.150876e+07)
Naive: 0.002088 (1.150876e+07)
SIMD: 0.001829 (2.301623e+07)
Naive: 0.001367 (2.301623e+07)
SIMD: 0.001587 (3.452249e+07)
Naive: 0.001360 (3.452249e+07)
SIMD: 0.001685 (4.602844e+07)
Naive: 0.001359 (4.602844e+07)
SIMD: 0.001498 (5.753438e+07)
Naive: 0.001361 (5.753438e+07)
SIMD: 0.001595 (6.903629e+07)
Naive: 0.001403 (6.903629e+07)
SIMD: 0.001614 (8.051771e+07)
Naive: 0.001399 (8.051771e+07)
SIMD: 0.001654 (9.199914e+07)
Naive: 0.001447 (9.199914e+07)
SIMD: 0.001642 (1.034806e+08)
Naive: 0.001430 (1.034806e+08)
SIMD: 0.001566 (1.149620e+08)
Naive: 0.001446 (1.149620e+08)
Even though I added a zero to numLoops, the times are still pretty low, but increasing numLoops further didn't change the fact that the naive version is as fast as the SIMD one. So I guess the optimizer has no problem with this example either: it emits the SIMD instructions anyway, and the "only" benefit of using the simd types rather than rolling your own is just that you don't have to roll your own.
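(For reference, here's a small sketch of what the simd module already gives you out of the box, i.e. the operations I had to hand-roll above for MyFloat4; the variable names and values below are made up just for illustration:)
import simd
let v = float4(1, 2, 3, 4)
let m = float4x4(rows: [float4(-1, 3, -3, 1),
                        float4(2, -5, 4, -1),
                        float4(-1, 0, 1, 0),
                        float4(0, 2, 0, 0)])
let scaled = 0.5 * v     // scalar-vector product
let hadamard = v * v     // elementwise (Hadamard) product
let d = dot(v, v)        // dot product
let transformed = v * m  // (row) vector-matrix product
print(scaled, hadamard, d, transformed)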
(We seem to be running on very similar hardware btw, judging from the timings. : ) )