vAdd.c
/*
    File: vAdd.c

    Abstract:
        This module implements the vAdd routine shown in the session. It
        illustrates some techniques for using SIMD technology.

    Disclaimer: IMPORTANT: This Apple software is supplied to you by
    Apple Inc. ("Apple") in consideration of your agreement to the
    following terms, and your use, installation, modification or
    redistribution of this Apple software constitutes acceptance of these
    terms. If you do not agree with these terms, please do not use,
    install, modify or redistribute this Apple software.

    In consideration of your agreement to abide by the following terms, and
    subject to these terms, Apple grants you a personal, non-exclusive
    license, under Apple's copyrights in this original Apple software (the
    "Apple Software"), to use, reproduce, modify and redistribute the Apple
    Software, with or without modifications, in source and/or binary forms;
    provided that if you redistribute the Apple Software in its entirety and
    without modifications, you must retain this notice and the following
    text and disclaimers in all such redistributions of the Apple Software.
    Neither the name, trademarks, service marks or logos of Apple Inc.
    may be used to endorse or promote products derived from the Apple
    Software without specific prior written permission from Apple. Except
    as expressly stated in this notice, no other rights or licenses, express
    or implied, are granted by Apple herein, including but not limited to
    any patent rights that may be infringed by your derivative works or by
    other works in which the Apple Software may be incorporated.

    The Apple Software is provided by Apple on an "AS IS" basis. APPLE
    MAKES NO WARRANTIES, EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION
    THE IMPLIED WARRANTIES OF NON-INFRINGEMENT, MERCHANTABILITY AND FITNESS
    FOR A PARTICULAR PURPOSE, REGARDING THE APPLE SOFTWARE OR ITS USE AND
    OPERATION ALONE OR IN COMBINATION WITH YOUR PRODUCTS.

    IN NO EVENT SHALL APPLE BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL
    OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
    SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
    INTERRUPTION) ARISING IN ANY WAY OUT OF THE USE, REPRODUCTION,
    MODIFICATION AND/OR DISTRIBUTION OF THE APPLE SOFTWARE, HOWEVER CAUSED
    AND WHETHER UNDER THEORY OF CONTRACT, TORT (INCLUDING NEGLIGENCE),
    STRICT LIABILITY OR OTHERWISE, EVEN IF APPLE HAS BEEN ADVISED OF THE
    POSSIBILITY OF SUCH DAMAGE.

    Copyright (C) 2008 Apple Inc. All Rights Reserved.
*/
#if defined __i386__ || defined __x86_64__
    #include <emmintrin.h>
#endif

#include "vAdd.h"
#include "vector.h"

#define ElementsPerVector   FloatsPerVector

// Swap two things of the same type.
#define Swap(a, b)  { __typeof__(a) t = (a); (a) = (b); (b) = (t); }
/*  Calculate the encoding for a shuffle that selects elements e0, e1, e2, and
    e3 in that order for ascending addresses.
*/
#define Shuffle4(e0, e1, e2, e3) \
    ((e0) << 0 | (e1) << 2 | (e2) << 4 | (e3) << 6)

/*  Calculate the encoding for a shuffle that selects elements e0 and e1 in
    that order for ascending addresses.
*/
#define Shuffle2(e0, e1)    ((e0) << 0 | (e1) << 1)
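/*  Illustrative worked example (an addition to the original listing):
    Shuffle4(1, 2, 3, 0) evaluates to binary 00 11 10 01, or 0x39; used with
    pshufd below, it rotates a four-float vector by one element toward lower
    addresses, {a, b, c, d} -> {b, c, d, a}.  Shuffle2(1, 0) evaluates to
    binary 01, or 0x1; used with shufpd below, it takes the second eight bytes
    of the first operand followed by the first eight bytes of the second.
*/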
/*  We need to move parts of vector registers around, and the movss instruction
    is a handy way to do some of the manipulations we need.  <xmmintrin.h>
    defines an intrinsic named _mm_move_ss for the movss instruction.

    The movss instruction is also available as a GCC extension:

        #define movss(v0, v1)   __builtin_ia32_movss((v0), (v1))
*/
#define movss(v0, v1)   _mm_move_ss((v0), (v1))
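/*  Illustrative example (an addition to the original listing):  movss(v0, v1)
    returns v0 with its element 0 replaced by element 0 of v1, so with
    v0 = {a, b, c, d} and v1 = {e, f, g, h} the result is {e, b, c, d}.
*/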
/*  We need to move parts of vector registers around, and the shufpd
    instruction is a handy way to do some of the manipulations we need.
    <emmintrin.h> defines an intrinsic named _mm_shuffle_pd for the shufpd
    instruction.  However, its return type is __m128d (a vector containing two
    doubles), so we must cast it to the vector type we are using.  Since we are
    only moving data around and not interpreting the bits with arithmetic, this
    is okay.

    The selector passed to shufpd is a two-bit field.  The first bit selects
    which eight bytes in the first parameter are taken (0 for the first eight
    bytes, 1 for the second eight bytes).  The second bit selects which eight
    bytes in the second parameter are taken.

    The shufpd instruction is also available as a GCC extension.  To use the
    GCC extension, the first two parameters must be cast to the correct type:

        #define shufpd(v0, v1, selector) ((vFloat) \
            __builtin_ia32_shufpd((__m128d) (v0), (__m128d) (v1), (selector)))
*/
#define shufpd(v0, v1, selector) \
    ((vFloat) _mm_shuffle_pd((v0), (v1), (selector)))
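/*  Illustrative example (an addition to the original listing):  With
    v0 = {b0, b1, b2, b3} and v1 = {b4, b5, b6, b7},
    shufpd(v0, v1, Shuffle2(1, 0)) yields {b2, b3, b4, b5} -- the second eight
    bytes of v0 followed by the first eight bytes of v1, which is exactly the
    realignment vAddB08 needs when B is 8 modulo 16.
*/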
/*  We need to move parts of vector registers around, and the pshufd
    instruction is a handy way to do some of the manipulations we need.
    <emmintrin.h> defines an intrinsic named _mm_shuffle_epi32 for the pshufd
    instruction.  However, its return type is __m128i (a vector containing
    integers), so we must cast it to the vector type we are using.  Since we
    are only moving data around and not interpreting the bits with arithmetic,
    this is okay.

    The selector passed to pshufd is an eight-bit field.  Each two-bit pair
    selects four bytes from the first parameter (0 for the first four bytes,
    1 for the second four bytes, 2 for the third four bytes, and 3 for the
    last four bytes).

    The pshufd instruction is also available as a GCC extension.  To use the
    GCC extension, the first parameter must be cast to the correct type:

        #define pshufd(v0, selector) ((vFloat) \
            __builtin_ia32_pshufd((__m128i) (v0), (selector)))
*/
#define pshufd(v0, selector)    ((vFloat) _mm_shuffle_epi32((v0), (selector)))
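/*  Illustrative example (an addition to the original listing):  With
    v = {b0, b1, b2, b3}, pshufd(v, Shuffle4(1, 2, 3, 0)) yields
    {b1, b2, b3, b0}, a rotation by one element toward lower addresses.
    vAddB04 combines this with movss:  replacing element 0 of an aligned block
    with element 0 of the following block and then rotating produces the four
    consecutive floats that start four bytes into the first block.
*/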
/*  vAddB00 implements vAdd given that each of A, B, and C is 0 modulo 16 and N
    is a multiple of ElementsPerVector.

    This routine uses GCC extensions to C that are not architecture-specific,
    so it is a good introduction to some simple vector/SIMD processing.
*/
static inline void vAddB00(const float *A, const float *B, float *C, long N)
{
    /*  Convert scalar pointers to vector pointers and scalar length
        (number of elements) to vector length (number of vector blocks).
    */
    const vFloat *vA = (const vFloat *) A;
    const vFloat *vB = (const vFloat *) B;
          vFloat *vC = (      vFloat *) C;
    long vN = N / ElementsPerVector;

    /*  This main loop processes four vector blocks in each iteration, so we
        execute it as long as there are at least four more blocks to do.
    */
    while (4 <= vN)
    {
        vN -= 4;

        vC[0] = vA[0] + vB[0];
        vC[1] = vA[1] + vB[1];
        vC[2] = vA[2] + vB[2];
        vC[3] = vA[3] + vB[3];

        vA += 4;
        vB += 4;
        vC += 4;
    }

    /*  After the main loop, there could be zero, one, two, or three more
        vector blocks to process.  We handle those here with code that is
        essentially a subset of the main loop.
    */
    if (2 <= vN)
    {
        vN -= 2;

        vC[0] = vA[0] + vB[0];
        vC[1] = vA[1] + vB[1];

        vA += 2;
        vB += 2;
        vC += 2;
    }

    if (1 <= vN)
        vC[0] = vA[0] + vB[0];

    /*  Note:  Ideally, we would use simple code such as:

            for (long i = 0; i < vN; ++i)
                vC[i] = vA[i] + vB[i];

        That code works and is functionally identical to the above code in
        terms of what it computes.  Unfortunately, GCC does not generate the
        best assembly code.  For example, it uses separate registers for i and
        vN even though we can get the work done with just one.

        Discussing what GCC does and how to work with it is beyond the scope of
        this primer.  It does require experimentation and studying the assembly
        language GCC generates.  Also, work-arounds are subject to breaking
        when new versions of GCC are used or even when simple code changes are
        made.  These are part of the reasons I prefer to use assembly language
        for core pieces of high-performance work.
    */
}
/*  vAddB04 implements vAdd given that each of A and C is 0 modulo 16, B is 4
    modulo 16, and N is a multiple of ElementsPerVector.

    This routine uses SIMD constructions that are specific to the Intel
    architectures.  (That is, the IA-32 and EM64T architectures, called "i386"
    and "x86_64" by various Apple tools.)
*/
static inline void vAddB04(const float *A, const float *B, float *C, long N)
{
#if defined __i386__ || defined __x86_64__  // Architecture.

    if (N <= 0)
        return;

    /*  Convert scalar pointers to vector pointers and scalar length
        (number of elements) to vector length (number of vector blocks).

        vB is set to point to the start of the aligned vector block into
        which B points.
    */
    const vFloat *vA = (const vFloat *) A;
    const vFloat *vB = (const vFloat *) (B-1);
          vFloat *vC = (      vFloat *) C;
    long vN = N / ElementsPerVector;

    /*  Since B is unaligned with respect to vector blocks, we cannot load
        data directly from it using vector instructions.  Instead, we will
        construct a pipeline or buffer of data coming from B.  Aligned data
        will be loaded into the pipeline using vector instructions.  Once
        the data is in registers, there are vector instructions we can use
        to shift it to be aligned.
    */
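    /*  Sketch of one realignment step (added for illustration; names match
        the code below).  With B 4 modulo 16, vB[0] holds the float that
        precedes B[0] in memory followed by B[0..2], and vB[1] holds B[3..6]:

            pipe0 = { x,    B[0], B[1], B[2] }      (x precedes B[0])
            pipe1 = { B[3], B[4], B[5], B[6] }

            movss(pipe0, pipe1)               -> { B[3], B[0], B[1], B[2] }
            pshufd(..., Shuffle4(1, 2, 3, 0)) -> { B[0], B[1], B[2], B[3] }

        which is the vector block of B that is added to vA[0].
    */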
    // Prime the pipeline.
    vFloat pipe0 = vB[0];

    /*  This main loop processes four vector blocks in each iteration, so
        we execute it as long as there are at least four more blocks to do.
    */
    while (4 <= vN)
    {
        vN -= 4;

        vFloat pipe1;

        pipe1 = vB[1];
        pipe0 = movss(pipe0, pipe1);
        vFloat vb0 = pshufd(pipe0, Shuffle4(1, 2, 3, 0));

        pipe0 = vB[2];
        pipe1 = movss(pipe1, pipe0);
        vFloat vb1 = pshufd(pipe1, Shuffle4(1, 2, 3, 0));

        pipe1 = vB[3];
        pipe0 = movss(pipe0, pipe1);
        vFloat vb2 = pshufd(pipe0, Shuffle4(1, 2, 3, 0));

        pipe0 = vB[4];
        pipe1 = movss(pipe1, pipe0);
        vFloat vb3 = pshufd(pipe1, Shuffle4(1, 2, 3, 0));

        vC[0] = vA[0] + vb0;
        vC[1] = vA[1] + vb1;
        vC[2] = vA[2] + vb2;
        vC[3] = vA[3] + vb3;

        vA += 4;
        vB += 4;
        vC += 4;
    }

    /*  After the main loop, there could be zero, one, two, or three more
        vector blocks to process.  We handle those here with code that is
        essentially a subset of the main loop.
    */
    if (2 <= vN)
    {
        vN -= 2;

        vFloat pipe1;

        pipe1 = vB[1];
        pipe0 = movss(pipe0, pipe1);
        vFloat vb0 = pshufd(pipe0, Shuffle4(1, 2, 3, 0));

        pipe0 = vB[2];
        pipe1 = movss(pipe1, pipe0);
        vFloat vb1 = pshufd(pipe1, Shuffle4(1, 2, 3, 0));

        vC[0] = vA[0] + vb0;
        vC[1] = vA[1] + vb1;

        vA += 2;
        vB += 2;
        vC += 2;
    }

    if (1 <= vN)
    {
        vFloat pipe1;

        pipe1 = vB[1];
        pipe0 = movss(pipe0, pipe1);
        vFloat vb0 = pshufd(pipe0, Shuffle4(1, 2, 3, 0));

        vC[0] = vA[0] + vb0;
    }

#else   // Architecture.

    /*  For architectures for which we do not have SIMD code, use normal
        scalar code.
    */
    N = -N;
    A -= N;
    B -= N;
    C -= N;
    for (; N < 0; ++N)
        C[N] = A[N] + B[N];

#endif  // Architecture.
}
/*  vAddB08 implements vAdd given that each of A and C is 0 modulo 16, B is 8
    modulo 16 and N is a multiple of ElementsPerVector.

    This routine uses SIMD constructions that are specific to the Intel
    architectures.  (That is, the IA-32 and EM64T architectures, called "i386"
    and "x86_64" by various Apple tools.)
*/
static inline void vAddB08(const float *A, const float *B, float *C, long N)
{
#if defined __i386__ || defined __x86_64__  // Architecture.

    if (N <= 0)
        return;

    /*  Convert scalar pointers to vector pointers and scalar length
        (number of elements) to vector length (number of vector blocks).

        vB is set to point to the start of the aligned vector block into
        which B points.
    */
    const vFloat *vA = (const vFloat *) A;
    const vFloat *vB = (const vFloat *) (B-2);
          vFloat *vC = (      vFloat *) C;
    long vN = N / ElementsPerVector;

    /*  Since B is unaligned with respect to vector blocks, we cannot load
        data directly from it using vector instructions.  Instead, we will
        construct a pipeline or buffer of data coming from B.  Aligned data
        will be loaded into the pipeline using vector instructions.  Once
        the data is in registers, there are vector instructions we can use
        to shift it to be aligned.
    */
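    /*  Sketch of one realignment step (added for illustration; names match
        the code below).  With B 8 modulo 16, vB[0] holds the two floats that
        precede B[0] in memory followed by B[0..1], and vB[1] holds B[2..5]:

            pipe0 = { x,    y,    B[0], B[1] }      (x and y precede B[0])
            pipe1 = { B[2], B[3], B[4], B[5] }

            shufpd(pipe0, pipe1, Shuffle2(1, 0)) -> { B[0], B[1], B[2], B[3] }

        Only one data-movement instruction is needed per block, compared with
        two (movss and pshufd) in the 4 and 12 modulo 16 cases.
    */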
    // Prime the pipeline.
    vFloat pipe0 = vB[0];

    /*  This main loop processes four vector blocks in each iteration, so we
        execute it as long as there are at least four more blocks to do.
    */
    while (4 <= vN)
    {
        vN -= 4;

        vFloat pipe1, pipe2, pipe3;

        // Load more data into the pipeline.
        pipe1 = vB[1];

        /*  Take the second eight bytes (index 1) from pipe0 and the
            first eight bytes (index 0) from pipe1 and put them together
            in vb0.
        */
        vFloat vb0 = shufpd(pipe0, pipe1, Shuffle2(1, 0));

        // Add the shifted data from vB to vA and store in vC.
        vC[0] = vA[0] + vb0;

        /*  Repeat the above three times, but note that we reuse pipe0 here
            as we load from vB[4].  The first eight bytes are used in the
            addition below.  The second eight bytes remain in pipe0 for the
            next iteration of this loop.
        */
        pipe2 = vB[2];
        vFloat vb1 = shufpd(pipe1, pipe2, Shuffle2(1, 0));
        vC[1] = vA[1] + vb1;

        pipe3 = vB[3];
        vFloat vb2 = shufpd(pipe2, pipe3, Shuffle2(1, 0));
        vC[2] = vA[2] + vb2;

        pipe0 = vB[4];
        vFloat vb3 = shufpd(pipe3, pipe0, Shuffle2(1, 0));
        vC[3] = vA[3] + vb3;

        vA += 4;
        vB += 4;
        vC += 4;
    }
    /*  After the main loop, there could be zero, one, two, or three more
        vector blocks to process.  We handle those here with code that is
        essentially a subset of the main loop.
    */
    if (2 <= vN)
    {
        vN -= 2;

        vFloat pipe1;

        pipe1 = vB[1];
        vFloat vb0 = shufpd(pipe0, pipe1, Shuffle2(1, 0));

        pipe0 = vB[2];
        vFloat vb1 = shufpd(pipe1, pipe0, Shuffle2(1, 0));

        vC[0] = vA[0] + vb0;
        vC[1] = vA[1] + vb1;

        vA += 2;
        vB += 2;
        vC += 2;
    }

    if (1 <= vN)
    {
        vFloat pipe1;

        pipe1 = vB[1];
        vFloat vb0 = shufpd(pipe0, pipe1, Shuffle2(1, 0));

        vC[0] = vA[0] + vb0;
    }

#else   // Architecture.

    /*  For architectures for which we do not have SIMD code, use normal
        scalar code.
    */
    N = -N;
    A -= N;
    B -= N;
    C -= N;
    for (; N < 0; ++N)
        C[N] = A[N] + B[N];

#endif  // Architecture.
}
/*  vAddB12 implements vAdd given that each of A and C is 0 modulo 16, B is 12
    modulo 16 and N is a multiple of ElementsPerVector.

    This routine uses SIMD constructions that are specific to the Intel
    architectures.  (That is, the IA-32 and EM64T architectures, called "i386"
    and "x86_64" by various Apple tools.)
*/
static inline void vAddB12(const float *A, const float *B, float *C, long N)
{
#if defined __i386__ || defined __x86_64__  // Architecture.

    if (N <= 0)
        return;

    /*  Convert scalar pointers to vector pointers and scalar length
        (number of elements) to vector length (number of vector blocks).

        vB is set to point to the start of the aligned vector block into
        which B points.
    */
    const vFloat *vA = (const vFloat *) A;
    const vFloat *vB = (const vFloat *) (B-3);
          vFloat *vC = (      vFloat *) C;
    long vN = N / ElementsPerVector;

    /*  This code works through the arrays backward, from high addresses to
        low addresses.  This is because the Intel instruction set does not
        provide a good way to extract the four bytes we want from one
        register and the twelve we want from another without overwriting
        other bytes that we want to preserve.  So we would have to copy the
        bytes to preserve them.  Working backward allows us to use a
        different extraction pattern for which there are good instructions.
    */
    /*  Since B is unaligned with respect to vector blocks, we cannot load
        data directly from it using vector instructions.  Instead, we will
        construct a pipeline or buffer of data coming from B.  Aligned data
        will be loaded into the pipeline using vector instructions.  Once
        the data is in registers, there are vector instructions we can use
        to shift it to be aligned.
    */
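    /*  Sketch of one realignment step (added for illustration; indices are
        relative to the original, unadvanced vB).  The block we want,
        B[4k..4k+3], is split as one float at the top of aligned block vB[k]
        and three floats at the bottom of vB[k+1].  Rotating each aligned
        block one element toward higher addresses with
        pshufd(..., Shuffle4(3, 0, 1, 2)) gives

            rot(vB[k])   = { B[4k],   B[4k-3], B[4k-2], B[4k-1] }
            rot(vB[k+1]) = { B[4k+4], B[4k+1], B[4k+2], B[4k+3] }

        and movss(rot(vB[k+1]), rot(vB[k])) then yields

            { B[4k], B[4k+1], B[4k+2], B[4k+3] }.

        Working backward lets each rotated block be used twice:  first for the
        lowest element of one result, then for the upper three elements of the
        result at the next lower address.
    */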
    vA += vN;
    vB += vN;
    vC += vN;

    // Prime the pipeline.
    vFloat pipe3 = pshufd(vB[0], Shuffle4(3, 0, 1, 2));

    /*  This main loop processes four vector blocks in each iteration, so we
        execute it as long as there are at least four more blocks to do.
    */
    while (4 <= vN)
    {
        vB -= 4;
        vA -= 4;
        vC -= 4;
        vN -= 4;

        vFloat pipe0, pipe1, pipe2;

        pipe2 = pshufd(vB[3], Shuffle4(3, 0, 1, 2));
        pipe3 = movss(pipe3, pipe2);
        vC[3] = vA[3] + pipe3;

        pipe1 = pshufd(vB[2], Shuffle4(3, 0, 1, 2));
        pipe2 = movss(pipe2, pipe1);
        vC[2] = vA[2] + pipe2;

        pipe0 = pshufd(vB[1], Shuffle4(3, 0, 1, 2));
        pipe1 = movss(pipe1, pipe0);
        vC[1] = vA[1] + pipe1;

        pipe3 = pshufd(vB[0], Shuffle4(3, 0, 1, 2));
        pipe0 = movss(pipe0, pipe3);
        vC[0] = vA[0] + pipe0;
    }

    /*  After the main loop, there could be zero, one, two, or three more
        vector blocks to process.  We handle those here with code that is
        essentially a subset of the main loop.
    */
    if (2 <= vN)
    {
        vN -= 2;
        vA -= 2;
        vB -= 2;
        vC -= 2;

        vFloat pipe2;

        pipe2 = pshufd(vB[1], Shuffle4(3, 0, 1, 2));
        pipe3 = movss(pipe3, pipe2);
        vC[1] = vA[1] + pipe3;

        pipe3 = pshufd(vB[0], Shuffle4(3, 0, 1, 2));
        pipe2 = movss(pipe2, pipe3);
        vC[0] = vA[0] + pipe2;
    }

    if (1 <= vN)
    {
        vA -= 1;
        vB -= 1;
        vC -= 1;

        vFloat pipe2;

        pipe2 = pshufd(vB[0], Shuffle4(3, 0, 1, 2));
        pipe3 = movss(pipe3, pipe2);
        vC[0] = vA[0] + pipe3;
    }

#else   // Architecture.

    /*  For architectures for which we do not have SIMD code, use normal
        scalar code.
    */
    N = -N;
    A -= N;
    B -= N;
    C -= N;
    for (; N < 0; ++N)
        C[N] = A[N] + B[N];

#endif  // Architecture.
}
// vAddScalar implements vAdd.
static inline void vAddScalar(
    const float *A, const float *B, float *C, long N)
{
#if 0
    for (long i = 0; i < N; ++i)
        C[i] = A[i] + B[i];
#else
    /*  Ideally, we would use the simple loop above, and the compiler
        would generate good assembly code for it.  Unfortunately, GCC does
        not generate the best code, and sometimes we need to help it along.

        The code below eliminates a variable (it only needs N instead of
        both i and N), and this is enough to improve the execution time on
        a Xeon (with a particular version of GCC, et cetera).
    */
    N = -N;
    A -= N;
    B -= N;
    C -= N;
    for (; N < 0; ++N)
        C[N] = A[N] + B[N];
#endif
}
void vAdd(const float *A, const float *B, float *C, long N)
{
    /*  Trim the front:  Process individual elements at the start until the
        output is aligned.
    */
    for (; !IsAligned(C) && 0 < N; --N)
        *C++ = *A++ + *B++;

    /*  Trim the back:  Process individual elements at the end until the number
        of elements left is a multiple of the number in a vector block.  This
        allows us to execute the main loop later without worrying about
        residue.
    */
    while (N % ElementsPerVector)
    {
        --N;
        C[N] = A[N] + B[N];
    }

    /*  Sort A and B according to their residues, to reduce the number of cases
        we have to test.  (Obviously, this works only with commutative
        operations.)
    */
    if (Residue(B) < Residue(A))
        Swap(A, B);

    // If A is vector-block aligned, dispatch based on residue of B.
    if (IsAligned(A))
        switch (Residue(B))
        {
            case  0:    vAddB00(A, B, C, N);    return;
            case  4:    vAddB04(A, B, C, N);    return;
            case  8:    vAddB08(A, B, C, N);    return;
            case 12:    vAddB12(A, B, C, N);    return;
        }

    // Neither A nor B is aligned.
    vAddScalar(A, B, C, N);
}
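/*  Example use (an illustrative sketch added to the original listing; it
    assumes vAdd.h declares vAdd as defined above).  Sixteen-byte alignment is
    not required by vAdd, but aligned output lets the routine reach the SIMD
    paths sooner:

        #include <stdio.h>
        #include "vAdd.h"

        int main(void)
        {
            __attribute__((aligned(16))) float a[8] = {0, 1, 2, 3, 4, 5, 6, 7};
            __attribute__((aligned(16))) float b[8] = {8, 7, 6, 5, 4, 3, 2, 1};
            __attribute__((aligned(16))) float c[8];

            vAdd(a, b, c, 8);

            for (int i = 0; i < 8; ++i)
                printf("%g\n", c[i]);   // Prints 8 eight times.
            return 0;
        }
*/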