TOMS_transpose.c

/*
 * Copyright (c) 1997-1999, 2003 Massachusetts Institute of Technology
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 *
 */
 
/*
 * TOMS Transpose.  Revised version of algorithm 380.
 * 
 * These routines do in-place transposes of arrays.
 * 
 * [ Cate, E.G. and Twigg, D.W., ACM Transactions on Mathematical Software, 
 *   vol. 3, no. 1, 104-110 (1977) ]
 * 
 * C version by Steven G. Johnson. February 1997.
 */
 
/*
 * Changes made by Advanced Computation Group July 2004
 * Copyright (c) 2004 Apple Computer, Inc
 */
 
#include <stdlib.h>
#include <string.h>
 
#include "TOMS_transpose.h"
#include "dist_fft_transpose.h"
 
static int TOMS_gcd(int a, int b);
 
/*
 * "a" is a 1D array of length ny*nx which constains the nx x ny matrix to be
 * transposed.  "a" is stored in C order (last index varies fastest).  move
 * is a 1D array of length move_size used to store information to speed up
 * the process.  The value move_size=(ny+nx)/2 is recommended.
 * 
 * The return value indicates the success or failure of the routine. Returns 0
 * if okay, -1 if ny or nx < 0, and -2 if move_size < 1. The return value
 * should never be positive, but it it is, it is set to the final position in
 * a when the search is completed but some elements have not been moved.
 * 
 * Note: move[i] will stay zero for fixed points.
 */
 
short TOMS_transpose_2d(TOMS_el_type * a,
                        int nx, int ny,
                        char *move,
                        int move_size)
{
        int             i, j, im, mn;
        TOMS_el_type    b, c, d;
        int             ncount;
        int             k;
 
        /* check arguments and initialize: */
        if (ny < 0 || nx < 0)
                return -1;
        if (ny < 2 || nx < 2)
                return 0;
        if (move_size < 1)
                return -2;
 
 
        if (ny == nx) {
 
                // if it's a multiple of 4, and vector aligned memory, use vector
                if ((ny & 0x3) == 0 &&
                    ((((long)a) & 0xF) == 0) &&
                    (sizeof(TOMS_el_type) == sizeof(float))) {
                    
                    // and we're absolutely sure we're using floats
                    float *a_float = (float *)a;
                    
                    dist_fft_transpose_square(a_float, ny, ny*sizeof(float));
                    return 0;
                }
 
                /*
                 * if matrix is square, exchange elements a(i,j) and a(j,i):
                 */
                for (i = 0; i < nx; ++i)
                        for (j = i + 1; j < nx; ++j) {
                                b = a[i + j * nx];
                                a[i + j * nx] = a[j + i * nx];
                                a[j + i * nx] = b;
                        }
                return 0;
        }
        ncount = 2;             /* always at least 2 fixed points */
        k = (mn = ny * nx) - 1;
 
        for (i = 0; i < move_size; ++i)
                move[i] = 0;
 
        if (ny >= 3 && nx >= 3)
                ncount += TOMS_gcd(ny - 1, nx - 1) - 1; /* # fixed points */
 
        i = 1;
        im = ny;
 
        while (1) {
                int             i1, i2, i1c, i2c;
                int             kmi;
 
                /** Rearrange the elements of a loop
                      and its companion loop: **/
 
                i1 = i;
                kmi = k - i;
                b = a[i1];
                i1c = kmi;
                c = a[i1c];
 
                while (1) {
                        i2 = ny * i1 - k * (i1 / nx);
                        i2c = k - i2;
                        if (i1 < move_size)
                                move[i1] = 1;
                        if (i1c < move_size)
                                move[i1c] = 1;
                        ncount += 2;
                        if (i2 == i)
                                break;
                        if (i2 == kmi) {
                                d = b;
                                b = c;
                                c = d;
                                break;
                        }
                        a[i1] = a[i2];
                        a[i1c] = a[i2c];
                        i1 = i2;
                        i1c = i2c;
                }
                a[i1] = b;
                a[i1c] = c;
 
                if (ncount >= mn)
                        break;  /* we've moved all elements */
 
                /** Search for loops to rearrange: **/
 
                while (1) {
                        int             max;
 
                        max = k - i;
                        ++i;
                        if (i > max)
                                return i;
                        im += ny;
                        if (im > k)
                                im -= k;
                        i2 = im;
                        if (i == i2)
                                continue;
                        if (i >= move_size) {
                                while (i2 > i && i2 < max) {
                                        i1 = i2;
                                        i2 = ny * i1 - k * (i1 / nx);
                                }
                                if (i2 == i)
                                        break;
                        } else if (!move[i])
                                break;
                }
        }
 
        return 0;
}
 
/*
 * "a" is a 1D array of length ny*nx which constains the nx x ny matrix to be
 * transposed.  "a" is stored in C order (last index varies fastest).  move
 * is a 1D array of length move_size used to store information to speed up
 * the process.  The value move_size=(ny+nx)/2 is recommended.
 * 
 * Here, instead of each element of "a" being a single value of type
 * TOMS_el_type, each element is el_size values of type TOMS_el_type.
 * 
 * The return value indicates the success or failure of the routine. Returns 0
 * if okay, -1 if ny or nx < 0, and -2 if move_size < 1. Also, returns -3 if
 * it ran out of memory.  The return value should never be positive, but it
 * it is, it is set to the final position in a when the search is completed
 * but some elements have not been moved.
 * 
 * Note: move[i] will stay zero for fixed points.
 */
short TOMS_transpose_2d_arbitrary(TOMS_el_type * a,
                                  int nx, int ny,
                                  int el_size,
                                  char *move,
                                  int move_size)
{
        int             i, j, im, mn;
        TOMS_el_type   *b, *c, *d;
        int             ncount;
        int             k;
 
        /* check arguments and initialize: */
        if (ny < 0 || nx < 0)
                return -1;
        if (ny < 2 || nx < 2 || el_size < 1)
                return 0;
        if (move_size < 1)
                return -2;
 
        b = (TOMS_el_type *) malloc(sizeof(TOMS_el_type) * el_size);
        if (!b)
                return -3;
 
        if (ny == nx) {
                /*
                 * if matrix is square, exchange elements a(i,j) and a(j,i):
                 */
                for (i = 0; i < nx; ++i)
                        for (j = i + 1; j < nx; ++j) {
                                memcpy(b, &a[el_size * (i + j * nx)], el_size * sizeof(TOMS_el_type));
                                memcpy(&a[el_size * (i + j * nx)], &a[el_size * (j + i * nx)], el_size * sizeof(TOMS_el_type));
                                memcpy(&a[el_size * (j + i * nx)], b, el_size * sizeof(TOMS_el_type));
                        }
                free(b);
                return 0;
        }
        c = (TOMS_el_type *) malloc(sizeof(TOMS_el_type) * el_size);
        if (!c) {
                free(b);
                return -3;
        }
        ncount = 2;             /* always at least 2 fixed points */
        k = (mn = ny * nx) - 1;
 
        for (i = 0; i < move_size; ++i)
                move[i] = 0;
 
        if (ny >= 3 && nx >= 3)
                ncount += TOMS_gcd(ny - 1, nx - 1) - 1; /* # fixed points */
 
        i = 1;
        im = ny;
 
        while (1) {
                int             i1, i2, i1c, i2c;
                int             kmi;
 
                /** Rearrange the elements of a loop
                      and its companion loop: **/
 
                i1 = i;
                kmi = k - i;
                memcpy(b, &a[el_size * i1], el_size * sizeof(TOMS_el_type));
                i1c = kmi;
                memcpy(c, &a[el_size * i1c], el_size * sizeof(TOMS_el_type));
 
                while (1) {
                        i2 = ny * i1 - k * (i1 / nx);
                        i2c = k - i2;
                        if (i1 < move_size)
                                move[i1] = 1;
                        if (i1c < move_size)
                                move[i1c] = 1;
                        ncount += 2;
                        if (i2 == i)
                                break;
                        if (i2 == kmi) {
                                d = b;
                                b = c;
                                c = d;
                                break;
                        }
                        memcpy(&a[el_size * i1], &a[el_size * i2], 
                               el_size * sizeof(TOMS_el_type));
                        memcpy(&a[el_size * i1c], &a[el_size * i2c], 
                               el_size * sizeof(TOMS_el_type));
                        i1 = i2;
                        i1c = i2c;
                }
                memcpy(&a[el_size * i1], b, el_size * sizeof(TOMS_el_type));
                memcpy(&a[el_size * i1c], c, el_size * sizeof(TOMS_el_type));
 
                if (ncount >= mn)
                        break;  /* we've moved all elements */
 
                /** Search for loops to rearrange: **/
 
                while (1) {
                        int             max;
 
                        max = k - i;
                        ++i;
                        if (i > max) {
                                free(b);
                                free(c);
                                return i;
                        }
                        im += ny;
                        if (im > k)
                                im -= k;
                        i2 = im;
                        if (i == i2)
                                continue;
                        if (i >= move_size) {
                                while (i2 > i && i2 < max) {
                                        i1 = i2;
                                        i2 = ny * i1 - k * (i1 / nx);
                                }
                                if (i2 == i)
                                        break;
                        } else if (!move[i])
                                break;
                }
        }
 
        free(b);
        free(c);
        return 0;
}
 
static int TOMS_gcd(int a, int b)
{
        int r;
        do {
                r = a % b;
                a = b;
                b = r;
        } while (r != 0);
 
        return a;
}