/* 
 *	Sample code to illustrate advanced code scheduling techniques
 *	by Ian Ollmann, Ph.D. 
 *
 *		Copyright © Apple Computer 2003. All rights reserved. 
 *
 *	This code illustrates different ways to schedule a complex operation
 *	(short to float conversion) and times each for performance. The results
 *	of each method are checked for correctness. 
 */

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/types.h>

// Some fast timing functions:
// InitTimer computes the tick-to-seconds factor that stopclock multiplies by,
// so it must be called once before stopclock. main treats any nonzero return
// value as a failure.
int InitTimer( void );
// Returns the current mach_absolute_time() reading.
u_int64_t startclock( void );
// Returns the number of seconds elapsed since startTime, where startTime is a
// value previously returned by startclock.
double stopclock( u_int64_t startTime );

//Our test functions
// Each one converts count u_int16_t values read from in into floats written
// to out. They differ only in how the work is scheduled.
void ConvertInt16ToFloat_Compiler( u_int16_t *in, float *out, u_int32_t count );
void ConvertInt16ToFloat_Simple( u_int16_t *in, float *out, u_int32_t count );
void ConvertInt16ToFloat_SoftwarePipelined( u_int16_t *in, float *out, u_int32_t count );
void ConvertInt16ToFloat_SoftwarePipelined_EmptyStage( u_int16_t *in, float *out, u_int32_t count );


// Here we define our pipeline stages.
//
// Each macro is one step of the u_int16_t -> float conversion. The macros
// deliberately refer to locals by fixed names, so every function that uses
// them must declare:
//
//      in, i                           input cursor and count of elements loaded
//      expI, expF                      the constant 0x4B000000 as integer bits / as a float
//      stage1_result, stage2_result    integer temporaries
//      stage3_target                   integer-typed store cursor into the output array
//      stage4_src, stage4_result       float load cursor into the output array, and its value
//      stage5_result, stage6_target    final float value and its store cursor
//
// Every stage expands to exactly one parenthesized expression, so "STAGE_n;"
// is always a single statement -- including STAGE_1, which both loads an
// element and increments i. That keeps the macros safe under an unbraced
// if/else or loop body.
#define  STAGE_1        ( stage1_result = (in++)[0], i++ )              /* lhz -- load the integer into register (and count it) */
#define  STAGE_2        ( stage2_result = stage1_result | expI )        /* or  -- or it with 0x4B000000UL */
#define  STAGE_3        ( (stage3_target++)[0] = stage2_result )        /* stw -- store it back out to the float array */
#define  STAGE_4        ( stage4_result = (stage4_src++)[0] )           /* lfs -- load the value into a FP register */
#define  STAGE_5        ( stage5_result = stage4_result - expF )        /* fsub -- subtract the exponent from it */
#define  STAGE_6        ( (stage6_target++)[0] = stage5_result )        /* stfs -- store the result back out */
#define  STAGE_COUNT    6

// Variables to control our test size: number of elements converted per run
#define ARRAY_SIZE  1024

/*
 * Benchmark driver.
 *
 * Fills an input array with rand() values, times each of the four conversion
 * routines over ARRAY_SIZE elements, then compares the three hand-scheduled
 * results element by element against the compiler-generated conversion.
 *
 * Returns the InitTimer() error code if the timer cannot be initialized,
 * 1 if any hand-scheduled result differs from the compiler version, and
 * 0 otherwise.
 */
int main( void )
{
    u_int16_t   in[ARRAY_SIZE];
    float       out1[ARRAY_SIZE], out2[ARRAY_SIZE], out3[ARRAY_SIZE], out4[ARRAY_SIZE];
    int         i;
    int         mismatches = 0;
    u_int64_t   startTime;
    double      elapsedTime;

    i = InitTimer();
    if( 0 != i )
    {
        printf( "Error initializing timer (%i)\n", i );
        return i;
    }

    //init the input array (only the low 16 bits of each rand() value are kept)
    for( i = 0; i < ARRAY_SIZE; i++ )
        in[i] = (u_int16_t) rand();

    //Time the compiler version
    startTime = startclock();
    ConvertInt16ToFloat_Compiler( in, out1, ARRAY_SIZE );
    elapsedTime = stopclock( startTime );
    printf( "Compiler Version: %g (seconds)\n", elapsedTime );

    //Time the simple version that tests our method for u_int16_t -> float conversion
    startTime = startclock();
    ConvertInt16ToFloat_Simple( in, out2, ARRAY_SIZE );
    elapsedTime = stopclock( startTime );
    printf( "Simple Version: %g (seconds)\n", elapsedTime );

    //Time the software pipelined version
    startTime = startclock();
    ConvertInt16ToFloat_SoftwarePipelined( in, out3, ARRAY_SIZE );
    elapsedTime = stopclock( startTime );
    printf( "Software Pipelined Version: %g (seconds)\n", elapsedTime );

    //Time the software pipelined version with an empty stage to cover store to load latencies
    startTime = startclock();
    ConvertInt16ToFloat_SoftwarePipelined_EmptyStage( in, out4, ARRAY_SIZE );
    elapsedTime = stopclock( startTime );
    printf( "Software Pipelined Version with extra do-nothing stage: %g (seconds)\n", elapsedTime );


    //Verify the output is the same, using the compiler version as the reference
    printf( "Testing for correctness...." );
    for( i = 0; i < ARRAY_SIZE; i++ )
    {
        if( out1[i] != out2[i] )
        {
            printf( "Simple version in error at %i: %g, %g\n", i, out1[i], out2[i] );
            mismatches++;
        }

        if( out1[i] != out3[i] )
        {
            printf( "Software pipelined version in error at %i: %g, %g\n", i, out1[i], out3[i] );
            mismatches++;
        }

        if( out1[i] != out4[i] )
        {
            printf( "Software pipelined version (with empty stage) in error at %i: %g, %g\n", i, out1[i], out4[i] );
            mismatches++;
        }
    }
    printf( "done.\n" );

    //Report a failed verification through the exit status so scripts can detect it
    return ( 0 == mismatches ) ? 0 : 1;
}


/*
 * Reference conversion: relies on the compiler's own integer-to-float
 * conversion for each element.
 *
 *  in      source array of count u_int16_t values
 *  out     destination array receiving count floats
 *  count   number of elements to convert; nothing is written when it is 0
 */
void ConvertInt16ToFloat_Compiler( u_int16_t *in, float *out, u_int32_t count )
{
    const u_int16_t         *src = in;
    const u_int16_t *const  srcEnd = in + count;
    float                   *dst = out;

    while( src != srcEnd )
        *dst++ = (float) *src++;
}

/*
 * Converts count u_int16_t values to float by running the six pipeline
 * stages strictly in order for one element at a time (no overlap between
 * elements). This is the unscheduled baseline for the pipelined versions.
 *
 * Method: the integer is OR'ed into the low bits of 0x4B000000 and stored
 * into the output array as an integer; the same location is then re-read as
 * a float, the float whose bit pattern is 0x4B000000 is subtracted, and the
 * result is stored back. (With IEEE-754 single precision 0x4B000000 is 2^23,
 * so the subtraction leaves exactly the integer value.)
 *
 *  in      source array of count u_int16_t values
 *  out     destination array receiving count floats; also used as scratch
 *  count   number of elements to convert
 *
 * The local names below are fixed by the STAGE_n macros.
 */
void ConvertInt16ToFloat_Simple( u_int16_t *in, float *out, u_int32_t count )
{
    // STAGE_3 writes integer bits into the float array and STAGE_4 reads the
    // same location back as a float. Through a plain u_int32_t* that store
    // would violate the strict-aliasing rules, and an optimizer would be free
    // to move the float load ahead of it. A may_alias type (GCC/Clang) makes
    // the store alias everything, as a char access would.
    typedef u_int32_t __attribute__((__may_alias__)) aliased_u32;

    union
    {
            u_int32_t   u;
            float       f;
    }buffer;
    register float          expF;
    register u_int32_t      expI;
    register u_int32_t      stage1_result, stage2_result;
    register float          stage4_result, stage5_result;
    register aliased_u32    *stage3_target = (aliased_u32*) out;
    register float          *stage4_src = out;
    register float          *stage6_target = out;
    register u_int32_t      i;


    //Set up some constants we will need: the same 32 bits viewed as a float and as an integer
    buffer.u = 0x4B000000UL;
    expF = buffer.f;
    expI = buffer.u;

    //STAGE_1 increments i, so the loop needs no separate increment
    i = 0;
    while( i < count )
    {
        STAGE_1;
        STAGE_2;
        STAGE_3;
        STAGE_4;
        STAGE_5;
        STAGE_6;
    }
}


/*
 * Software-pipelined u_int16_t -> float conversion.
 *
 * Same six stages as ConvertInt16ToFloat_Simple, but each row of the schedule
 * applies every stage to a different element: the element loaded in one row
 * is OR'ed in the next, stored in the one after that, and so on. Within a row
 * the stages run latest-first, so each stage consumes the previous row's
 * temporary before the earlier stage overwrites it.
 *
 *  - prologue: fills the pipeline; it executes STAGE_1 STAGE_COUNT - 1 times,
 *    which is why it is only entered when count >= STAGE_COUNT - 1
 *  - steady-state loop: one element loaded and one element finished per pass
 *  - epilogue: drains the elements still in flight, loading nothing new
 *
 *  in      source array of count u_int16_t values
 *  out     destination array receiving count floats; also used as scratch
 *  count   number of elements to convert
 *
 * The local names below are fixed by the STAGE_n macros.
 */
void ConvertInt16ToFloat_SoftwarePipelined( u_int16_t *in, float *out, u_int32_t count )
{
    // STAGE_3 writes integer bits into the float array and STAGE_4 reads the
    // same location back as a float. Through a plain u_int32_t* that store
    // would violate the strict-aliasing rules, and an optimizer would be free
    // to move the float load ahead of it. A may_alias type (GCC/Clang) makes
    // the store alias everything, as a char access would.
    typedef u_int32_t __attribute__((__may_alias__)) aliased_u32;

    union
    {
            u_int32_t   u;
            float       f;
    }buffer;
    register float          expF;
    register u_int32_t      expI;
    register u_int32_t      stage1_result, stage2_result;
    register float          stage4_result, stage5_result;
    register aliased_u32    *stage3_target = (aliased_u32*) out;
    register float          *stage4_src = out;
    register float          *stage6_target = out;
    register u_int32_t      i;


    //Set up some constants we will need: the same 32 bits viewed as a float and as an integer
    buffer.u = 0x4B000000UL;
    expF = buffer.f;
    expI = buffer.u;

    i = 0;

    if( count >= STAGE_COUNT - 1 )
    {
        //Some of the stages advance pointers in addition to their stated operation.
        //STAGE_1 increments i in addition to loading in the u_int16_t

        //Prologue: fill the pipeline
        STAGE_1;
        STAGE_2;    STAGE_1;
        STAGE_3;    STAGE_2;    STAGE_1;
        STAGE_4;    STAGE_3;    STAGE_2;    STAGE_1;
        STAGE_5;    STAGE_4;    STAGE_3;    STAGE_2;    STAGE_1;

        //Steady state: every stage is busy, each on a different element
        while( i < count )
        {
        STAGE_6;    STAGE_5;    STAGE_4;    STAGE_3;    STAGE_2;    STAGE_1;
        }

        //Epilogue: drain the pipeline
                    STAGE_6;    STAGE_5;    STAGE_4;    STAGE_3;    STAGE_2;
                                STAGE_6;    STAGE_5;    STAGE_4;    STAGE_3;
                                            STAGE_6;    STAGE_5;    STAGE_4;
                                                        STAGE_6;    STAGE_5;
                                                                    STAGE_6;
    }

    //Cleanup code for small arrays when count < STAGE_COUNT - 1
    //(after the pipelined path i == count, so this loop does not execute)
    while( i < count )
    {
            STAGE_1;
            STAGE_2;
            STAGE_3;
            STAGE_4;
            STAGE_5;
            STAGE_6;
    }
}


//For operations that have a very long latency, it is sometimes helpful to add an extra do-nothing
//pipeline stage. In this case we added one between stages 3 and 4. For this particular function it was not helpful. 
/*
 * Software-pipelined u_int16_t -> float conversion with one extra, empty
 * pipeline stage inserted between STAGE_3 (integer store) and STAGE_4 (float
 * load of the same location), giving the store one more row to complete
 * before it is read back.
 *
 * The schedule has NEW_STAGE_COUNT = STAGE_COUNT + 1 stages. The prologue
 * executes STAGE_1 NEW_STAGE_COUNT - 1 times, which is why the pipelined path
 * is only entered when count >= NEW_STAGE_COUNT - 1. Within a row the stages
 * run latest-first, so each stage consumes the previous row's temporary
 * before the earlier stage overwrites it.
 *
 *  in      source array of count u_int16_t values
 *  out     destination array receiving count floats; also used as scratch
 *  count   number of elements to convert
 *
 * The local names below are fixed by the STAGE_n macros.
 */
void ConvertInt16ToFloat_SoftwarePipelined_EmptyStage( u_int16_t *in, float *out, u_int32_t count )
{
    //Helper macros private to this function; they are #undef'd at the end of the body.
    //EMPTY_STAGE expands to nothing, so "EMPTY_STAGE;" is an empty statement.
    #define EMPTY_STAGE
    #define NEW_STAGE_COUNT (STAGE_COUNT + 1)

    // STAGE_3 writes integer bits into the float array and STAGE_4 reads the
    // same location back as a float. Through a plain u_int32_t* that store
    // would violate the strict-aliasing rules, and an optimizer would be free
    // to move the float load ahead of it. A may_alias type (GCC/Clang) makes
    // the store alias everything, as a char access would.
    typedef u_int32_t __attribute__((__may_alias__)) aliased_u32;

    union
    {
            u_int32_t   u;
            float       f;
    }buffer;
    register float          expF;
    register u_int32_t      expI;
    register u_int32_t      stage1_result, stage2_result;
    register float          stage4_result, stage5_result;
    register aliased_u32    *stage3_target = (aliased_u32*) out;
    register float          *stage4_src = out;
    register float          *stage6_target = out;
    register u_int32_t      i;


    //Set up some constants we will need: the same 32 bits viewed as a float and as an integer
    buffer.u = 0x4B000000UL;
    expF = buffer.f;
    expI = buffer.u;

    i = 0;

    if( count >= NEW_STAGE_COUNT - 1 )
    {
        //Some of the stages advance pointers in addition to their stated operation.
        //STAGE_1 increments i in addition to loading in the u_int16_t

        //Prologue: fill the pipeline
        STAGE_1;
        STAGE_2;        STAGE_1;
        STAGE_3;        STAGE_2;        STAGE_1;
        EMPTY_STAGE;    STAGE_3;        STAGE_2;        STAGE_1;
        STAGE_4;        EMPTY_STAGE;    STAGE_3;        STAGE_2;        STAGE_1;
        STAGE_5;        STAGE_4;        EMPTY_STAGE;    STAGE_3;        STAGE_2;        STAGE_1;

        //Steady state: every stage is busy, each on a different element
        while( i < count )
        {
        STAGE_6;        STAGE_5;        STAGE_4;        EMPTY_STAGE;    STAGE_3;        STAGE_2;        STAGE_1;
        }

        //Epilogue: drain the pipeline
                        STAGE_6;        STAGE_5;        STAGE_4;        EMPTY_STAGE;    STAGE_3;        STAGE_2;
                                        STAGE_6;        STAGE_5;        STAGE_4;        EMPTY_STAGE;    STAGE_3;
                                                        STAGE_6;        STAGE_5;        STAGE_4;        EMPTY_STAGE;
                                                                        STAGE_6;        STAGE_5;        STAGE_4;
                                                                                        STAGE_6;        STAGE_5;
                                                                                                        STAGE_6;
    }

    //Cleanup code for small arrays when count < NEW_STAGE_COUNT - 1
    //(after the pipelined path i == count, so this loop does not execute)
    while( i < count )
    {
            STAGE_1;
            STAGE_2;
            STAGE_3;
            EMPTY_STAGE;
            STAGE_4;
            STAGE_5;
            STAGE_6;
    }

    #undef NEW_STAGE_COUNT
    #undef EMPTY_STAGE
}




// A lightweight, fast, accurate timer for benchmarking that requires no special frameworks
//
// Usage:
//
//	u_int64_t	startTime;
//	double		elapsedSeconds;
//
//	startTime = startclock();
//		
//		(Insert your test code here)
//
//	elapsedSeconds = stopclock( startTime );
//
//	The elapsedSeconds will contain the number of seconds that have passed since
//	startclock was called.
//
#include <mach/mach_time.h>

// Seconds per mach_absolute_time() tick. Written by InitTimer(); stopclock()
// multiplies a tick difference by it, so InitTimer() must have run first.
double		conversion;

/*
 * Initializes the timer by computing `conversion`, the number of seconds
 * represented by one mach_absolute_time() tick
 * (1e-9 * timebase.numer / timebase.denom).
 *
 * Returns 0 on success. If mach_timebase_info() fails, its error code is
 * returned; if it reports a zero denominator, -1 is returned. In both failure
 * cases `conversion` is left untouched rather than being set from an unusable
 * timebase.
 */
int InitTimer( void )
{
    mach_timebase_info_data_t   timebase = { 0, 0 };
    kern_return_t               err;

    err = mach_timebase_info( &timebase );
    if( 0 != err )
        return err;

    //Defensive: never divide by a zero denominator
    if( 0 == timebase.denom )
        return -1;

    conversion = 1e-9 * (double) timebase.numer / (double) timebase.denom;

    return 0;
}

// Captures the starting timestamp for a measurement: the current
// mach_absolute_time() value, to be passed later to stopclock().
u_int64_t startclock( void )
{
    const u_int64_t now = mach_absolute_time();

    return now;
}

// Returns the time in seconds between startTime (a value returned by
// startclock()) and now, using the `conversion` factor set up by InitTimer().
double stopclock( u_int64_t startTime )
{
    // Read the clock before doing any arithmetic.
    const u_int64_t now = mach_absolute_time();

    return conversion * (double) ( now - startTime );
}
