ClockServices.c

/*
 
File:  ClockServices.c
 
Abstract:
    This module contains routines to measure execution time.  It is not needed
    to understand the lessons of the WWDC session.  Additional information is
    in ClockServices.h.
 
Disclaimer: IMPORTANT:  This Apple software is supplied to you by 
Apple Inc. ("Apple") in consideration of your agreement to the
following terms, and your use, installation, modification or
redistribution of this Apple software constitutes acceptance of these
terms.  If you do not agree with these terms, please do not use,
install, modify or redistribute this Apple software.
 
In consideration of your agreement to abide by the following terms, and
subject to these terms, Apple grants you a personal, non-exclusive
license, under Apple's copyrights in this original Apple software (the
"Apple Software"), to use, reproduce, modify and redistribute the Apple
Software, with or without modifications, in source and/or binary forms;
provided that if you redistribute the Apple Software in its entirety and
without modifications, you must retain this notice and the following
text and disclaimers in all such redistributions of the Apple Software. 
Neither the name, trademarks, service marks or logos of Apple Inc. 
may be used to endorse or promote products derived from the Apple
Software without specific prior written permission from Apple.  Except
as expressly stated in this notice, no other rights or licenses, express
or implied, are granted by Apple herein, including but not limited to
any patent rights that may be infringed by your derivative works or by
other works in which the Apple Software may be incorporated.
 
The Apple Software is provided by Apple on an "AS IS" basis.  APPLE
MAKES NO WARRANTIES, EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION
THE IMPLIED WARRANTIES OF NON-INFRINGEMENT, MERCHANTABILITY AND FITNESS
FOR A PARTICULAR PURPOSE, REGARDING THE APPLE SOFTWARE OR ITS USE AND
OPERATION ALONE OR IN COMBINATION WITH YOUR PRODUCTS.
 
IN NO EVENT SHALL APPLE BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL
OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) ARISING IN ANY WAY OUT OF THE USE, REPRODUCTION,
MODIFICATION AND/OR DISTRIBUTION OF THE APPLE SOFTWARE, HOWEVER CAUSED
AND WHETHER UNDER THEORY OF CONTRACT, TORT (INCLUDING NEGLIGENCE),
STRICT LIABILITY OR OTHERWISE, EVEN IF APPLE HAS BEEN ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.
 
Copyright (C) 2008 Apple Inc. All Rights Reserved.
 
*/
 
 
#include <errno.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
 
#include "ClockServices.h"
 
#define IntelProcessor  (defined __i386__ || defined __x86_64__)
 
 
// Select the timing method to be used.
 
    // Define constants to name the methods.
    #define TM_TB   0   // PowerPC's time-base registers.
    #define TM_TSC  1   // IA-32's time-stamp counter.
    #define TM_PMC  2   // Performance monitor counters.  Works only on G5.
    #define TM_UT   3   // Carbon's UpTime.
    #define TM_TOD  4   // Open Unix's gettimeofday.
    #define TM_MACH 5   // mach_absolute_time.
 
    // Set the method in TimingMethod.
    #if defined __ppc__ || defined __ppc64__
        #define TimingMethod    TM_MACH // Method to use on PowerPC.
    #elif defined __i386__
        #define TimingMethod    TM_MACH // Method to use on IA-32.
    #elif defined __x86_64__
        #define TimingMethod    TM_MACH // Method to use on 64-bit Intel.
    #else
        #define TimingMethod    TM_MACH // Method to use elsewhere.
    #endif
 
    /*  Using the PMCs on earlier processors than the G5's requires changing
        the event number in the first chudSetPMCEvent and using only one PMC to
        count CPU cycles, because there is no event for overflow into another
        PMC.
    */
 
    // Set a flag based on the timing method in use.
    #if TimingMethod == TM_PMC
        // To use the performance monitor counters, we need CHUD facilities.
        #define NeedCHUD
    #endif
 
 
#if TimingMethod == TM_UT
 
    #include <Carbon/Carbon.h>  // For Uptime.
 
#elif TimingMethod == TM_TOD
 
    #include <sys/time.h>       // For gettimeofday.
 
#elif TimingMethod == TM_MACH
 
    #include <mach/mach_time.h>     // For mach_absolute_time.
 
#endif
 
 
/*  Define ClockValue type.  Four things are defined:
 
        ClockValue is a type name.
 
        ClockMax is a value that can be used when initializing a ClockValue
        that acts as the maximum value a ClockValue can have.
 
        upper and lower are member names used to access parts of a ClockValue.
*/
#if TimingMethod == TM_UT
 
    typedef AbsoluteTime ClockValue;
    #define ClockMax    { UINT32_MAX, UINT32_MAX }
    #define upper       hi
    #define lower       lo
 
#elif TimingMethod == TM_TOD
 
    typedef struct timeval ClockValue;
    #define ClockMax    { LONG_MAX, INT_MAX };
        // (Relies on knowledge of timeval definition, unfortunately.)
    #define upper       tv_sec
    #define lower       tv_usec
 
#elif TimingMethod == TM_MACH
 
    typedef uint64_t ClockValue;
    #define ClockMax    UINT64_MAX
 
#else
 
    typedef struct { uint32_t upper, lower; } ClockValue;
    #define ClockMax    { UINT32_MAX, UINT32_MAX }
 
#endif  // TimingMethod.
 
 
// Return the current value of the clock.
static inline ClockValue ReadClock(void)
{
    #if TimingMethod == TM_UT
 
        // Use Carbon's UpTime.
        return UpTime();
 
    #elif TimingMethod == TM_TSC
 
        // Read two values simultaneously from time stamp counter.
        volatile ClockValue result;
        __asm__ volatile("rdtsc" : "=d" (result.upper), "=a" (result.lower));
        return result;
 
    #elif TimingMethod == TM_TOD
 
        // Use Open Unix's gettimeofday.
        ClockValue result;
        gettimeofday(&result, NULL);
        return result;
 
    #elif TimingMethod == TM_MACH
 
        return mach_absolute_time();
 
    #else   // TimingMethod.
 
        /*  In these cases, we must read the time as two values from some
            source, but we cannot read them simultaneously, so the low value
            might roll over while we are reading them.  We will read the upper
            value twice and choose a correct one afterward -- see below.
        */
        ClockValue result;
        uint32_t upper0, upper1;
 
        /*  Get values from the upper and lower count or time-base registers.
 
            The lower register might overflow into the upper register while we
            are doing this, so we will read the upper register twice.  We will
            assume the overflow can occur only once during these three
            instructions, so one of the two values of the upper register is
            correct.  If the lower value is small, it will not overflow for a
            while, so the upper value we read immediately after it must be
            correct.  Conversely, if the lower value is large, it must not have
            overflowed recently, so the upper value we read immediately before
            it must be correct.
        */
        #if TimingMethod == TM_PMC
 
            // Read values from performance monitor counters.
            __asm__ volatile("\
                    mfspr   %[upper0], 772  \n\
                    mfspr   %[lower] , 771  \n\
                    mfspr   %[upper1], 772  "
                :   [lower]  "=r" (result.lower),   // result.lower is output.
                    [upper0] "=r" (upper0),         // upper0 is output.
                    [upper1] "=r" (upper1)          // upper1 is output.
            );
 
        #elif TimingMethod == TM_TB
 
            // Read values from time-base registers.
            __asm__ volatile("\
                    mfspr   %[upper0], 269  \n\
                    mfspr   %[lower] , 268  \n\
                    mfspr   %[upper1], 269  "
                :   [lower]  "=r" (result.lower),   // result.lower is output.
                    [upper0] "=r" (upper0),         // upper0 is output.
                    [upper1] "=r" (upper1)          // upper1 is output.
            );
 
        #else   // TimingMethod.
 
            #error "Code is not defined for selected timing method."
 
        #endif  // TimingMethod.
 
        /*  Choose which upper value to use.  We could do this with a
            conditional expression:
 
                result.upper = result.lower < 2147483648u
                    ? upper1
                    : upper0;
 
            However, the execution time might change depending on whether the
            branch were correctly predicted or not.  Instead, we will use a
            calculation with no branches.
        */
 
        // Use a signed shift to copy lower's high bit to all 32 bits.
        uint32_t mask = (int32_t) result.lower >> 31;
 
        result.upper = upper1 ^ ((upper0 ^ upper1) & mask);
            /*  If mask is all zeroes, the above statement reduces:
 
                    result.upper = upper1 ^ ((upper0 ^ upper1) & mask);
                    result.upper = upper1 ^ ((upper0 ^ upper1) & 0);
                    result.upper = upper1 ^ (0);
                    result.upper = upper1;
 
                If mask is all ones, the above statement reduces:
 
                    result.upper = upper1 ^ ((upper0 ^ upper1) & mask);
                    result.upper = upper1 ^ ((upper0 ^ upper1) & ~0);
                    result.upper = upper1 ^ (upper0 ^ upper1);
                    result.upper = upper0;
            */
 
        return result;
 
    #endif  // TimingMethod.
}
 
 
/* Subtract two clock values, t1 and t0, and return t1-t0.
 
   Since some ClockValue implementations are unsigned, t1 should be not less
   than t0.
*/
static ClockValue SubtractClock(const ClockValue t1, const ClockValue t0)
{
#if TimingMethod == TM_MACH
 
    return t1 - t0;
 
#else // TimingMethod
 
    ClockValue result;
   
    result.upper = t1.upper - t0.upper;
    result.lower = t1.lower - t0.lower;
 
    // If necessary, "borrow" from upper word.
    if (t1.lower < t0.lower)
        --result.upper;
 
    return result;
 
#endif // TimingMethod
}
 
 
/*  Compare two clock values.  Return true iff t0 < t1.
 
    This should only be used to compare relative times, such as durations of
    code execution, which are known to be small relative to the number of bits
    in a ClockValue.  If absolute times are compared, there is no guarantee
    about the value of the upper word.  It might be very high at one time and
    overflow to a very small value at a later time.
*/
static bool ClockLessThan(const ClockValue t0, const ClockValue t1)
{
#if TimingMethod == TM_MACH
 
    return t0 < t1;
 
#else // TimingMethod
 
    if (t0.upper == t1.upper)
        return t0.lower < t1.lower;
    else
        return t0.upper < t1.upper;
 
#endif // TimingMethod
}
 
 
/*  Convert clock value to double, without changing units.  That is, both the
    input and the return value are the same number of clock ticks.
*/
static double ClockToDouble(const ClockValue t)
{
#if TimingMethod == TM_TOD
    return t.upper * 1e6 + t.lower;
#elif TimingMethod == TM_MACH
    return t;
#else
    return t.upper * 4294967296. + t.lower;
#endif
}
 
 
#include <math.h>
#include <sys/sysctl.h> // Declare things needed for sysctlbyname.
#include <mach/mach.h>  // Define constants for CPU types.
 
 
#if defined __i386__    // Conditionalize on architecture.
 
 
    // Describe the layout of the EFLAGS register.
    typedef struct
    {
        unsigned int
            CF      :  1,
                    :  1,
            PF      :  1,
                    :  1,
            AF      :  1,
                    :  1,
            ZF      :  1,
            SF      :  1,
            TF      :  1,
            IF      :  1,
            DF      :  1,
            OF      :  1,
            IOPL    :  2,
            NT      :  1,
                    :  1,
            RF      :  1,
            VM      :  1,
            AC      :  1,
            VIF     :  1,
            VIP     :  1,
            ID      :  1,
                    : 10;
    } EFlags;
 
 
    // Define structure to hold register contents.
    typedef struct 
    {
        uint32_t eax, ebx, ecx, edx;
    } Registers;
 
 
    // Get the EFLAGS register.
    static EFlags GetEFlags(void)
    {
        EFlags flags;
        __asm__ volatile("pushf; popl %[flags]" : [flags] "=rm" (flags));
        return flags;
    }
 
 
    // Set the EFLAGS register.
    static void SetEFlags(EFlags flags)
    {
        __asm__ volatile("pushl %[flags]; popf" : : [flags] "rmi" (flags));
 
    }
 
 
    // Return true if the cpuid instruction is available.
    static bool cpuidIsSupported(void)
    {
        // The cpuid instruction is supported iff we can change the ID flag.
 
        // Record and copy the old flags.
        EFlags old = GetEFlags();
        EFlags new = old;
 
        // Change the ID flag.
        new.ID ^= 1;
        SetEFlags(new);
 
        // Get the altered flags and restore the old flags.
        EFlags newer = GetEFlags();
        SetEFlags(old);
 
        // Test whether the flag changed.
        return new.ID == newer.ID;
    }
 
 
    /*  Execute cpuid instruction with Ieax input and return results in
        structure.
    */
    static Registers cpuid(uint32_t Ieax)
    {
        Registers result;
        __asm__ volatile("\
                pushl   %%eax               \n\
                pushl   %%ebx               \n\
                pushl   %%ecx               \n\
                pushl   %%edx               \n\
                movl    %[Ieax], %%eax      \n\
                cpuid                       \n\
                movl    %%eax, %[eax]       \n\
                movl    %%ebx, %[ebx]       \n\
                movl    %%ecx, %[ecx]       \n\
                movl    %%edx, %[edx]       \n\
                popl    %%edx               \n\
                popl    %%ecx               \n\
                popl    %%ebx               \n\
                popl    %%eax"
            :
                [eax] "=m" (result.eax),
                [ebx] "=m" (result.ebx),
                [ecx] "=m" (result.ecx),
                [edx] "=m" (result.edx)
            :
                [Ieax] "m" (Ieax)
            );
        return result;
    }
 
 
    /*  Compose the family and model identification from cpuid information,
        returning them in the supplied objects.  Return 0 for success and
        non-zero for failure.
    */
    static int GetCPUFamilyAndModel(unsigned int *family,
        unsigned int *model)
    {
        // Fail if cpuid instruction is not supported.
        if (!cpuidIsSupported())
            return 1;
 
        // Get maximum input to cpuid instruction.
        Registers registers = cpuid(0);
        uint32_t maximum = registers.eax;
 
        // Fail if input 1 to cpuid instruction is not supported.
        if (maximum < 1)
            return 1;
 
        // Get results containing family and model information.
        registers = cpuid(1);
 
        // Declare type to access family and model information.
        union
        {
            uint32_t u;
            struct
            {
                unsigned int SteppingID     : 4;
                unsigned int Model          : 4;
                unsigned int Family         : 4;
                unsigned int Type           : 2;
                unsigned int filler14       : 2;
                unsigned int ExtendedModel  : 4;
                unsigned int ExtendedFamily : 8;
                unsigned int filler28       : 4;
            } s;
        } Version = { registers.eax };
 
        // Compose the family ID from the Family and ExtendedFamily fields.
        *family = Version.s.Family;
        if (Version.s.Family == 0x15)
            *family += Version.s.ExtendedFamily;
 
        // Compose the model ID from the Model and ExtendedModel fields.
        *model = Version.s.Model;
        if (Version.s.Family == 0x6 || Version.s.Family == 0xf)
            *model += (unsigned int) Version.s.ExtendedModel << 4;
 
        // Return success.
        return 0;
    }
 
 
#endif  // defined __i386__ // Conditionalize on architecture.
 
 
/*  Return the number of CPU cycles per iteration that the following
    routine, ConsumeCPUCycles, consumes.  Zero is returned if the
    information is not known.
*/
static unsigned int GetCPUCyclesCycles(void)
{
    #if defined __i386__    // Conditionalize on architecture.
        unsigned int family, model;
        if (GetCPUFamilyAndModel(&family, &model))
        {
            fprintf(stderr,
"Warning, unknown CPU, does not support cpuid instruction.\n");
            return 0;
        }
 
        switch (family)
        {
            case 6: switch (model)
                {
                    case 14:    return 33;  // Core.
                    case 15:    return 22;  // Core 2.
                }
                break;
        }
        fprintf(stderr, "Warning, unknown CPU, family %u, model %u.\n",
            family, model);
        return 0;
    #else
        // Ask system for CPU identification.
        unsigned int CPUSubtype;
        size_t SizeOfCPUSubtype = sizeof CPUSubtype;
        if (0 == sysctlbyname("hw.cpusubtype",
                &CPUSubtype, &SizeOfCPUSubtype, NULL, 0))
            // Set number of cycles per iteration according to CPU model.
            switch (CPUSubtype)
            {
                #if defined __ppc__ || defined __ppc64__
                                        // Conditionalize on architecture.
                    case CPU_SUBTYPE_POWERPC_750:
                        return 32;
                    case CPU_SUBTYPE_POWERPC_7450:
                        return 65;
                    case CPU_SUBTYPE_POWERPC_970:
                    case 10:
                        return 16;
                #elif defined __x86_64__
                    case 4:
                        return 22;      // Core 2 in 64-bit mode.
                #endif                  // Conditionalize on architecture.
                    default:
                        fprintf(stderr,
                            "Warning, unknown CPU, subtype %d.\n",
                            CPUSubtype);
                        return 0;
            }
        else
        {
            fprintf(stderr,
"Warning, sysctlbyname(\"hw.cpusubtype\") failed with errno = %d.\n",
                errno);
            return 0;
        }
    #endif  // defined __i386__ // Conditionalize on architecture.
    return 0;
}
 
 
/*  ConsumeCPUCycles keeps the CPU busy for a while.  The intent is to
    execute a loop that consumes a known number of CPU cycles, so the
    time taken can be compared to time reported by other time-measuring
    facilities, such as the time-base registers.
 
    For example, something like the following might happen if the time-base
    registers are being used:
 
        Call this routine with 1 for iterations so that all instructions
        and data are loaded into cache.
        
        Call this routine with 2 for iterations and record the time-base
        registers before and after that call.  Let m0 be the difference.
 
        Call this routine with 1002 for iterations and record the time-base
        registers before and after that call.  Let m1 be the difference.
 
        m1-m0 is the number of ticks of the time base registers that 1000
        iterations takes.  Suppose, for example only, this is 296.
 
        This routine returns a value in *CPUCyclesPerIteration.  1000 times
        that value is the number of CPU cycles that 1000 iterations takes.
        Suppose this is 16,000.
 
        In this example, the measurements would show there are about 54
        (16,000 / 296) CPU cycles per tick of the time-base registers.
 
    Input:
 
        unsigned int iterations
            The number of iterations to execute.
 
        unsigned int *CPUCyclesPerIteration
            Address of one unsigned int to be used for output.
 
    Output:
 
        None.
 
    When measuring, this routine should be called three times:
 
        Once with a small number of iterations to load all instructions
        and data into cache.  (One iteration might be sufficient.)
 
        Once with a small number of iterations to measure overhead.  (Two
        iterations is might be a good choice to allow for the first time
        the loop branches back being different from subsequent times.)
 
        Once with a large number of iterations to measure the loop.
 
    The difference between execution times of the last two calls should be
    used.
 
    Interrupts should be disabled while the three calls are performed.  If
    that is not possible, the sequence of three calls should be restarted
    if an interrupt occurs.
*/
static void ConsumeCPUCycles(unsigned int iterations, void *UnusedPointer)
{
    #if defined __ppc__ || defined __ppc64__    // Architecture.
        /*  Instructions are executed in a loop.  These instructions are
            expected to take a known amount of CPU cycles on specific
            processors, but this should be checked with some other source,
            such as the operating system's report of clock frequency.
        */
        __asm__ volatile(
            "   mtctr   %[iterations]   \n\
                .align  5               \n\
                nop                     \n\
            0:                          \n\
                cror     0,  0,  0      \n\
                cror     1,  1,  1      \n\
                cror     2,  2,  2      \n\
                cror     3,  3,  3      \n\
                cror     4,  4,  4      \n\
                cror     5,  5,  5      \n\
                cror     6,  6,  6      \n\
                cror     7,  7,  7      \n\
                cror     8,  8,  8      \n\
                cror     9,  9,  9      \n\
                cror    10, 10, 10      \n\
                cror    11, 11, 11      \n\
                cror    12, 12, 12      \n\
                cror    13, 13, 13      \n\
                cror    14, 14, 14      \n\
                cror    15, 15, 15      \n\
                bdnz    0b              "
            :                                   // No outputs.
            :   [iterations] "r" (iterations)   // iterations is input.
            :   "ctr"                           // Counter is modified.
        );
    #elif IntelProcessor                        // Architecture.
        __asm__ volatile("                  \n\
                .align  5                   \n\
            0:                              \n\
                decl    %[iterations]       \n\
                nop                         \n\
                nop                         \n\
                nop                         \n\
                nop                         \n\
                nop                         \n\
                nop                         \n\
                nop                         \n\
                nop                         \n\
                nop                         \n\
                nop                         \n\
                nop                         \n\
                nop                         \n\
                nop                         \n\
                nop                         \n\
                nop                         \n\
                nop                         \n\
                nop                         \n\
                nop                         \n\
                nop                         \n\
                nop                         \n\
                nop                         \n\
                nop                         \n\
                nop                         \n\
                nop                         \n\
                nop                         \n\
                nop                         \n\
                nop                         \n\
                nop                         \n\
                nop                         \n\
                nop                         \n\
                nop                         \n\
                nop                         \n\
                nop                         \n\
                nop                         \n\
                nop                         \n\
                nop                         \n\
                nop                         \n\
                nop                         \n\
                nop                         \n\
                nop                         \n\
                nop                         \n\
                nop                         \n\
                nop                         \n\
                nop                         \n\
                nop                         \n\
                nop                         \n\
                nop                         \n\
                nop                         \n\
                nop                         \n\
                nop                         \n\
                nop                         \n\
                nop                         \n\
                nop                         \n\
                nop                         \n\
                nop                         \n\
                nop                         \n\
                nop                         \n\
                nop                         \n\
                nop                         \n\
                nop                         \n\
                nop                         \n\
                nop                         \n\
                nop                         \n\
                nop                         \n\
                jne     0b                  "
            :   [iterations] "+r" (iterations)
                    // iterations is altered input.
        );
    #endif                                      // Architecture.
 
    return;
}
 
 
#if defined NeedCHUD
 
    #include <CHUD/CHUD.h>
 
 
    /*  CheckCHUDStatus.
 
        If CHUD status is not success, print an error.
    */
    static void CheckCHUDStatus(int status, const char *routine)
    {
        if (status != chudSuccess)
            fprintf(stderr,
                "Error, %s returned %d.\nCHUD status string is \"%s\".\n",
                routine, status, chudGetStatusStr());
    }
 
 
    /*  RequireCHUDStatus.
 
        If CHUD status is not success, print an error and exit.
    */
    static void RequireCHUDStatus(int status, const char *routine)
    {
        CheckCHUDStatus(status, routine);
        if (status != chudSuccess)
            exit(EXIT_FAILURE);
    }
 
 
    /*  StopClockServices.
 
        Release any facilities acquired for clock services.
    */
    static void StopClockServices(void)
    {
        int result = chudStopPMCs();
        CheckCHUDStatus(result, "chudStopPMCs");
 
        result = chudReleaseSamplingFacility(0);
        CheckCHUDStatus(result, "chudReleaseSamplingFacility");
 
        // Make all processors available again.
        result = chudSetNumPhysicalProcessors(chudPhysicalProcessorCount());
        CheckCHUDStatus(result, "chudSetNumPhysicalProcessors");
        result = chudSetNumLogicalProcessors(chudLogicalProcessorCount());
        CheckCHUDStatus(result, "chudSetNumLogicalProcessors");
 
        chudCleanup();
    }
 
 
#endif  // defined NeedCHUD
 
 
// Include things needed by set_time_constraint_policy routine below.
#include <mach/mach.h>
#include <mach/host_info.h>
#include <mach/mach_error.h>
#include <mach/mach_types.h>
#include <mach/message.h>
#include <mach/mach_syscalls.h>
 
 
/*  Set this thread to "time constraint" policy, that is, real-time priority.
    The hope is this will improve timing by preventing time-sharing threads
    from interrupting.
*/
static kern_return_t set_time_constraint_policy()
{
    thread_time_constraint_policy_data_t info;
    mach_msg_type_number_t count = THREAD_TIME_CONSTRAINT_POLICY_COUNT;
    boolean_t get_default = TRUE;
 
    kern_return_t result = thread_policy_get(mach_thread_self(),
        THREAD_TIME_CONSTRAINT_POLICY, (thread_policy_t) &info, &count,
        &get_default);
    if (result != KERN_SUCCESS)
        return result;
 
    return thread_policy_set(mach_thread_self(), THREAD_TIME_CONSTRAINT_POLICY,
        (thread_policy_t) &info, THREAD_TIME_CONSTRAINT_POLICY_COUNT);
}
 
 
/*  StartClockServices.
 
    Initialize the environment for timing routines.  If resources are acquired
    that should be free, an exit handler is set to release them.  (In
    particular, if we set the system to use a single CPU while timing, we want
    to reset it afterward to use all CPUs.)
*/
static void StartClockServices(void)
{
    // Execute this routine only once.
    static bool started = false;
    if (started)
        return;
    started = true;
 
    // Set this thread to real-time scheduling.
    if (KERN_SUCCESS != set_time_constraint_policy())
        fprintf(stderr,
            "Warning, unable to set thread to real-time scheduling.\n");
 
#if defined NeedCHUD
 
        // Initialize CHUD facilities.
 
        int result = chudInitialize();
        RequireCHUDStatus(result, "chudInitialize");
 
        // Restrict system to one processor.
        // (What we really want is to bind this process to one processor.)
        result = chudSetNumPhysicalProcessors(1);
        CheckCHUDStatus(result, "chudSetNumPhysicalProcessors");
        result = chudSetNumLogicalProcessors(1);
        CheckCHUDStatus(result, "chudSetNumLogicalProcessors");
 
        result = chudAcquireSamplingFacility(CHUD_NONBLOCKING);
        RequireCHUDStatus(result, "chudAcquireSamplingFacility");
 
        // On a 970 CPU, PMC1 event 15 is CPU cycles.
        result = chudSetPMCEvent(chudCPU1Dev, PMC_1, 15);
        RequireCHUDStatus(result, "chudSetPMCEvent");
 
        result = chudSetPMCMode(chudCPU1Dev, PMC_1, chudCounter);
        RequireCHUDStatus(result, "chudSetPMCMode");
 
        // On a 970 CPU, PMC2 event 10 is PMC1 overflow.
        result = chudSetPMCEvent(chudCPU1Dev, PMC_2, 10);
        RequireCHUDStatus(result, "chudSetPMCEvent");
 
        result = chudSetPMCMode(chudCPU1Dev, PMC_2, chudCounter);
        RequireCHUDStatus(result, "chudSetPMCMode");
 
        result = chudClearPMCs();
        RequireCHUDStatus(result, "chudClearPMCs");
 
        result = chudStartPMCs();
        RequireCHUDStatus(result, "chudStartPMCs");
 
        // Set exit handler to release resources.
        result = atexit(StopClockServices);
        if (result != 0)
        {
            fprintf(stderr, "Error, atexit returned %d.\n", result);
            StopClockServices();
            exit(EXIT_FAILURE);
        }
 
#endif  // defined NeedCHUD
}
 
 
/*  Define IgnoreInterrupts to ignore whether interrupts occur.  Omit it to
    test for interrupts and rerun a timing if an interrupt occurred during it.
*/
#define IgnoreInterrupts
 
 
#if !defined IgnoreInterrupts
    /*  We are using lwarx and stwcx. instructions below, and they maintain a
        reservation address with a resolution of 128-byte blocks of memory on a
        PowerPC 970, and no larger on other processors we are currently
        interested in.  It is recommended that block be used only for locks and
        protected data.  We might not need to do that for our purposes
        (detecting context switches), but it is safest to keep the block from
        being used for anything else.
 
        An easy way to get a word in its own block of 128 bytes is to allocate
        almost twice as much and use the word in the middle.  Non-standard
        compiler features might do this with less wasted space, but this is
        just a timing program, so space is not critical.
    */
    static unsigned int ReserveBlock[2*128 / sizeof(unsigned int) - 1];
    static unsigned int * const ReservedWord =
        &ReserveBlock[128 / sizeof *ReserveBlock - 1];
#endif
 
 
// Execute a lwarx instruction on ReservedWord.
static inline void LoadAndReserve(void)
{
    #if !defined IgnoreInterrupts
        __asm__ volatile(
            "   lwarx   r0, 0, %[ReservedWord]      "
            :                                       // No output.
            :   [ReservedWord] "r" (ReservedWord)   // ReservedWord is input.
            :   "r0"                                // r0 is modified.
        );
    #endif
}
 
 
/*  Execute a stwcx. instruction on ReservedWord and return the bit it sets in
    the condition register.
*/
static inline unsigned int StoreConditional(void)
{
    #if !defined IgnoreInterrupts
        uint32_t result;
 
        __asm__ volatile(
            "   stwcx.  r0, 0, %[ReservedWord]      \n\
                mfcr    %[result]                   "
            :   [result] "=r" (result)              // result is output.
            :   [ReservedWord] "r" (ReservedWord)   // ReservedWord is input.
        );
 
        return result & 0x20000000;
    #else   // !defined IgnoreInterrupts
        return 1;
    #endif  // !defined IgnoreInterrupts
}
 
 
/*  This routine measures the gross amount of time it takes to execute an
    arbitrary routine, including whatever overhead there is, such as reading
    the clock value and loading the routine into cache.  The number of clock
    ticks execution takes is returned.
 
    Input:
 
        RoutineToBeTimedType routine.
            Address of a routine to measure.  (See typedef of this type.)
 
        unsigned int iterations.
            Number of iterations to tell the routine to perform.
 
        void *data.
            Pointer to be passed to the routine.
 
    Output:
 
        Return value.
            The number of clock ticks execution takes.
*/
static ClockValue MeasureGrossTime(
        RoutineToBeTimedType routine,
        unsigned int iterations,
        void *data
    )
{
    ClockValue t0, t1;
 
    // Get time before executing routine.
    t0 = ReadClock();
 
    // Execute routine.
    routine(iterations, data);
 
    // Get time after executing routine.
    t1 = ReadClock();
 
    // Return difference in times.
    return SubtractClock(t1, t0);
}
 
 
/*  MeasureNetTime.
    
    This routine measures the net amount of time it takes to execute multiple
    iterations in an arbitrary routine, excluding overhead such as reading the
    clock value and loading the routine into cache.  The number of clock ticks
    taken is returned.
 
    Input:
 
        RoutineToBeTimedType routine.
            Address of a routine to measure.  (See typedef of this type.)
 
        unsigned int iterations.
            Number of iterations to tell the routine to perform.
 
        void *data.
            Pointer to be passed to the routine.
 
        samples.
            Number of samples to take.  Some statistical analysis might be
            performed to try to divine the true execution time, filtering out
            noise from interrupts and other sources.  For now, we'll just take
            the minimum of all the samples.
 
    Output:
 
        Return value.
            Number of clock ticks to execute the specified number of
            iterations.
*/
static ClockValue MeasureNetTime(
        RoutineToBeTimedType routine,
        unsigned int iterations,
        void *data,
        unsigned int samples
    )
{
    StartClockServices();
 
    /*  Step 0:  Load cache with a single iteration of the routine.
    */
    MeasureGrossTime(routine, 1, data);
 
 
    /*  Step 1:  Record the minimum time in <samples> samples to execute two
        iterations.
    */
 
    // Initialize the minimum execution time seen so far.
    ClockValue m0 = ClockMax;
 
    // Collect samples.
    for (unsigned int i = 0; i < samples; ++i)
    {
        ClockValue d0;
 
        /*  Repeat measurement until it is completed without an intervening
            process context switch.
        */
        do
        {
            /*  Set a reservation, which the operating system will clear if it
                switches processes.
            */
            LoadAndReserve();
 
            // Measure time for routine overhead.
            d0 = MeasureGrossTime(routine, 2, data);
 
        // Repeat the loop if the reservation was lost.
        } while (0 == StoreConditional());
 
        // Keep track of the minimum time seen so far.
        if (ClockLessThan(d0, m0))
             m0 = d0;
    }
 
 
    /*  Step 2:  Record the minimum time in <samples> samples to execute
        2+<iterations> iterations.
    */
 
    // Initialize the minimum execution time seen so far.
    ClockValue m1 = ClockMax;
 
    for (unsigned int i = 0; i < samples; ++i)
    {
        ClockValue d1;
 
        /*  Repeat measurement until it is completed without an intervening
            process context switch.
        */
        do
        {
            /*  Set a reservation, which the operating system will clear if it
                switches processes.
            */
            LoadAndReserve();
 
            // Measure time for routine overhead and loop execution.
            d1 = MeasureGrossTime(routine, 2+iterations, data);
 
        // Repeat the loop if the reservation was lost.
        } while (0 == StoreConditional());
 
        // Keep track of the minimum time seen so far.
        if (ClockLessThan(d1, m1))
             m1 = d1;
    }
 
 
    /*  Step 3:  Calculate the time to execute <iterations> iterations as the
        difference between the time to execute 2 iterations and the time to
        execute 2+<iterations> iterations.
    */
    return SubtractClock(m1, m0);
}
 
 
/*  Return the number of CPU cycles in one clock tick determined by hardware
    means, such as executing code with a known execution time.
*/
static double CPUCyclesPerTickViaHardware(void)
{
    /*  Get the number of CPU cycles one iteration of the loop in
        ConsumeCPUCycles takes.
    */
    unsigned int Cycles = GetCPUCyclesCycles();
 
    // Give up if we do not know how long the loop takes on this CPU.
    if (Cycles == 0)
        return 0;
 
    static const int Iterations = 5400;
 
    // Get the number of clock ticks Iterations iterations of the loop take.
    ClockValue Ticks = MeasureNetTime(ConsumeCPUCycles, Iterations, NULL, 1000);
 
    // Return cycles per tick.
    return Cycles * Iterations / ClockToDouble(Ticks);
}
 
 
/*  Return the number of CPU cycles in one clock tick determined by asking
    the operating system.
*/
static double CPUCyclesPerTickViaSystem(void)
{
    #if TimingMethod == TM_TB || TimingMethod == TM_UT
 
        /*  The time-base registers (which UpTime uses) should tick at
            hw.tbfrequency, and the CPU should tick at hw.cpufrequency, so the
            CPU cycles per time-base tick should be hw.cpufrequency /
            hw.tbfrequency.
        */
        uint64_t CPUFrequency, TBFrequency;
        size_t SizeOf = sizeof CPUFrequency;
        if (0 != sysctlbyname("hw.cpufrequency",
                &CPUFrequency, &SizeOf, NULL, 0))
        {
            fprintf(stderr,
"Warning, unable to get hw.cpufrequency from sysctlbyname.\n");
            return 0;
        }
        if (0 != sysctlbyname("hw.tbfrequency",
                &TBFrequency, &SizeOf, NULL, 0))
        {
            fprintf(stderr,
"Warning, unable to get hw.tbfrequency from sysctlbyname.\n");
            return 0;
        }
        return (double) CPUFrequency / TBFrequency;
 
    #elif TimingMethod == TM_PMC
 
        /*  The PowerPC performance monitor counter event we use should be one
            CPU cycle per tick.
        */
        return 1;
 
    #elif TimingMethod == TM_TOD
 
        /*  The CPU should tick at hw.cpufrequency, so the CPU cycles per
            microsecond tick should be hw.cpufrequency / 1e6.
        */
        uint64_t CPUFrequency;
        size_t SizeOf = sizeof CPUFrequency;
        if (0 != sysctlbyname("hw.cpufrequency",
                &CPUFrequency, &SizeOf, NULL, 0))
        {
            fprintf(stderr,
"Warning, unable to get hw.cpufrequency from sysctlbyname.\n");
            return 0;
        }
 
        return CPUFrequency / 1e6;
 
    #elif TimingMethod == TM_MACH
 
        /*  Get ratio of mach_absolute_time ticks to nanoseconds.
            (One nanosecond is numer/denom times one tick.)
        */
        mach_timebase_info_data_t data;
        mach_timebase_info(&data);
 
        // Get what system thinks CPU frequency is.
        uint64_t CPUFrequency;
        size_t SizeOf = sizeof CPUFrequency;
        if (0 != sysctlbyname("hw.cpufrequency",
                &CPUFrequency, &SizeOf, NULL, 0))
        {
            fprintf(stderr,
"Warning, unable to get hw.cpufrequency from sysctlbyname.\n");
            return 0;
        }
        
        /*  Calculate expected value as number of CPU cycles in a second
            (CPUFrequency) divided by number of mach_absolute_time ticks
            in a billion nanoseconds.
        */
        return CPUFrequency / (1e9 * data.denom / data.numer);
 
    #else   // TimingMethod.
 
        #error "Code is not defined for selected timing method."
 
        /*  TM_TSC is not supported.  If we want to check the processor
            model, we could return 1 for:
 
                family 6, models  9 and 13;
                family 15, models 0, 1, and 2;
                P6 family processors.
 
            For other models, we have no way to convert the time stamp counter
            to cycles, because, according to Intel's specification, the
            increment rate "may be set" by the maximum core-clock to bus-clock
            ratio of the processor or by the frequency at which the processor
            is booted.  Additional examination of the documentation for each
            processor model would be needed, and it would be impossible to
            support unexamined processors.
 
            We could return 0 and let the hardware CPUCyclesPerTickViaHardware
            routine control the value used, but it also cannot support
            unexamined processors.
        */
 
    #endif  // TimingMethod.
}
 
 
// Return the number of CPU cycles in one clock tick.
static double CPUCyclesPerTick(void)
{
    /*  Cache the number in a static object and return it if we have
        computed it previously.
    */
    static double CachedCPUCyclesPerTick = 0;
 
    if (CachedCPUCyclesPerTick != 0)
        return CachedCPUCyclesPerTick;
 
    double CyclesPerTickViaHardware = CPUCyclesPerTickViaHardware();
    double CyclesPerTickViaSystem   = CPUCyclesPerTickViaSystem  ();
 
    if (CyclesPerTickViaHardware == 0)
    {
        // This must be a CPU model we are unfamiliar with.
 
        if (CyclesPerTickViaSystem == 0)
        {
            fprintf(stderr, "Error, no value is available for the number of CPU cycles per clock tick.\n");
            exit(EXIT_FAILURE);
        }
        else
            CachedCPUCyclesPerTick = CyclesPerTickViaSystem;
    }
    else
        if (CyclesPerTickViaSystem == 0)
            CachedCPUCyclesPerTick = CyclesPerTickViaHardware;
        else
        {
            const double Tolerance = .004;  // Maximum relative error allowed.
 
            // Compare observed and expected ratios.
            if (Tolerance <
                    fabs(CyclesPerTickViaSystem / CyclesPerTickViaHardware - 1))
                fprintf(stderr,
"Warning, hardware and operating system disagree about the number of CPU\n"
"cycles per clock tick.\n"
"\tHardware indicates %g cycles per tick.\n"
"\tSystem indicates %g cycles per tick.\n"
"\tUsing system value.\n",
                CyclesPerTickViaHardware,
                CyclesPerTickViaSystem);
 
            CachedCPUCyclesPerTick = CyclesPerTickViaSystem;
        }
 
    return CachedCPUCyclesPerTick;
}
 
 
/*  ClockToCPUCycles.
 
    Convert time (in a ClockValue object) to number of CPU cycles in that time.
*/
double ClockToCPUCycles(ClockValue t)
{
    return ClockToDouble(t) * CPUCyclesPerTick();
}
 
 
#include <time.h>
 
 
// See header file for description of this routine.
double MeasureNetTimeInCPUCycles(
        RoutineToBeTimedType routine,
        unsigned int iterations,
        void *data,
        unsigned int samples
    )
{
    return ClockToCPUCycles(MeasureNetTime(routine, iterations, data, samples))
        / iterations;
}