ClockServices.c
/* |
File: ClockServices.c |
Abstract: |
This module contains routines to measure execution time. It is not needed |
to understand the lessons of the WWDC session. Additional information is |
in ClockServices.h. |
Disclaimer: IMPORTANT: This Apple software is supplied to you by |
Apple Inc. ("Apple") in consideration of your agreement to the |
following terms, and your use, installation, modification or |
redistribution of this Apple software constitutes acceptance of these |
terms. If you do not agree with these terms, please do not use, |
install, modify or redistribute this Apple software. |
In consideration of your agreement to abide by the following terms, and |
subject to these terms, Apple grants you a personal, non-exclusive |
license, under Apple's copyrights in this original Apple software (the |
"Apple Software"), to use, reproduce, modify and redistribute the Apple |
Software, with or without modifications, in source and/or binary forms; |
provided that if you redistribute the Apple Software in its entirety and |
without modifications, you must retain this notice and the following |
text and disclaimers in all such redistributions of the Apple Software. |
Neither the name, trademarks, service marks or logos of Apple Inc. |
may be used to endorse or promote products derived from the Apple |
Software without specific prior written permission from Apple. Except |
as expressly stated in this notice, no other rights or licenses, express |
or implied, are granted by Apple herein, including but not limited to |
any patent rights that may be infringed by your derivative works or by |
other works in which the Apple Software may be incorporated. |
The Apple Software is provided by Apple on an "AS IS" basis. APPLE |
MAKES NO WARRANTIES, EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION |
THE IMPLIED WARRANTIES OF NON-INFRINGEMENT, MERCHANTABILITY AND FITNESS |
FOR A PARTICULAR PURPOSE, REGARDING THE APPLE SOFTWARE OR ITS USE AND |
OPERATION ALONE OR IN COMBINATION WITH YOUR PRODUCTS. |
IN NO EVENT SHALL APPLE BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL |
OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF |
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS |
INTERRUPTION) ARISING IN ANY WAY OUT OF THE USE, REPRODUCTION, |
MODIFICATION AND/OR DISTRIBUTION OF THE APPLE SOFTWARE, HOWEVER CAUSED |
AND WHETHER UNDER THEORY OF CONTRACT, TORT (INCLUDING NEGLIGENCE), |
STRICT LIABILITY OR OTHERWISE, EVEN IF APPLE HAS BEEN ADVISED OF THE |
POSSIBILITY OF SUCH DAMAGE. |
Copyright (C) 2008 Apple Inc. All Rights Reserved. |
*/ |
#include <errno.h> |
#include <stdbool.h> |
#include <stdint.h> |
#include <stdio.h> |
#include <stdlib.h> |
#include "ClockServices.h" |
#define IntelProcessor (defined __i386__ || defined __x86_64__) |
// Select the timing method to be used. |
// Define constants to name the methods. |
#define TM_TB 0 // PowerPC's time-base registers. |
#define TM_TSC 1 // IA-32's time-stamp counter. |
#define TM_PMC 2 // Performance monitor counters. Works only on G5. |
#define TM_UT 3 // Carbon's UpTime. |
#define TM_TOD 4 // Open Unix's gettimeofday. |
#define TM_MACH 5 // mach_absolute_time. |
// Set the method in TimingMethod. |
#if defined __ppc__ || defined __ppc64__ |
#define TimingMethod TM_MACH // Method to use on PowerPC. |
#elif defined __i386__ |
#define TimingMethod TM_MACH // Method to use on IA-32. |
#elif defined __x86_64__ |
#define TimingMethod TM_MACH // Method to use on 64-bit Intel. |
#else |
#define TimingMethod TM_MACH // Method to use elsewhere. |
#endif |
/* Using the PMCs on earlier processors than the G5's requires changing |
the event number in the first chudSetPMCEvent and using only one PMC to |
count CPU cycles, because there is no event for overflow into another |
PMC. |
*/ |
// Set a flag based on the timing method in use. |
#if TimingMethod == TM_PMC |
// To use the performance monitor counters, we need CHUD facilities. |
#define NeedCHUD |
#endif |
#if TimingMethod == TM_UT |
#include <Carbon/Carbon.h> // For Uptime. |
#elif TimingMethod == TM_TOD |
#include <sys/time.h> // For gettimeofday. |
#elif TimingMethod == TM_MACH |
#include <mach/mach_time.h> // For mach_absolute_time. |
#endif |
/* Define ClockValue type. Four things are defined: |
ClockValue is a type name. |
ClockMax is a value that can be used when initializing a ClockValue |
that acts as the maximum value a ClockValue can have. |
upper and lower are member names used to access parts of a ClockValue. |
*/ |
#if TimingMethod == TM_UT |
typedef AbsoluteTime ClockValue; |
#define ClockMax { UINT32_MAX, UINT32_MAX } |
#define upper hi |
#define lower lo |
#elif TimingMethod == TM_TOD |
typedef struct timeval ClockValue; |
#define ClockMax { LONG_MAX, INT_MAX }; |
// (Relies on knowledge of timeval definition, unfortunately.) |
#define upper tv_sec |
#define lower tv_usec |
#elif TimingMethod == TM_MACH |
typedef uint64_t ClockValue; |
#define ClockMax UINT64_MAX |
#else |
typedef struct { uint32_t upper, lower; } ClockValue; |
#define ClockMax { UINT32_MAX, UINT32_MAX } |
#endif // TimingMethod. |
// Return the current value of the clock. |
static inline ClockValue ReadClock(void) |
{ |
#if TimingMethod == TM_UT |
// Use Carbon's UpTime. |
return UpTime(); |
#elif TimingMethod == TM_TSC |
// Read two values simultaneously from time stamp counter. |
volatile ClockValue result; |
__asm__ volatile("rdtsc" : "=d" (result.upper), "=a" (result.lower)); |
return result; |
#elif TimingMethod == TM_TOD |
// Use Open Unix's gettimeofday. |
ClockValue result; |
gettimeofday(&result, NULL); |
return result; |
#elif TimingMethod == TM_MACH |
return mach_absolute_time(); |
#else // TimingMethod. |
/* In these cases, we must read the time as two values from some |
source, but we cannot read them simultaneously, so the low value |
might roll over while we are reading them. We will read the upper |
value twice and choose a correct one afterward -- see below. |
*/ |
ClockValue result; |
uint32_t upper0, upper1; |
/* Get values from the upper and lower count or time-base registers. |
The lower register might overflow into the upper register while we |
are doing this, so we will read the upper register twice. We will |
assume the overflow can occur only once during these three |
instructions, so one of the two values of the upper register is |
correct. If the lower value is small, it will not overflow for a |
while, so the upper value we read immediately after it must be |
correct. Conversely, if the lower value is large, it must not have |
overflowed recently, so the upper value we read immediately before |
it must be correct. |
*/ |
#if TimingMethod == TM_PMC |
// Read values from performance monitor counters. |
__asm__ volatile("\ |
mfspr %[upper0], 772 \n\ |
mfspr %[lower] , 771 \n\ |
mfspr %[upper1], 772 " |
: [lower] "=r" (result.lower), // result.lower is output. |
[upper0] "=r" (upper0), // upper0 is output. |
[upper1] "=r" (upper1) // upper1 is output. |
); |
#elif TimingMethod == TM_TB |
// Read values from time-base registers. |
__asm__ volatile("\ |
mfspr %[upper0], 269 \n\ |
mfspr %[lower] , 268 \n\ |
mfspr %[upper1], 269 " |
: [lower] "=r" (result.lower), // result.lower is output. |
[upper0] "=r" (upper0), // upper0 is output. |
[upper1] "=r" (upper1) // upper1 is output. |
); |
#else // TimingMethod. |
#error "Code is not defined for selected timing method." |
#endif // TimingMethod. |
/* Choose which upper value to use. We could do this with a |
conditional expression: |
result.upper = result.lower < 2147483648u |
? upper1 |
: upper0; |
However, the execution time might change depending on whether the |
branch were correctly predicted or not. Instead, we will use a |
calculation with no branches. |
*/ |
// Use a signed shift to copy lower's high bit to all 32 bits. |
uint32_t mask = (int32_t) result.lower >> 31; |
result.upper = upper1 ^ ((upper0 ^ upper1) & mask); |
/* If mask is all zeroes, the above statement reduces: |
result.upper = upper1 ^ ((upper0 ^ upper1) & mask); |
result.upper = upper1 ^ ((upper0 ^ upper1) & 0); |
result.upper = upper1 ^ (0); |
result.upper = upper1; |
If mask is all ones, the above statement reduces: |
result.upper = upper1 ^ ((upper0 ^ upper1) & mask); |
result.upper = upper1 ^ ((upper0 ^ upper1) & ~0); |
result.upper = upper1 ^ (upper0 ^ upper1); |
result.upper = upper0; |
*/ |
return result; |
#endif // TimingMethod. |
} |
/* Subtract two clock values, t1 and t0, and return t1-t0. |
Since some ClockValue implementations are unsigned, t1 should be not less |
than t0. |
*/ |
static ClockValue SubtractClock(const ClockValue t1, const ClockValue t0) |
{ |
#if TimingMethod == TM_MACH |
return t1 - t0; |
#else // TimingMethod |
ClockValue result; |
result.upper = t1.upper - t0.upper; |
result.lower = t1.lower - t0.lower; |
// If necessary, "borrow" from upper word. |
if (t1.lower < t0.lower) |
--result.upper; |
return result; |
#endif // TimingMethod |
} |
/* Compare two clock values. Return true iff t0 < t1. |
This should only be used to compare relative times, such as durations of |
code execution, which are known to be small relative to the number of bits |
in a ClockValue. If absolute times are compared, there is no guarantee |
about the value of the upper word. It might be very high at one time and |
overflow to a very small value at a later time. |
*/ |
static bool ClockLessThan(const ClockValue t0, const ClockValue t1) |
{ |
#if TimingMethod == TM_MACH |
return t0 < t1; |
#else // TimingMethod |
if (t0.upper == t1.upper) |
return t0.lower < t1.lower; |
else |
return t0.upper < t1.upper; |
#endif // TimingMethod |
} |
/* Convert clock value to double, without changing units. That is, both the |
input and the return value are the same number of clock ticks. |
*/ |
static double ClockToDouble(const ClockValue t) |
{ |
#if TimingMethod == TM_TOD |
return t.upper * 1e6 + t.lower; |
#elif TimingMethod == TM_MACH |
return t; |
#else |
return t.upper * 4294967296. + t.lower; |
#endif |
} |
#include <math.h> |
#include <sys/sysctl.h> // Declare things needed for sysctlbyname. |
#include <mach/mach.h> // Define constants for CPU types. |
#if defined __i386__ // Conditionalize on architecture. |
// Describe the layout of the EFLAGS register. |
typedef struct |
{ |
unsigned int |
CF : 1, |
: 1, |
PF : 1, |
: 1, |
AF : 1, |
: 1, |
ZF : 1, |
SF : 1, |
TF : 1, |
IF : 1, |
DF : 1, |
OF : 1, |
IOPL : 2, |
NT : 1, |
: 1, |
RF : 1, |
VM : 1, |
AC : 1, |
VIF : 1, |
VIP : 1, |
ID : 1, |
: 10; |
} EFlags; |
// Define structure to hold register contents. |
typedef struct |
{ |
uint32_t eax, ebx, ecx, edx; |
} Registers; |
// Get the EFLAGS register. |
static EFlags GetEFlags(void) |
{ |
EFlags flags; |
__asm__ volatile("pushf; popl %[flags]" : [flags] "=rm" (flags)); |
return flags; |
} |
// Set the EFLAGS register. |
static void SetEFlags(EFlags flags) |
{ |
__asm__ volatile("pushl %[flags]; popf" : : [flags] "rmi" (flags)); |
} |
// Return true if the cpuid instruction is available. |
static bool cpuidIsSupported(void) |
{ |
// The cpuid instruction is supported iff we can change the ID flag. |
// Record and copy the old flags. |
EFlags old = GetEFlags(); |
EFlags new = old; |
// Change the ID flag. |
new.ID ^= 1; |
SetEFlags(new); |
// Get the altered flags and restore the old flags. |
EFlags newer = GetEFlags(); |
SetEFlags(old); |
// Test whether the flag changed. |
return new.ID == newer.ID; |
} |
/* Execute cpuid instruction with Ieax input and return results in |
structure. |
*/ |
static Registers cpuid(uint32_t Ieax) |
{ |
Registers result; |
__asm__ volatile("\ |
pushl %%eax \n\ |
pushl %%ebx \n\ |
pushl %%ecx \n\ |
pushl %%edx \n\ |
movl %[Ieax], %%eax \n\ |
cpuid \n\ |
movl %%eax, %[eax] \n\ |
movl %%ebx, %[ebx] \n\ |
movl %%ecx, %[ecx] \n\ |
movl %%edx, %[edx] \n\ |
popl %%edx \n\ |
popl %%ecx \n\ |
popl %%ebx \n\ |
popl %%eax" |
: |
[eax] "=m" (result.eax), |
[ebx] "=m" (result.ebx), |
[ecx] "=m" (result.ecx), |
[edx] "=m" (result.edx) |
: |
[Ieax] "m" (Ieax) |
); |
return result; |
} |
/* Compose the family and model identification from cpuid information, |
returning them in the supplied objects. Return 0 for success and |
non-zero for failure. |
*/ |
static int GetCPUFamilyAndModel(unsigned int *family, |
unsigned int *model) |
{ |
// Fail if cpuid instruction is not supported. |
if (!cpuidIsSupported()) |
return 1; |
// Get maximum input to cpuid instruction. |
Registers registers = cpuid(0); |
uint32_t maximum = registers.eax; |
// Fail if input 1 to cpuid instruction is not supported. |
if (maximum < 1) |
return 1; |
// Get results containing family and model information. |
registers = cpuid(1); |
// Declare type to access family and model information. |
union |
{ |
uint32_t u; |
struct |
{ |
unsigned int SteppingID : 4; |
unsigned int Model : 4; |
unsigned int Family : 4; |
unsigned int Type : 2; |
unsigned int filler14 : 2; |
unsigned int ExtendedModel : 4; |
unsigned int ExtendedFamily : 8; |
unsigned int filler28 : 4; |
} s; |
} Version = { registers.eax }; |
// Compose the family ID from the Family and ExtendedFamily fields. |
*family = Version.s.Family; |
if (Version.s.Family == 0x15) |
*family += Version.s.ExtendedFamily; |
// Compose the model ID from the Model and ExtendedModel fields. |
*model = Version.s.Model; |
if (Version.s.Family == 0x6 || Version.s.Family == 0xf) |
*model += (unsigned int) Version.s.ExtendedModel << 4; |
// Return success. |
return 0; |
} |
#endif // defined __i386__ // Conditionalize on architecture. |
/* Return the number of CPU cycles per iteration that the following |
routine, ConsumeCPUCycles, consumes. Zero is returned if the |
information is not known. |
*/ |
static unsigned int GetCPUCyclesCycles(void) |
{ |
#if defined __i386__ // Conditionalize on architecture. |
unsigned int family, model; |
if (GetCPUFamilyAndModel(&family, &model)) |
{ |
fprintf(stderr, |
"Warning, unknown CPU, does not support cpuid instruction.\n"); |
return 0; |
} |
switch (family) |
{ |
case 6: switch (model) |
{ |
case 14: return 33; // Core. |
case 15: return 22; // Core 2. |
} |
break; |
} |
fprintf(stderr, "Warning, unknown CPU, family %u, model %u.\n", |
family, model); |
return 0; |
#else |
// Ask system for CPU identification. |
unsigned int CPUSubtype; |
size_t SizeOfCPUSubtype = sizeof CPUSubtype; |
if (0 == sysctlbyname("hw.cpusubtype", |
&CPUSubtype, &SizeOfCPUSubtype, NULL, 0)) |
// Set number of cycles per iteration according to CPU model. |
switch (CPUSubtype) |
{ |
#if defined __ppc__ || defined __ppc64__ |
// Conditionalize on architecture. |
case CPU_SUBTYPE_POWERPC_750: |
return 32; |
case CPU_SUBTYPE_POWERPC_7450: |
return 65; |
case CPU_SUBTYPE_POWERPC_970: |
case 10: |
return 16; |
#elif defined __x86_64__ |
case 4: |
return 22; // Core 2 in 64-bit mode. |
#endif // Conditionalize on architecture. |
default: |
fprintf(stderr, |
"Warning, unknown CPU, subtype %d.\n", |
CPUSubtype); |
return 0; |
} |
else |
{ |
fprintf(stderr, |
"Warning, sysctlbyname(\"hw.cpusubtype\") failed with errno = %d.\n", |
errno); |
return 0; |
} |
#endif // defined __i386__ // Conditionalize on architecture. |
return 0; |
} |
/* ConsumeCPUCycles keeps the CPU busy for a while. The intent is to |
execute a loop that consumes a known number of CPU cycles, so the |
time taken can be compared to time reported by other time-measuring |
facilities, such as the time-base registers. |
For example, something like the following might happen if the time-base |
registers are being used: |
Call this routine with 1 for iterations so that all instructions |
and data are loaded into cache. |
Call this routine with 2 for iterations and record the time-base |
registers before and after that call. Let m0 be the difference. |
Call this routine with 1002 for iterations and record the time-base |
registers before and after that call. Let m1 be the difference. |
m1-m0 is the number of ticks of the time base registers that 1000 |
iterations takes. Suppose, for example only, this is 296. |
This routine returns a value in *CPUCyclesPerIteration. 1000 times |
that value is the number of CPU cycles that 1000 iterations takes. |
Suppose this is 16,000. |
In this example, the measurements would show there are about 54 |
(16,000 / 296) CPU cycles per tick of the time-base registers. |
Input: |
unsigned int iterations |
The number of iterations to execute. |
unsigned int *CPUCyclesPerIteration |
Address of one unsigned int to be used for output. |
Output: |
None. |
When measuring, this routine should be called three times: |
Once with a small number of iterations to load all instructions |
and data into cache. (One iteration might be sufficient.) |
Once with a small number of iterations to measure overhead. (Two |
iterations is might be a good choice to allow for the first time |
the loop branches back being different from subsequent times.) |
Once with a large number of iterations to measure the loop. |
The difference between execution times of the last two calls should be |
used. |
Interrupts should be disabled while the three calls are performed. If |
that is not possible, the sequence of three calls should be restarted |
if an interrupt occurs. |
*/ |
static void ConsumeCPUCycles(unsigned int iterations, void *UnusedPointer) |
{ |
#if defined __ppc__ || defined __ppc64__ // Architecture. |
/* Instructions are executed in a loop. These instructions are |
expected to take a known amount of CPU cycles on specific |
processors, but this should be checked with some other source, |
such as the operating system's report of clock frequency. |
*/ |
__asm__ volatile( |
" mtctr %[iterations] \n\ |
.align 5 \n\ |
nop \n\ |
0: \n\ |
cror 0, 0, 0 \n\ |
cror 1, 1, 1 \n\ |
cror 2, 2, 2 \n\ |
cror 3, 3, 3 \n\ |
cror 4, 4, 4 \n\ |
cror 5, 5, 5 \n\ |
cror 6, 6, 6 \n\ |
cror 7, 7, 7 \n\ |
cror 8, 8, 8 \n\ |
cror 9, 9, 9 \n\ |
cror 10, 10, 10 \n\ |
cror 11, 11, 11 \n\ |
cror 12, 12, 12 \n\ |
cror 13, 13, 13 \n\ |
cror 14, 14, 14 \n\ |
cror 15, 15, 15 \n\ |
bdnz 0b " |
: // No outputs. |
: [iterations] "r" (iterations) // iterations is input. |
: "ctr" // Counter is modified. |
); |
#elif IntelProcessor // Architecture. |
__asm__ volatile(" \n\ |
.align 5 \n\ |
0: \n\ |
decl %[iterations] \n\ |
nop \n\ |
nop \n\ |
nop \n\ |
nop \n\ |
nop \n\ |
nop \n\ |
nop \n\ |
nop \n\ |
nop \n\ |
nop \n\ |
nop \n\ |
nop \n\ |
nop \n\ |
nop \n\ |
nop \n\ |
nop \n\ |
nop \n\ |
nop \n\ |
nop \n\ |
nop \n\ |
nop \n\ |
nop \n\ |
nop \n\ |
nop \n\ |
nop \n\ |
nop \n\ |
nop \n\ |
nop \n\ |
nop \n\ |
nop \n\ |
nop \n\ |
nop \n\ |
nop \n\ |
nop \n\ |
nop \n\ |
nop \n\ |
nop \n\ |
nop \n\ |
nop \n\ |
nop \n\ |
nop \n\ |
nop \n\ |
nop \n\ |
nop \n\ |
nop \n\ |
nop \n\ |
nop \n\ |
nop \n\ |
nop \n\ |
nop \n\ |
nop \n\ |
nop \n\ |
nop \n\ |
nop \n\ |
nop \n\ |
nop \n\ |
nop \n\ |
nop \n\ |
nop \n\ |
nop \n\ |
nop \n\ |
nop \n\ |
nop \n\ |
nop \n\ |
jne 0b " |
: [iterations] "+r" (iterations) |
// iterations is altered input. |
); |
#endif // Architecture. |
return; |
} |
#if defined NeedCHUD |
#include <CHUD/CHUD.h> |
/* CheckCHUDStatus. |
If CHUD status is not success, print an error. |
*/ |
static void CheckCHUDStatus(int status, const char *routine) |
{ |
if (status != chudSuccess) |
fprintf(stderr, |
"Error, %s returned %d.\nCHUD status string is \"%s\".\n", |
routine, status, chudGetStatusStr()); |
} |
/* RequireCHUDStatus. |
If CHUD status is not success, print an error and exit. |
*/ |
static void RequireCHUDStatus(int status, const char *routine) |
{ |
CheckCHUDStatus(status, routine); |
if (status != chudSuccess) |
exit(EXIT_FAILURE); |
} |
/* StopClockServices. |
Release any facilities acquired for clock services. |
*/ |
static void StopClockServices(void) |
{ |
int result = chudStopPMCs(); |
CheckCHUDStatus(result, "chudStopPMCs"); |
result = chudReleaseSamplingFacility(0); |
CheckCHUDStatus(result, "chudReleaseSamplingFacility"); |
// Make all processors available again. |
result = chudSetNumPhysicalProcessors(chudPhysicalProcessorCount()); |
CheckCHUDStatus(result, "chudSetNumPhysicalProcessors"); |
result = chudSetNumLogicalProcessors(chudLogicalProcessorCount()); |
CheckCHUDStatus(result, "chudSetNumLogicalProcessors"); |
chudCleanup(); |
} |
#endif // defined NeedCHUD |
// Include things needed by set_time_constraint_policy routine below. |
#include <mach/mach.h> |
#include <mach/host_info.h> |
#include <mach/mach_error.h> |
#include <mach/mach_types.h> |
#include <mach/message.h> |
#include <mach/mach_syscalls.h> |
/* Set this thread to "time constraint" policy, that is, real-time priority. |
The hope is this will improve timing by preventing time-sharing threads |
from interrupting. |
*/ |
static kern_return_t set_time_constraint_policy() |
{ |
thread_time_constraint_policy_data_t info; |
mach_msg_type_number_t count = THREAD_TIME_CONSTRAINT_POLICY_COUNT; |
boolean_t get_default = TRUE; |
kern_return_t result = thread_policy_get(mach_thread_self(), |
THREAD_TIME_CONSTRAINT_POLICY, (thread_policy_t) &info, &count, |
&get_default); |
if (result != KERN_SUCCESS) |
return result; |
return thread_policy_set(mach_thread_self(), THREAD_TIME_CONSTRAINT_POLICY, |
(thread_policy_t) &info, THREAD_TIME_CONSTRAINT_POLICY_COUNT); |
} |
/* StartClockServices. |
Initialize the environment for timing routines. If resources are acquired |
that should be free, an exit handler is set to release them. (In |
particular, if we set the system to use a single CPU while timing, we want |
to reset it afterward to use all CPUs.) |
*/ |
static void StartClockServices(void) |
{ |
// Execute this routine only once. |
static bool started = false; |
if (started) |
return; |
started = true; |
// Set this thread to real-time scheduling. |
if (KERN_SUCCESS != set_time_constraint_policy()) |
fprintf(stderr, |
"Warning, unable to set thread to real-time scheduling.\n"); |
#if defined NeedCHUD |
// Initialize CHUD facilities. |
int result = chudInitialize(); |
RequireCHUDStatus(result, "chudInitialize"); |
// Restrict system to one processor. |
// (What we really want is to bind this process to one processor.) |
result = chudSetNumPhysicalProcessors(1); |
CheckCHUDStatus(result, "chudSetNumPhysicalProcessors"); |
result = chudSetNumLogicalProcessors(1); |
CheckCHUDStatus(result, "chudSetNumLogicalProcessors"); |
result = chudAcquireSamplingFacility(CHUD_NONBLOCKING); |
RequireCHUDStatus(result, "chudAcquireSamplingFacility"); |
// On a 970 CPU, PMC1 event 15 is CPU cycles. |
result = chudSetPMCEvent(chudCPU1Dev, PMC_1, 15); |
RequireCHUDStatus(result, "chudSetPMCEvent"); |
result = chudSetPMCMode(chudCPU1Dev, PMC_1, chudCounter); |
RequireCHUDStatus(result, "chudSetPMCMode"); |
// On a 970 CPU, PMC2 event 10 is PMC1 overflow. |
result = chudSetPMCEvent(chudCPU1Dev, PMC_2, 10); |
RequireCHUDStatus(result, "chudSetPMCEvent"); |
result = chudSetPMCMode(chudCPU1Dev, PMC_2, chudCounter); |
RequireCHUDStatus(result, "chudSetPMCMode"); |
result = chudClearPMCs(); |
RequireCHUDStatus(result, "chudClearPMCs"); |
result = chudStartPMCs(); |
RequireCHUDStatus(result, "chudStartPMCs"); |
// Set exit handler to release resources. |
result = atexit(StopClockServices); |
if (result != 0) |
{ |
fprintf(stderr, "Error, atexit returned %d.\n", result); |
StopClockServices(); |
exit(EXIT_FAILURE); |
} |
#endif // defined NeedCHUD |
} |
/* Define IgnoreInterrupts to ignore whether interrupts occur. Omit it to |
test for interrupts and rerun a timing if an interrupt occurred during it. |
*/ |
#define IgnoreInterrupts |
#if !defined IgnoreInterrupts |
/* We are using lwarx and stwcx. instructions below, and they maintain a |
reservation address with a resolution of 128-byte blocks of memory on a |
PowerPC 970, and no larger on other processors we are currently |
interested in. It is recommended that block be used only for locks and |
protected data. We might not need to do that for our purposes |
(detecting context switches), but it is safest to keep the block from |
being used for anything else. |
An easy way to get a word in its own block of 128 bytes is to allocate |
almost twice as much and use the word in the middle. Non-standard |
compiler features might do this with less wasted space, but this is |
just a timing program, so space is not critical. |
*/ |
static unsigned int ReserveBlock[2*128 / sizeof(unsigned int) - 1]; |
static unsigned int * const ReservedWord = |
&ReserveBlock[128 / sizeof *ReserveBlock - 1]; |
#endif |
// Execute a lwarx instruction on ReservedWord. |
static inline void LoadAndReserve(void) |
{ |
#if !defined IgnoreInterrupts |
__asm__ volatile( |
" lwarx r0, 0, %[ReservedWord] " |
: // No output. |
: [ReservedWord] "r" (ReservedWord) // ReservedWord is input. |
: "r0" // r0 is modified. |
); |
#endif |
} |
/* Execute a stwcx. instruction on ReservedWord and return the bit it sets in |
the condition register. |
*/ |
static inline unsigned int StoreConditional(void) |
{ |
#if !defined IgnoreInterrupts |
uint32_t result; |
__asm__ volatile( |
" stwcx. r0, 0, %[ReservedWord] \n\ |
mfcr %[result] " |
: [result] "=r" (result) // result is output. |
: [ReservedWord] "r" (ReservedWord) // ReservedWord is input. |
); |
return result & 0x20000000; |
#else // !defined IgnoreInterrupts |
return 1; |
#endif // !defined IgnoreInterrupts |
} |
/* This routine measures the gross amount of time it takes to execute an |
arbitrary routine, including whatever overhead there is, such as reading |
the clock value and loading the routine into cache. The number of clock |
ticks execution takes is returned. |
Input: |
RoutineToBeTimedType routine. |
Address of a routine to measure. (See typedef of this type.) |
unsigned int iterations. |
Number of iterations to tell the routine to perform. |
void *data. |
Pointer to be passed to the routine. |
Output: |
Return value. |
The number of clock ticks execution takes. |
*/ |
static ClockValue MeasureGrossTime( |
RoutineToBeTimedType routine, |
unsigned int iterations, |
void *data |
) |
{ |
ClockValue t0, t1; |
// Get time before executing routine. |
t0 = ReadClock(); |
// Execute routine. |
routine(iterations, data); |
// Get time after executing routine. |
t1 = ReadClock(); |
// Return difference in times. |
return SubtractClock(t1, t0); |
} |
/* MeasureNetTime. |
This routine measures the net amount of time it takes to execute multiple |
iterations in an arbitrary routine, excluding overhead such as reading the |
clock value and loading the routine into cache. The number of clock ticks |
taken is returned. |
Input: |
RoutineToBeTimedType routine. |
Address of a routine to measure. (See typedef of this type.) |
unsigned int iterations. |
Number of iterations to tell the routine to perform. |
void *data. |
Pointer to be passed to the routine. |
samples. |
Number of samples to take. Some statistical analysis might be |
performed to try to divine the true execution time, filtering out |
noise from interrupts and other sources. For now, we'll just take |
the minimum of all the samples. |
Output: |
Return value. |
Number of clock ticks to execute the specified number of |
iterations. |
*/ |
static ClockValue MeasureNetTime( |
RoutineToBeTimedType routine, |
unsigned int iterations, |
void *data, |
unsigned int samples |
) |
{ |
StartClockServices(); |
/* Step 0: Load cache with a single iteration of the routine. |
*/ |
MeasureGrossTime(routine, 1, data); |
/* Step 1: Record the minimum time in <samples> samples to execute two |
iterations. |
*/ |
// Initialize the minimum execution time seen so far. |
ClockValue m0 = ClockMax; |
// Collect samples. |
for (unsigned int i = 0; i < samples; ++i) |
{ |
ClockValue d0; |
/* Repeat measurement until it is completed without an intervening |
process context switch. |
*/ |
do |
{ |
/* Set a reservation, which the operating system will clear if it |
switches processes. |
*/ |
LoadAndReserve(); |
// Measure time for routine overhead. |
d0 = MeasureGrossTime(routine, 2, data); |
// Repeat the loop if the reservation was lost. |
} while (0 == StoreConditional()); |
// Keep track of the minimum time seen so far. |
if (ClockLessThan(d0, m0)) |
m0 = d0; |
} |
/* Step 2: Record the minimum time in <samples> samples to execute |
2+<iterations> iterations. |
*/ |
// Initialize the minimum execution time seen so far. |
ClockValue m1 = ClockMax; |
for (unsigned int i = 0; i < samples; ++i) |
{ |
ClockValue d1; |
/* Repeat measurement until it is completed without an intervening |
process context switch. |
*/ |
do |
{ |
/* Set a reservation, which the operating system will clear if it |
switches processes. |
*/ |
LoadAndReserve(); |
// Measure time for routine overhead and loop execution. |
d1 = MeasureGrossTime(routine, 2+iterations, data); |
// Repeat the loop if the reservation was lost. |
} while (0 == StoreConditional()); |
// Keep track of the minimum time seen so far. |
if (ClockLessThan(d1, m1)) |
m1 = d1; |
} |
/* Step 3: Calculate the time to execute <iterations> iterations as the |
difference between the time to execute 2 iterations and the time to |
execute 2+<iterations> iterations. |
*/ |
return SubtractClock(m1, m0); |
} |
/* Return the number of CPU cycles in one clock tick determined by hardware |
means, such as executing code with a known execution time. |
*/ |
static double CPUCyclesPerTickViaHardware(void) |
{ |
/* Get the number of CPU cycles one iteration of the loop in |
ConsumeCPUCycles takes. |
*/ |
unsigned int Cycles = GetCPUCyclesCycles(); |
// Give up if we do not know how long the loop takes on this CPU. |
if (Cycles == 0) |
return 0; |
static const int Iterations = 5400; |
// Get the number of clock ticks Iterations iterations of the loop take. |
ClockValue Ticks = MeasureNetTime(ConsumeCPUCycles, Iterations, NULL, 1000); |
// Return cycles per tick. |
return Cycles * Iterations / ClockToDouble(Ticks); |
} |
/* Return the number of CPU cycles in one clock tick determined by asking |
the operating system. |
*/ |
static double CPUCyclesPerTickViaSystem(void) |
{ |
#if TimingMethod == TM_TB || TimingMethod == TM_UT |
/* The time-base registers (which UpTime uses) should tick at |
hw.tbfrequency, and the CPU should tick at hw.cpufrequency, so the |
CPU cycles per time-base tick should be hw.cpufrequency / |
hw.tbfrequency. |
*/ |
uint64_t CPUFrequency, TBFrequency; |
size_t SizeOf = sizeof CPUFrequency; |
if (0 != sysctlbyname("hw.cpufrequency", |
&CPUFrequency, &SizeOf, NULL, 0)) |
{ |
fprintf(stderr, |
"Warning, unable to get hw.cpufrequency from sysctlbyname.\n"); |
return 0; |
} |
if (0 != sysctlbyname("hw.tbfrequency", |
&TBFrequency, &SizeOf, NULL, 0)) |
{ |
fprintf(stderr, |
"Warning, unable to get hw.tbfrequency from sysctlbyname.\n"); |
return 0; |
} |
return (double) CPUFrequency / TBFrequency; |
#elif TimingMethod == TM_PMC |
/* The PowerPC performance monitor counter event we use should be one |
CPU cycle per tick. |
*/ |
return 1; |
#elif TimingMethod == TM_TOD |
/* The CPU should tick at hw.cpufrequency, so the CPU cycles per |
microsecond tick should be hw.cpufrequency / 1e6. |
*/ |
uint64_t CPUFrequency; |
size_t SizeOf = sizeof CPUFrequency; |
if (0 != sysctlbyname("hw.cpufrequency", |
&CPUFrequency, &SizeOf, NULL, 0)) |
{ |
fprintf(stderr, |
"Warning, unable to get hw.cpufrequency from sysctlbyname.\n"); |
return 0; |
} |
return CPUFrequency / 1e6; |
#elif TimingMethod == TM_MACH |
/* Get ratio of mach_absolute_time ticks to nanoseconds. |
(One nanosecond is numer/denom times one tick.) |
*/ |
mach_timebase_info_data_t data; |
mach_timebase_info(&data); |
// Get what system thinks CPU frequency is. |
uint64_t CPUFrequency; |
size_t SizeOf = sizeof CPUFrequency; |
if (0 != sysctlbyname("hw.cpufrequency", |
&CPUFrequency, &SizeOf, NULL, 0)) |
{ |
fprintf(stderr, |
"Warning, unable to get hw.cpufrequency from sysctlbyname.\n"); |
return 0; |
} |
/* Calculate expected value as number of CPU cycles in a second |
(CPUFrequency) divided by number of mach_absolute_time ticks |
in a billion nanoseconds. |
*/ |
return CPUFrequency / (1e9 * data.denom / data.numer); |
#else // TimingMethod. |
#error "Code is not defined for selected timing method." |
/* TM_TSC is not supported. If we want to check the processor |
model, we could return 1 for: |
family 6, models 9 and 13; |
family 15, models 0, 1, and 2; |
P6 family processors. |
For other models, we have no way to convert the time stamp counter |
to cycles, because, according to Intel's specification, the |
increment rate "may be set" by the maximum core-clock to bus-clock |
ratio of the processor or by the frequency at which the processor |
is booted. Additional examination of the documentation for each |
processor model would be needed, and it would be impossible to |
support unexamined processors. |
We could return 0 and let the hardware CPUCyclesPerTickViaHardware |
routine control the value used, but it also cannot support |
unexamined processors. |
*/ |
#endif // TimingMethod. |
} |
// Return the number of CPU cycles in one clock tick. |
static double CPUCyclesPerTick(void) |
{ |
/* Cache the number in a static object and return it if we have |
computed it previously. |
*/ |
static double CachedCPUCyclesPerTick = 0; |
if (CachedCPUCyclesPerTick != 0) |
return CachedCPUCyclesPerTick; |
double CyclesPerTickViaHardware = CPUCyclesPerTickViaHardware(); |
double CyclesPerTickViaSystem = CPUCyclesPerTickViaSystem (); |
if (CyclesPerTickViaHardware == 0) |
{ |
// This must be a CPU model we are unfamiliar with. |
if (CyclesPerTickViaSystem == 0) |
{ |
fprintf(stderr, "Error, no value is available for the number of CPU cycles per clock tick.\n"); |
exit(EXIT_FAILURE); |
} |
else |
CachedCPUCyclesPerTick = CyclesPerTickViaSystem; |
} |
else |
if (CyclesPerTickViaSystem == 0) |
CachedCPUCyclesPerTick = CyclesPerTickViaHardware; |
else |
{ |
const double Tolerance = .004; // Maximum relative error allowed. |
// Compare observed and expected ratios. |
if (Tolerance < |
fabs(CyclesPerTickViaSystem / CyclesPerTickViaHardware - 1)) |
fprintf(stderr, |
"Warning, hardware and operating system disagree about the number of CPU\n" |
"cycles per clock tick.\n" |
"\tHardware indicates %g cycles per tick.\n" |
"\tSystem indicates %g cycles per tick.\n" |
"\tUsing system value.\n", |
CyclesPerTickViaHardware, |
CyclesPerTickViaSystem); |
CachedCPUCyclesPerTick = CyclesPerTickViaSystem; |
} |
return CachedCPUCyclesPerTick; |
} |
/* ClockToCPUCycles. |
Convert time (in a ClockValue object) to number of CPU cycles in that time. |
*/ |
double ClockToCPUCycles(ClockValue t) |
{ |
return ClockToDouble(t) * CPUCyclesPerTick(); |
} |
#include <time.h> |
// See header file for description of this routine. |
double MeasureNetTimeInCPUCycles( |
RoutineToBeTimedType routine, |
unsigned int iterations, |
void *data, |
unsigned int samples |
) |
{ |
return ClockToCPUCycles(MeasureNetTime(routine, iterations, data, samples)) |
/ iterations; |
} |
Copyright © 2008 Apple Inc. All Rights Reserved. Terms of Use | Privacy Policy | Updated: 2008-06-06