diff options
author | Bruce Beare <brucex.j.beare@intel.com> | 2010-03-04 10:30:51 -0800 |
---|---|---|
committer | Bruce Beare <brucex.j.beare@intel.com> | 2010-03-04 10:30:51 -0800 |
commit | 751a4435c3ef705b0e31faa14c75b9e2fd859959 (patch) | |
tree | bf347a463c948f0fc91d842f84f04f5e64954155 | |
parent | 56b2728f588644e40195908e68985b9214fcfe8a (diff) | |
download | oprofile-751a4435c3ef705b0e31faa14c75b9e2fd859959.tar.gz |
x86 oprofile enablement
-rw-r--r-- | Android.mk | 2 | ||||
-rw-r--r-- | libop/op_hw_specific.h | 15 | ||||
-rw-r--r-- | opcontrol/opcontrol.cpp | 176 |
3 files changed, 143 insertions, 50 deletions
@@ -1,3 +1 @@ -ifeq ($(TARGET_ARCH),arm) include $(call all-subdir-makefiles) -endif diff --git a/libop/op_hw_specific.h b/libop/op_hw_specific.h index 35080ad..b1e3b4c 100644 --- a/libop/op_hw_specific.h +++ b/libop/op_hw_specific.h @@ -20,7 +20,8 @@ static inline int cpuid_vendor(char *vnd) char v[12]; } v; unsigned eax; - asm("cpuid" : "=a" (eax), "=b" (v.b), "=c" (v.c), "=d" (v.d) : "0" (0)); + asm volatile( "pushl %%ebx; cpuid; movl %%ebx, %1; popl %%ebx" + : "=a" (eax), "=S" (v.b), "=c" (v.c), "=d" (v.d) : "0" (0)); return !strncmp(v.v, vnd, 12); } @@ -46,7 +47,8 @@ static inline void workaround_nehalem_aaj79(unsigned *ebx) if (!cpuid_vendor("GenuineIntel")) return; - asm("cpuid" : "=a" (v.eax) : "0" (1) : "ecx","ebx","edx"); + asm volatile( "pushl %%ebx; cpuid; movl %%ebx, %1; popl %%ebx" + : "=a" (v.eax) : "0" (1) : "ecx","edx"); model = (v.ext_model << 4) + v.model; if (v.family != 6 || model != 26 || v.stepping > 4) return; @@ -57,7 +59,8 @@ static inline unsigned arch_get_filter(op_cpu cpu_type) { if (cpu_type == CPU_ARCH_PERFMON) { unsigned ebx, eax; - asm("cpuid" : "=a" (eax), "=b" (ebx) : "0" (0xa) : "ecx","edx"); + asm volatile( "pushl %%ebx; cpuid; movl %%ebx, %1; popl %%ebx" + : "=a" (eax), "=S" (ebx) : "0" (0xa) : "ecx","edx"); workaround_nehalem_aaj79(&ebx); return ebx & num_to_mask(eax >> 24); } @@ -68,7 +71,8 @@ static inline int arch_num_counters(op_cpu cpu_type) { if (cpu_type == CPU_ARCH_PERFMON) { unsigned v; - asm("cpuid" : "=a" (v) : "0" (0xa) : "ebx","ecx","edx"); + asm volatile( "pushl %%ebx; cpuid; movl %%eax, %1; popl %%ebx" + : "=a" (v) : "0" (0xa) : "ecx","edx"); return (v >> 8) & 0xff; } return -1; @@ -77,7 +81,8 @@ static inline int arch_num_counters(op_cpu cpu_type) static inline unsigned arch_get_counter_mask(void) { unsigned v; - asm("cpuid" : "=a" (v) : "0" (0xa) : "ebx","ecx","edx"); + asm volatile( "pushl %%ebx; cpuid; movl %%ebx, %1; popl %%ebx" + : "=a" (v) : "0" (0xa) : "ecx","edx"); return num_to_mask((v >> 8) & 0xff); } diff --git a/opcontrol/opcontrol.cpp b/opcontrol/opcontrol.cpp index 2558760..4cfc595 100644 --- a/opcontrol/opcontrol.cpp +++ b/opcontrol/opcontrol.cpp @@ -30,17 +30,16 @@ #include "op_config.h" -#if 0 -#define verbose(fmt...) printf(fmt) -#else -#define verbose(fmt...) -#endif +#define verbose(fmt...) if (verbose_print) printf(fmt) /* Experiments found that using a small interval may hang the device, and the * more events tracked simultaneously, the longer the interval has to be. */ -#if !defined(WITH_ARM_V7_A) +#if defined(__i386__) || defined(__x86_64__) +#define MAX_EVENTS 2 +int min_count[MAX_EVENTS] = {60000, 100000}; +#elif !defined(WITH_ARM_V7_A) #define MAX_EVENTS 3 int min_count[MAX_EVENTS] = {150000, 200000, 250000}; #else @@ -48,6 +47,7 @@ int min_count[MAX_EVENTS] = {150000, 200000, 250000}; int min_count[MAX_EVENTS] = {150000, 200000, 250000, 300000, 350000}; #endif +int verbose_print; int list_events; int show_usage; int setup; @@ -60,6 +60,7 @@ int reset; int selected_events[MAX_EVENTS]; int selected_counts[MAX_EVENTS]; +char callgraph[8]; char kernel_range[512]; char vmlinux[512]; @@ -69,84 +70,133 @@ struct option long_options[] = { {"reset", 0, &reset, 1}, {"setup", 0, &setup, 1}, {"quick", 0, &quick, 1}, + {"callgraph", 1, 0, 'c'}, {"event", 1, 0, 'e'}, {"vmlinux", 1, 0, 'v'}, {"kernel-range", 1, 0, 'r'}, {"start", 0, &start, 1}, {"stop", 0, &stop, 1}, + {"dump", 0, 0, 'd'}, {"shutdown", 0, 0, 'h'}, {"status", 0, 0, 't'}, + {"verbose", 0, 0, 'V'}, {0, 0, 0, 0}, }; struct event_info { int id; + int um; const char *name; const char *explanation; } event_info[] = { -#if !defined(WITH_ARM_V7_A) +#if defined(__i386__) || defined(__x86_64__) + /* INTEL_ARCH_PERFMON events */ + + /* 0x3c counters:cpuid um:zero minimum:6000 filter:0 name:CPU_CLK_UNHALTED : + * Clock cycles when not halted + */ + {0x3c, 0, "CPU_CLK_UNHALTED", + "Clock cycles when not halted" }, + + /* event:0x3c counters:cpuid um:one minimum:6000 filter:2 name:UNHALTED_REFERENCE_CYCLES : + * Unhalted reference cycles + */ + {0x3c, 1, "UNHALTED_REFERENCE_CYCLES", + "Unhalted reference cycles" }, + + /* event:0xc0 counters:cpuid um:zero minimum:6000 filter:1 name:INST_RETIRED : + * number of instructions retired + */ + {0xc0, 0, "INST_RETIRED", + "number of instructions retired"}, + + /* event:0x2e counters:cpuid um:x41 minimum:6000 filter:5 name:LLC_MISSES : + * Last level cache demand requests from this core that missed the LLC + */ + {0x2e, 0x41, "LLC_MISSES", + "Last level cache demand requests from this core that missed the LLC"}, + + /* event:0x2e counters:cpuid um:x4f minimum:6000 filter:4 name:LLC_REFS : + * Last level cache demand requests from this core + */ + {0x2e, 0x4f, "LLC_REFS", + "Last level cache demand requests from this core"}, + + /* event:0xc4 counters:cpuid um:zero minimum:500 filter:6 name:BR_INST_RETIRED : + * number of branch instructions retired + */ + {0xc4, 0, "BR_INST_RETIRED", + "number of branch instructions retired"}, + + /* event:0xc5 counters:cpuid um:zero minimum:500 filter:7 name:BR_MISS_PRED_RETIRED : + * number of mispredicted branches retired (precise) + */ + {0xc5, 0, "BR_MISS_PRED_RETIRED", + "number of mispredicted branches retired (precise)"}, + +#elif !defined(WITH_ARM_V7_A) /* ARM V6 events */ - {0x00, "IFU_IFETCH_MISS", + {0x00, 0, "IFU_IFETCH_MISS", "number of instruction fetch misses"}, - {0x01, "CYCLES_IFU_MEM_STALL", + {0x01, 0, "CYCLES_IFU_MEM_STALL", "cycles instruction fetch pipe is stalled"}, - {0x02, "CYCLES_DATA_STALL", + {0x02, 0, "CYCLES_DATA_STALL", "cycles stall occurs for due to data dependency"}, - {0x03, "ITLB_MISS", + {0x03, 0, "ITLB_MISS", "number of Instruction MicroTLB misses"}, - {0x04, "DTLB_MISS", + {0x04, 0, "DTLB_MISS", "number of Data MicroTLB misses"}, - {0x05, "BR_INST_EXECUTED", + {0x05, 0, "BR_INST_EXECUTED", "branch instruction executed w/ or w/o program flow change"}, - {0x06, "BR_INST_MISS_PRED", + {0x06, 0, "BR_INST_MISS_PRED", "branch mispredicted"}, - {0x07, "INSN_EXECUTED", + {0x07, 0, "INSN_EXECUTED", "instructions executed"}, - {0x09, "DCACHE_ACCESS", + {0x09, 0, "DCACHE_ACCESS", "data cache access, cacheable locations"}, - {0x0a, "DCACHE_ACCESS_ALL", + {0x0a, 0, "DCACHE_ACCESS_ALL", "data cache access, all locations"}, - {0x0b, "DCACHE_MISS", + {0x0b, 0, "DCACHE_MISS", "data cache miss"}, - {0x0c, "DCACHE_WB", + {0x0c, 0, "DCACHE_WB", "data cache writeback, 1 event for every half cacheline"}, - {0x0d, "PC_CHANGE", + {0x0d, 0, "PC_CHANGE", "number of times the program counter was changed without a mode switch"}, - {0x0f, "TLB_MISS", + {0x0f, 0, "TLB_MISS", "Main TLB miss"}, - {0x10, "EXP_EXTERNAL", + {0x10, 0, "EXP_EXTERNAL", "Explicit external data access"}, - {0x11, "LSU_STALL", + {0x11, 0, "LSU_STALL", "cycles stalled because Load Store request queue is full"}, - {0x12, "WRITE_DRAIN", + {0x12, 0, "WRITE_DRAIN", "Times write buffer was drained"}, - {0xff, "CPU_CYCLES", + {0xff, 0, "CPU_CYCLES", "clock cycles counter"}, #else /* ARM V7 events */ - {0x00, "PMNC_SW_INCR", + {0x00, 0, "PMNC_SW_INCR", "Software increment of PMNC registers"}, - {0x01, "IFETCH_MISS", + {0x01, 0, "IFETCH_MISS", "Instruction fetch misses from cache or normal cacheable memory"}, - {0x02, "ITLB_MISS", + {0x02, 0, "ITLB_MISS", "Instruction fetch misses from TLB"}, - {0x03, "DCACHE_REFILL", + {0x03, 0, "DCACHE_REFILL", "Data R/W operation that causes a refill from cache or normal cacheable" "memory"}, - {0x04, "DCACHE_ACCESS", + {0x04, 0, "DCACHE_ACCESS", "Data R/W from cache"}, - {0x05, "DTLB_REFILL", + {0x05, 0, "DTLB_REFILL", "Data R/W that causes a TLB refill"}, - {0x06, "DREAD", + {0x06, 0, "DREAD", "Data read architecturally executed (note: architecturally executed = for" "instructions that are unconditional or that pass the condition code)"}, - {0x07, "DWRITE", + {0x07, 0, "DWRITE", "Data write architecturally executed"}, - {0x08, "INSTR_EXECUTED", + {0x08, 0, "INSTR_EXECUTED", "All executed instructions"}, - {0x09, "EXC_TAKEN", + {0x09, 0, "EXC_TAKEN", "Exception taken"}, - {0x0A, "EXC_EXECUTED", + {0x0A, 0, "EXC_EXECUTED", "Exception return architecturally executed"}, {0x0B, "CID_WRITE", "Instruction that writes to the Context ID Register architecturally" @@ -238,13 +288,19 @@ void usage() printf("\nopcontrol: usage:\n" " --list-events list event types\n" " --help this message\n" + " --verbose show extra status\n" " --setup setup directories\n" +#if defined(__i386__) || defined(__x86_64__) + " --quick setup and select CPU_CLK_UNHALTED:60000\n" +#else " --quick setup and select CPU_CYCLES:150000\n" +#endif " --status show configuration\n" " --start start data collection\n" " --stop stop data collection\n" " --reset clears out data from current session\n" " --shutdown kill the oprofile daeman\n" + " --callgraph=depth callgraph depth\n" " --event=eventspec\n" " Choose an event. May be specified multiple times.\n" " eventspec is in the form of name[:count], where :\n" @@ -459,8 +515,9 @@ void do_status() printf(" %9u samples received\n", num); num = read_num(OP_DRIVER_BASE"/stats/cpu0/sample_lost_overflow"); printf(" %9u samples lost overflow\n", num); -#if 0 - /* FIXME - backtrace seems broken */ + +#if defined(__i386__) || defined(__x86_64__) + /* FIXME on ARM - backtrace seems broken there */ num = read_num(OP_DRIVER_BASE"/stats/cpu0/backtrace_aborted"); printf(" %9u backtrace aborted\n", num); num = read_num(OP_DRIVER_BASE"/backtrace_depth"); @@ -495,13 +552,17 @@ int main(int argc, char * const argv[]) strcpy(kernel_range, ""); while (1) { - int c = getopt_long(argc, argv, "", long_options, &option_index); + int c = getopt_long(argc, argv, "c:e:v:r:dhVt", long_options, &option_index); if (c == -1) { break; } switch (c) { case 0: break; + /* --callgraph */ + case 'c': + strncpy(callgraph, optarg, sizeof(callgraph)); + break; /* --event */ case 'e': if (num_events == MAX_EVENTS) { @@ -521,15 +582,33 @@ int main(int argc, char * const argv[]) case 'r': sprintf(kernel_range, "-r %s", optarg); break; + case 'd': + /* --dump */ { + int pid = read_num(OP_DATA_DIR"/lock"); + echo_dev("1", 0, "dump", -1); + if (pid >= 0) { + sleep(1); + kill(pid, SIGHUP); + } + break; + } /* --shutdown */ case 'h': { int pid = read_num(OP_DATA_DIR"/lock"); if (pid >= 0) { + kill(pid, SIGHUP); /* Politely ask the daemon to close files */ + sleep(1); + kill(pid, SIGTERM);/* Politely ask the daemon to die */ + sleep(1); kill(pid, SIGKILL); } setup_session_dir(); break; } + /* --verbose */ + case 'V': + verbose_print++; + break; /* --status */ case 't': do_status(); @@ -547,7 +626,11 @@ int main(int argc, char * const argv[]) } if (quick) { +#if defined(__i386__) || defined(__x86_64__) + process_event("CPU_CLK_UNHALTED"); +#else process_event("CPU_CYCLES"); +#endif setup = 1; } @@ -566,12 +649,18 @@ int main(int argc, char * const argv[]) } } + if (strlen(callgraph)) { + echo_dev(callgraph, 0, "backtrace_depth", -1); + } + if (num_events != 0) { int i; strcpy(command, "oprofiled --session-dir="OP_DATA_DIR); -#if !defined(WITH_ARM_V7_A) +#if defined(__i386__) || defined(__x86_64__) + /* Nothing */ +#elif !defined(WITH_ARM_V7_A) /* Since counter #3 can only handle CPU_CYCLES, check and shuffle the * order a bit so that the maximal number of events can be profiled * simultaneously @@ -624,15 +713,16 @@ int main(int argc, char * const argv[]) * --events=CYCLES_DATA_STALL:2:0:200000:0:1:1,.... */ snprintf(command+strlen(command), 1024 - strlen(command), - "%s:%d:%d:%d:0:1:1", + "%s:%d:%d:%d:%d:1:1", event_info[event_idx].name, event_info[event_idx].id, i, - selected_counts[i]); + selected_counts[i], + event_info[event_idx].um); setup_result |= echo_dev("1", 0, "user", i); setup_result |= echo_dev("1", 0, "kernel", i); - setup_result |= echo_dev("0", 0, "unit_mask", i); + setup_result |= echo_dev(NULL, event_info[event_idx].um, "unit_mask", i); setup_result |= echo_dev("1", 0, "enabled", i); setup_result |= echo_dev(NULL, selected_counts[i], "count", i); setup_result |= echo_dev(NULL, event_info[event_idx].id, |