original/man2/perf_event_open.2

   1 .\" Copyright (c) 2012, Vincent Weaver
   2 .\"
   3 .\" %%%LICENSE_START(GPLv2+_DOC_FULL)
   4 .\" This is free documentation; you can redistribute it and/or
   5 .\" modify it under the terms of the GNU General Public License as
   6 .\" published by the Free Software Foundation; either version 2 of
   7 .\" the License, or (at your option) any later version.
   8 .\"
   9 .\" The GNU General Public License's references to "object code"
  10 .\" and "executables" are to be interpreted as the output of any
  11 .\" document formatting or typesetting system, including
  12 .\" intermediate and printed output.
  13 .\"
  14 .\" This manual is distributed in the hope that it will be useful,
  15 .\" but WITHOUT ANY WARRANTY; without even the implied warranty of
  16 .\" MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  17 .\" GNU General Public License for more details.
  18 .\"
  19 .\" You should have received a copy of the GNU General Public
  20 .\" License along with this manual; if not, see
  21 .\" <http://www.gnu.org/licenses/>.
  22 .\" %%%LICENSE_END
  23 .\"
  24 .\" This document is based on the perf_event.h header file, the
  25 .\" tools/perf/design.txt file, and a lot of bitter experience.
  26 .\"
  27 .TH PERF_EVENT_OPEN 2 2014-04-06 "Linux" "Linux Programmer's Manual"
  28 .SH NAME
  29 perf_event_open \- set up performance monitoring
  30 .SH SYNOPSIS
  31 .nf
  32 .B #include <linux/perf_event.h>
  33 .B #include <linux/hw_breakpoint.h>
  34 .sp
  35 .BI "int perf_event_open(struct perf_event_attr *" attr ,
  36 .BI "                    pid_t " pid ", int " cpu ", int " group_fd ,
  37 .BI "                    unsigned long " flags  );
  38 .fi
  39
  40 .IR Note :
  41 There is no glibc wrapper for this system call; see NOTES.
  42 .SH DESCRIPTION
  43 Given a list of parameters,
  44 .BR perf_event_open ()
  45 returns a file descriptor, for use in subsequent system calls
  46 .RB ( read "(2), " mmap "(2), " prctl "(2), " fcntl "(2), etc.)."
  47 .PP
  48 A call to
  49 .BR perf_event_open ()
  50 creates a file descriptor that allows measuring performance
  51 information.
  52 Each file descriptor corresponds to one
  53 event that is measured; these can be grouped together
  54 to measure multiple events simultaneously.
  55 .PP
  56 Events can be enabled and disabled in two ways: via
  57 .BR ioctl (2)
  58 and via
  59 .BR prctl (2).
  60 When an event is disabled it does not count or generate overflows but does
  61 continue to exist and maintain its count value.
  62 .PP
  63 Events come in two flavors: counting and sampled.
  64 A
  65 .I counting
  66 event is one that is used for counting the aggregate number of events
  67 that occur.
  68 In general, counting event results are gathered with a
  69 .BR read (2)
  70 call.
  71 A
  72 .I sampling
  73 event periodically writes measurements to a buffer that can then
  74 be accessed via
  75 .BR mmap (2).
  76 .SS Arguments
  77 .P
  78 The
  79 .I pid
  80 and
  81 .I cpu
  82 arguments allow specifying which process and CPU to monitor:
  83 .TP
  84 .BR "pid == 0" " and " "cpu == \-1"
  85 This measures the current process/thread on any CPU.
  86 .TP
  87 .BR "pid == 0" " and " "cpu >= 0"
  88 This measures the current process/thread only
  89 when running on the specified CPU.
  90 .TP
  91 .BR "pid > 0" " and " "cpu == \-1"
  92 This measures the specified process/thread on any CPU.
  93 .TP
  94 .BR "pid > 0" " and " "cpu >= 0"
  95 This measures the specified process/thread only
  96 when running on the specified CPU.
  97 .TP
  98 .BR "pid == \-1" " and " "cpu >= 0"
  99 This measures all processes/threads on the specified CPU.
 100 Measurements such as this require the
 101 .B CAP_SYS_ADMIN
 102 capability or a
 103 .I /proc/sys/kernel/perf_event_paranoid
 104 value of less than 1.
 105 .TP
 106 .BR "pid == \-1" " and " "cpu == \-1"
 107 This setting is invalid and will return an error.
 108 .P
 109 The
 110 .I group_fd
 111 argument allows event groups to be created.
 112 An event group has one event which is the group leader.
 113 The leader is created first, with
 114 .IR group_fd " = \-1."
 115 The rest of the group members are created with subsequent
 116 .BR perf_event_open ()
 117 calls with
 118 .IR group_fd
 119 being set to the fd of the group leader.
 120 (A single event on its own is created with
 121 .IR group_fd " = \-1"
 122 and is considered to be a group with only 1 member.)
 123 An event group is scheduled onto the CPU as a unit: it will
 124 be put onto the CPU only if all of the events in the group can be put onto
 125 the CPU.
 126 This means that the values of the member events can be
 127 meaningfully compared, added, divided (to get ratios), and so on, with each
 128 other, since they have counted events for the same set of executed
 129 instructions.
 130 .P
 131 The
 132 .I flags
 133 argument is formed by ORing together zero or more of the following values:
 134 .TP
 135 .BR PERF_FLAG_FD_CLOEXEC " (since Linux 3.14)"
 136 This flag enables the close-on-exec flag for the created
 137 event file descriptor,
 138 so that the file descriptor is automatically closed on
 139 .BR execve (2).
 140 Setting the close-on-exec flag at creation time, rather than later with
 141 .BR fcntl (2),
 142 avoids potential race conditions where the calling thread invokes
 143 .BR perf_event_open ()
 144 at the same time as another thread calls
 145 .BR fork (2)
 146 then
 147 .BR execve (2).
 148 .TP
 149 .BR PERF_FLAG_FD_NO_GROUP
 150 .\" FIXME The following sentence is unclear
 151 This flag allows creating an event as part of an event group but
 152 having no group leader.
 153 It is unclear why this is useful.
 154 .\" FIXME So, why is it useful?
 155 .TP
 156 .BR PERF_FLAG_FD_OUTPUT
 157 This flag reroutes the output from an event to the group leader.
 158 .TP
 159 .BR PERF_FLAG_PID_CGROUP " (since Linux 2.6.39)"
 160 This flag activates per-container system-wide monitoring.
 161 A container
 162 is an abstraction that isolates a set of resources for finer grain
 163 control (CPUs, memory, etc.).
 164 In this mode, the event is measured
 165 only if the thread running on the monitored CPU belongs to the designated
 166 container (cgroup).
 167 The cgroup is identified by passing a file descriptor
 168 opened on its directory in the cgroupfs filesystem.
 169 For instance, if the
 170 cgroup to monitor is called
 171 .IR test ,
 172 then a file descriptor opened on
 173 .I /dev/cgroup/test
 174 (assuming cgroupfs is mounted on
 175 .IR /dev/cgroup )
 176 must be passed as the
 177 .I pid
 178 parameter.
 179 cgroup monitoring is available only
 180 for system-wide events and may therefore require extra permissions.
 181 .P
 182 The
 183 .I perf_event_attr
 184 structure provides detailed configuration information
 185 for the event being created.
 186
 187 .in +4n
 188 .nf
 189 struct perf_event_attr {
 190     __u32     type;         /* Type of event */
 191     __u32     size;         /* Size of attribute structure */
 192     __u64     config;       /* Type-specific configuration */
 193
 194     union {
 195         __u64 sample_period;    /* Period of sampling */
 196         __u64 sample_freq;      /* Frequency of sampling */
 197     };
 198
 199     __u64     sample_type;  /* Specifies values included in sample */
 200     __u64     read_format;  /* Specifies values returned in read */
 201
 202     __u64     disabled       : 1,   /* off by default */
 203               inherit        : 1,   /* children inherit it */
 204               pinned         : 1,   /* must always be on PMU */
 205               exclusive      : 1,   /* only group on PMU */
 206               exclude_user   : 1,   /* don't count user */
 207               exclude_kernel : 1,   /* don't count kernel */
 208               exclude_hv     : 1,   /* don't count hypervisor */
 209               exclude_idle   : 1,   /* don't count when idle */
 210               mmap           : 1,   /* include mmap data */
 211               comm           : 1,   /* include comm data */
 212               freq           : 1,   /* use freq, not period */
 213               inherit_stat   : 1,   /* per task counts */
 214               enable_on_exec : 1,   /* next exec enables */
 215               task           : 1,   /* trace fork/exit */
 216               watermark      : 1,   /* wakeup_watermark */
 217               precise_ip     : 2,   /* skid constraint */
 218               mmap_data      : 1,   /* non-exec mmap data */
 219               sample_id_all  : 1,   /* sample_type all events */
 220               exclude_host   : 1,   /* don't count in host */
 221               exclude_guest  : 1,   /* don't count in guest */
 222               exclude_callchain_kernel : 1,
 223                                     /* exclude kernel callchains */
 224               exclude_callchain_user   : 1,
 225                                     /* exclude user callchains */
 226               __reserved_1   : 41;
 227
 228     union {
 229         __u32 wakeup_events;    /* wakeup every n events */
 230         __u32 wakeup_watermark; /* bytes before wakeup */
 231     };
 232
 233     __u32     bp_type;          /* breakpoint type */
 234
 235     union {
 236         __u64 bp_addr;          /* breakpoint address */
 237         __u64 config1;          /* extension of config */
 238     };
 239
 240     union {
 241         __u64 bp_len;           /* breakpoint length */
 242         __u64 config2;          /* extension of config1 */
 243     };
 244     __u64   branch_sample_type; /* enum perf_branch_sample_type */
 245     __u64   sample_regs_user;   /* user regs to dump on samples */
 246     __u32   sample_stack_user;  /* size of stack to dump on
 247                                    samples */
 248     __u32   __reserved_2;       /* Align to u64 */
 249
 250 };
 251 .fi
 252 .in
 253
 254 The fields of the
 255 .I perf_event_attr
 256 structure are described in more detail below:
 257 .TP
 258 .I type
 259 This field specifies the overall event type.
 260 It has one of the following values:
 261 .RS
 262 .TP
 263 .B PERF_TYPE_HARDWARE
 264 This indicates one of the "generalized" hardware events provided
 265 by the kernel.
 266 See the
 267 .I config
 268 field definition for more details.
 269 .TP
 270 .B PERF_TYPE_SOFTWARE
 271 This indicates one of the software-defined events provided by the kernel
 272 (even if no hardware support is available).
 273 .TP
 274 .B PERF_TYPE_TRACEPOINT
 275 This indicates a tracepoint
 276 provided by the kernel tracepoint infrastructure.
 277 .TP
 278 .B PERF_TYPE_HW_CACHE
 279 This indicates a hardware cache event.
 280 This has a special encoding, described in the
 281 .I config
 282 field definition.
 283 .TP
 284 .B PERF_TYPE_RAW
 285 This indicates a "raw" implementation-specific event in the
 286 .IR config " field."
 287 .TP
 288 .BR PERF_TYPE_BREAKPOINT " (since Linux 2.6.33)"
 289 This indicates a hardware breakpoint as provided by the CPU.
 290 Breakpoints can be read/write accesses to an address as well as
 291 execution of an instruction address.
 292 .TP
 293 .RB "dynamic PMU"
 294 Since Linux 2.6.39,
 295 .BR perf_event_open ()
 296 can support multiple PMUs.
 297 To enable this, a value exported by the kernel can be used in the
 298 .I type
 299 field to indicate which PMU to use.
 300 The value to use can be found in the sysfs filesystem:
 301 there is a subdirectory per PMU instance under
 302 .IR /sys/bus/event_source/devices .
 303 In each subdirectory there is a
 304 .I type
 305 file whose content is an integer that can be used in the
 306 .I type
 307 field.
 308 For instance,
 309 .I /sys/bus/event_source/devices/cpu/type
 310 contains the value for the core CPU PMU, which is usually 4.
 311 .RE
 312 .TP
 313 .I "size"
 314 The size of the
 315 .I perf_event_attr
 316 structure for forward/backward compatibility.
 317 Set this using
 318 .I sizeof(struct perf_event_attr)
 319 to allow the kernel to see
 320 the struct size at the time of compilation.
 321
 322 The related define
 323 .B PERF_ATTR_SIZE_VER0
 324 is set to 64; this was the size of the first published struct.
 325 .B PERF_ATTR_SIZE_VER1
 326 is 72, corresponding to the addition of breakpoints in Linux 2.6.33.
 327 .B PERF_ATTR_SIZE_VER2
 328 is 80, corresponding to the addition of branch sampling in Linux 3.4.
 329 .B PERF_ATTR_SIZE_VER3
 330 is 96, corresponding to the addition
 331 of
 332 .I sample_regs_user
 333 and
 334 .I sample_stack_user
 335 in Linux 3.7.
 336 .TP
 337 .I "config"
 338 This specifies which event you want, in conjunction with
 339 the
 340 .I type
 341 field.
 342 The
 343 .IR config1 " and " config2
 344 fields are also taken into account in cases where 64 bits is not
 345 enough to fully specify the event.
 346 The encoding of these fields is event dependent.
 347
 348 The most significant bit (bit 63) of
 349 .I config
 350 signifies CPU-specific (raw) counter configuration data;
 351 if the most significant bit is unset, the next 7 bits are an event
 352 type and the rest of the bits are the event identifier.
 353
 354 There are various ways to set the
 355 .I config
 356 field that are dependent on the value of the previously
 357 described
 358 .I type
 359 field.
 360 What follows are various possible settings for
 361 .I config
 362 separated out by
 363 .IR type .
 364
 365 If
 366 .I type
 367 is
 368 .BR PERF_TYPE_HARDWARE ,
 369 we are measuring one of the generalized hardware CPU events.
 370 Not all of these are available on all platforms.
 371 Set
 372 .I config
 373 to one of the following:
 374 .RS 12
 375 .TP
 376 .B PERF_COUNT_HW_CPU_CYCLES
 377 Total cycles.
 378 Be wary of what happens during CPU frequency scaling.
 379 .TP
 380 .B PERF_COUNT_HW_INSTRUCTIONS
 381 Retired instructions.
 382 Be careful, these can be affected by various
 383 issues, most notably hardware interrupt counts.
 384 .TP
 385 .B PERF_COUNT_HW_CACHE_REFERENCES
 386 Cache accesses.
 387 Usually this indicates Last Level Cache accesses but this may
 388 vary depending on your CPU.
 389 This may include prefetches and coherency messages; again this
 390 depends on the design of your CPU.
 391 .TP
 392 .B PERF_COUNT_HW_CACHE_MISSES
 393 Cache misses.
 394 Usually this indicates Last Level Cache misses; this is intended to be
 395 used in conjunction with the
 396 .B PERF_COUNT_HW_CACHE_REFERENCES
 397 event to calculate cache miss rates.
 398 .TP
 399 .B PERF_COUNT_HW_BRANCH_INSTRUCTIONS
 400 Retired branch instructions.
 401 Prior to Linux 2.6.34, this used
 402 the wrong event on AMD processors.
 403 .TP
 404 .B PERF_COUNT_HW_BRANCH_MISSES
 405 Mispredicted branch instructions.
 406 .TP
 407 .B PERF_COUNT_HW_BUS_CYCLES
 408 Bus cycles, which can be different from total cycles.
 409 .TP
 410 .BR PERF_COUNT_HW_STALLED_CYCLES_FRONTEND " (since Linux 3.0)"
 411 Stalled cycles during issue.
 412 .TP
 413 .BR PERF_COUNT_HW_STALLED_CYCLES_BACKEND  " (since Linux 3.0)"
 414 Stalled cycles during retirement.
 415 .TP
 416 .BR PERF_COUNT_HW_REF_CPU_CYCLES  " (since Linux 3.3)"
 417 Total cycles; not affected by CPU frequency scaling.
 418 .RE
 419 .IP
 420 If
 421 .I type
 422 is
 423 .BR PERF_TYPE_SOFTWARE ,
 424 we are measuring software events provided by the kernel.
 425 Set
 426 .I config
 427 to one of the following:
 428 .RS 12
 429 .TP
 430 .B PERF_COUNT_SW_CPU_CLOCK
 431 This reports the CPU clock, a high-resolution per-CPU timer.
 432 .TP
 433 .B PERF_COUNT_SW_TASK_CLOCK
 434 This reports a clock count specific to the task that is running.
 435 .TP
 436 .B PERF_COUNT_SW_PAGE_FAULTS
 437 This reports the number of page faults.
 438 .TP
 439 .B PERF_COUNT_SW_CONTEXT_SWITCHES
 440 This counts context switches.
 441 Until Linux 2.6.34, these were all reported as user-space
 442 events; after that they are reported as happening in the kernel.
 443 .TP
 444 .B PERF_COUNT_SW_CPU_MIGRATIONS
 445 This reports the number of times the process
 446 has migrated to a new CPU.
 447 .TP
 448 .B PERF_COUNT_SW_PAGE_FAULTS_MIN
 449 This counts the number of minor page faults.
 450 These did not require disk I/O to handle.
 451 .TP
 452 .B PERF_COUNT_SW_PAGE_FAULTS_MAJ
 453 This counts the number of major page faults.
 454 These required disk I/O to handle.
 455 .TP
 456 .BR PERF_COUNT_SW_ALIGNMENT_FAULTS " (since Linux 2.6.33)"
 457 This counts the number of alignment faults.
 458 These happen when unaligned memory accesses happen; the kernel
 459 can handle these but it reduces performance.
 460 This happens only on some architectures (never on x86).
 461 .TP
 462 .BR PERF_COUNT_SW_EMULATION_FAULTS " (since Linux 2.6.33)"
 463 This counts the number of emulation faults.
 464 The kernel sometimes traps on unimplemented instructions
 465 and emulates them for user space.
 466 This can negatively impact performance.
 467 .TP
 468 .BR PERF_COUNT_SW_DUMMY " (since Linux 3.12)"
 469 This is a placeholder event that counts nothing.
 470 Informational sample record types such as mmap or comm
 471 must be associated with an active event.
 472 This dummy event allows gathering such records without requiring
 473 a counting event.
 474 .RE
 475
 476 .RS
 477 If
 478 .I type
 479 is
 480 .BR PERF_TYPE_TRACEPOINT ,
 481 then we are measuring kernel tracepoints.
 482 The value to use in
 483 .I config
 484 can be obtained from under debugfs
 485 .I tracing/events/*/*/id
 486 if ftrace is enabled in the kernel.
 487 .RE
 488
 489 .RS
 490 If
 491 .I type
 492 is
 493 .BR PERF_TYPE_HW_CACHE ,
 494 then we are measuring a hardware CPU cache event.
 495 To calculate the appropriate
 496 .I config
 497 value use the following equation:
 498 .RS 4
 499 .nf
 500
 501     (perf_hw_cache_id) | (perf_hw_cache_op_id << 8) |
 502     (perf_hw_cache_op_result_id << 16)
 503 .fi
 504 .P
 505 where
 506 .I perf_hw_cache_id
 507 is one of:
 508 .RS 4
 509 .TP
 510 .B PERF_COUNT_HW_CACHE_L1D
 511 for measuring Level 1 Data Cache
 512 .TP
 513 .B PERF_COUNT_HW_CACHE_L1I
 514 for measuring Level 1 Instruction Cache
 515 .TP
 516 .B PERF_COUNT_HW_CACHE_LL
 517 for measuring Last-Level Cache
 518 .TP
 519 .B PERF_COUNT_HW_CACHE_DTLB
 520 for measuring the Data TLB
 521 .TP
 522 .B PERF_COUNT_HW_CACHE_ITLB
 523 for measuring the Instruction TLB
 524 .TP
 525 .B PERF_COUNT_HW_CACHE_BPU
 526 for measuring the branch prediction unit
 527 .TP
 528 .BR PERF_COUNT_HW_CACHE_NODE " (since Linux 3.0)"
 529 for measuring local memory accesses
 530 .RE
 531 .P
 532 and
 533 .I perf_hw_cache_op_id
 534 is one of:
 535 .RS 4
 536 .TP
 537 .B PERF_COUNT_HW_CACHE_OP_READ
 538 for read accesses
 539 .TP
 540 .B PERF_COUNT_HW_CACHE_OP_WRITE
 541 for write accesses
 542 .TP
 543 .B PERF_COUNT_HW_CACHE_OP_PREFETCH
 544 for prefetch accesses
 545 .RE
 546 .P
 547 and
 548 .I perf_hw_cache_op_result_id
 549 is one of:
 550 .RS 4
 551 .TP
 552 .B PERF_COUNT_HW_CACHE_RESULT_ACCESS
 553 to measure accesses
 554 .TP
 555 .B PERF_COUNT_HW_CACHE_RESULT_MISS
 556 to measure misses
 557 .RE
 558 .RE
 559
 560 If
 561 .I type
 562 is
 563 .BR PERF_TYPE_RAW ,
 564 then a custom "raw"
 565 .I config
 566 value is needed.
 567 Most CPUs support events that are not covered by the "generalized" events.
 568 These are implementation defined; see your CPU manual (for example
 569 the Intel Volume 3B documentation or the AMD BIOS and Kernel Developer
 570 Guide).
 571 The libpfm4 library can be used to translate from the name in the
 572 architectural manuals to the raw hex value
 573 .BR perf_event_open ()
 574 expects in this field.
 575
 576 If
 577 .I type
 578 is
 579 .BR PERF_TYPE_BREAKPOINT ,
 580 then leave
 581 .I config
 582 set to zero.
 583 Its parameters are set in other places.
 584 .RE
 585 .TP
 586 .IR sample_period ", " sample_freq
 587 A "sampling" counter is one that generates an interrupt
 588 every N events, where N is given by
 589 .IR sample_period .
 590 A sampling counter has
 591 .IR sample_period " > 0."
 592 When an overflow interrupt occurs, requested data is recorded
 593 in the mmap buffer.
 594 The
 595 .I sample_type
 596 field controls what data is recorded on each interrupt.
 597
 598 .I sample_freq
 599 can be used if you wish to use frequency rather than period.
 600 In this case, you set the
 601 .I freq
 602 flag.
 603 The kernel will adjust the sampling period
 604 to try and achieve the desired rate.
 605 The rate of adjustment is a
 606 timer tick.
 607 .TP
 608 .I "sample_type"
 609 The various bits in this field specify which values to include
 610 in the sample.
 611 They will be recorded in a ring-buffer,
 612 which is available to user space using
 613 .BR mmap (2).
 614 The order in which the values are saved in the
 615 sample is documented in the MMAP Layout subsection below;
 616 it is not the
 617 .I "enum perf_event_sample_format"
 618 order.
 619 .RS
 620 .TP
 621 .B PERF_SAMPLE_IP
 622 Records instruction pointer.
 623 .TP
 624 .B PERF_SAMPLE_TID
 625 Records the process and thread IDs.
 626 .TP
 627 .B PERF_SAMPLE_TIME
 628 Records a timestamp.
 629 .TP
 630 .B PERF_SAMPLE_ADDR
 631 Records an address, if applicable.
 632 .TP
 633 .B PERF_SAMPLE_READ
 634 Records counter values for all events in a group, not just the group leader.
 635 .TP
 636 .B PERF_SAMPLE_CALLCHAIN
 637 Records the callchain (stack backtrace).
 638 .TP
 639 .B PERF_SAMPLE_ID
 640 Records a unique ID for the opened event's group leader.
 641 .TP
 642 .B PERF_SAMPLE_CPU
 643 Records CPU number.
 644 .TP
 645 .B PERF_SAMPLE_PERIOD
 646 Records the current sampling period.
 647 .TP
 648 .B PERF_SAMPLE_STREAM_ID
 649 Records a unique ID for the opened event.
 650 Unlike
 651 .BR PERF_SAMPLE_ID ,
 652 the actual ID is returned, not the group leader's.
 653 This ID is the same as the one returned by
 654 .BR PERF_FORMAT_ID .
 655 .TP
 656 .B PERF_SAMPLE_RAW
 657 Records additional data, if applicable.
 658 Usually returned by tracepoint events.
 659 .TP
 660 .BR PERF_SAMPLE_BRANCH_STACK " (since Linux 3.4)"
 661 This provides a record of recent branches, as provided
 662 by CPU branch sampling hardware (such as Intel Last Branch Record).
 663 Not all hardware supports this feature.
 664
 665 See the
 666 .I branch_sample_type
 667 field for how to filter which branches are reported.
 668 .TP
 669 .BR PERF_SAMPLE_REGS_USER " (since Linux 3.7)"
 670 Records the current user-level CPU register state
 671 (the values in the process before the kernel was called).
 672 .TP
 673 .BR PERF_SAMPLE_STACK_USER " (since Linux 3.7)"
 674 Records the user-level stack, allowing stack unwinding.
 675 .TP
 676 .BR PERF_SAMPLE_WEIGHT " (since Linux 3.10)"
 677 Records a hardware-provided weight value that expresses how
 678 costly the sampled event was.
 679 This allows the hardware to highlight expensive events in
 680 a profile.
 681 .TP
 682 .BR PERF_SAMPLE_DATA_SRC " (since Linux 3.10)"
 683 Records the data source: where in the memory hierarchy
 684 the data associated with the sampled instruction came from.
 685 This is only available if the underlying hardware
 686 supports this feature.
 687 .TP
 688 .BR PERF_SAMPLE_IDENTIFIER " (since Linux 3.12)"
 689 Places the
 690 .B SAMPLE_ID
 691 value in a fixed position in the record,
 692 either at the beginning (for sample events) or at the end
 693 (if a non-sample event).
 694
 695 This was necessary because a sample stream may have
 696 records from various different event sources with different
 697 .I sample_type
 698 settings.
 699 Parsing the event stream properly was not possible because the
 700 format of the record was needed to find
 701 .BR SAMPLE_ID ,
 702 but
 703 the format could not be found without knowing what
 704 event the sample belonged to (causing a circular
 705 dependency).
 706
 707 This new
 708 .B PERF_SAMPLE_IDENTIFIER
 709 setting makes the event stream always parsable
 710 by putting
 711 .B SAMPLE_ID
 712 in a fixed location, even though
 713 it means having duplicate
 714 .B SAMPLE_ID
 715 values in records.
 716 .TP
 717 .BR PERF_SAMPLE_TRANSACTION " (since Linux 3.13)"
 718 Records reasons for transactional memory abort events
 719 (for example, from Intel TSX transactional memory support).
 720
 721 The
 722 .I precise_ip
 723 setting must be greater than 0 and a transactional memory abort
 724 event must be measured or no values will be recorded.
 725 Also note that some perf_event measurements, such as sampled
 726 cycle counting, may cause extraneous aborts (by causing an
 727 interrupt during a transaction).
 728 .RE
 729 .TP
 730 .IR "read_format"
 731 This field specifies the format of the data returned by
 732 .BR read (2)
 733 on a
 734 .BR perf_event_open ()
 735 file descriptor.
 736 .RS
 737 .TP
 738 .B PERF_FORMAT_TOTAL_TIME_ENABLED
 739 Adds the 64-bit
 740 .I time_enabled
 741 field.
 742 This can be used to calculate estimated totals if
 743 the PMU is overcommitted and multiplexing is happening.
 744 .TP
 745 .B PERF_FORMAT_TOTAL_TIME_RUNNING
 746 Adds the 64-bit
 747 .I time_running
 748 field.
 749 This can be used to calculate estimated totals if
 750 the PMU is overcommitted and multiplexing is happening.
 751 .TP
 752 .B PERF_FORMAT_ID
 753 Adds a 64-bit unique value that corresponds to the event group.
 754 .TP
 755 .B PERF_FORMAT_GROUP
 756 Allows all counter values in an event group to be read with one read.
 757 .RE
 758 .TP
 759 .IR "disabled"
 760 The
 761 .I disabled
 762 bit specifies whether the counter starts out disabled or enabled.
 763 If disabled, the event can later be enabled by
 764 .BR ioctl (2),
 765 .BR prctl (2),
 766 or
 767 .IR enable_on_exec .
 768
 769 When creating an event group, typically the group leader is initialized
 770 with
 771 .I disabled
 772 set to 1 and any child events are initialized with
 773 .I disabled
 774 set to 0.
 775 Despite
 776 .I disabled
 777 being 0, the child events will not start until the group leader
 778 is enabled.
 779 .TP
 780 .IR "inherit"
 781 The
 782 .I inherit
 783 bit specifies that this counter should count events of child
 784 tasks as well as the task specified.
 785 This applies only to new children, not to any existing children at
 786 the time the counter is created (nor to any new children of
 787 existing children).
 788
 789 Inherit does not work for some combinations of
 790 .IR read_format s,
 791 such as
 792 .BR PERF_FORMAT_GROUP .
 793 .TP
 794 .IR "pinned"
 795 The
 796 .I pinned
 797 bit specifies that the counter should always be on the CPU if at all
 798 possible.
 799 It applies only to hardware counters and only to group leaders.
 800 If a pinned counter cannot be put onto the CPU (e.g., because there are
 801 not enough hardware counters or because of a conflict with some other
 802 event), then the counter goes into an 'error' state, where reads
 803 return end-of-file (i.e.,
 804 .BR read (2)
 805 returns 0) until the counter is subsequently enabled or disabled.
 806 .TP
 807 .IR "exclusive"
 808 The
 809 .I exclusive
 810 bit specifies that when this counter's group is on the CPU,
 811 it should be the only group using the CPU's counters.
 812 In the future this may allow monitoring programs to
 813 support PMU features that need to run alone so that they do not
 814 disrupt other hardware counters.
 815
 816 Note that many unexpected situations may prevent events with the
 817 .I exclusive
 818 bit set from ever running.
 819 This includes any users running a system-wide
 820 measurement as well as any kernel use of the performance counters
 821 (including the commonly enabled NMI Watchdog Timer interface).
 822 .TP
 823 .IR "exclude_user"
 824 If this bit is set, the count excludes events that happen in user space.
 825 .TP
 826 .IR "exclude_kernel"
 827 If this bit is set, the count excludes events that happen in kernel-space.
 828 .TP
 829 .IR "exclude_hv"
 830 If this bit is set, the count excludes events that happen in the
 831 hypervisor.
 832 This is mainly for PMUs that have built-in support for handling this
 833 (such as POWER).
 834 Extra support is needed for handling hypervisor measurements on most
 835 machines.
 836 .TP
 837 .IR "exclude_idle"
 838 If set, don't count when the CPU is idle.
 839 .TP
 840 .IR "mmap"
 841 The
 842 .I mmap
 843 bit enables generation of
 844 .B PERF_RECORD_MMAP
 845 samples for every
 846 .BR mmap (2)
 847 call that has
 848 .B PROT_EXEC
 849 set.
 850 This allows tools to notice new executable code being mapped into
 851 a program (dynamic shared libraries for example)
 852 so that addresses can be mapped back to the original code.
 853 .TP
 854 .IR "comm"
 855 The
 856 .I comm
 857 bit enables tracking of process command name as modified by the
 858 .BR exec (2)
 859 and
 860 .BR prctl (PR_SET_NAME)
 861 system calls.
 862 Unfortunately for tools,
 863 there is no way to distinguish one system call versus the other.
 864 .TP
 865 .IR "freq"
 866 If this bit is set, then
 867 .IR sample_freq ,
 868 not
 869 .IR sample_period ,
 870 is used when setting up the sampling interval.
 871 .TP
 872 .IR "inherit_stat"
 873 This bit enables saving of event counts on context switch for
 874 inherited tasks.
 875 This is meaningful only if the
 876 .I inherit
 877 field is set.
 878 .TP
 879 .IR "enable_on_exec"
 880 If this bit is set, a counter is automatically
 881 enabled after a call to
 882 .BR exec (2).
 883 .TP
 884 .IR "task"
 885 If this bit is set, then
 886 fork/exit notifications are included in the ring buffer.
 887 .TP
 888 .IR "watermark"
 889 If set, have a sampling interrupt happen when we cross the
 890 .I wakeup_watermark
 891 boundary.
 892 Otherwise interrupts happen after
 893 .I wakeup_events
 894 samples.
 895 .TP
 896 .IR "precise_ip" " (since Linux 2.6.35)"
 897 This controls the amount of skid.
 898 Skid is how many instructions
 899 execute between an event of interest happening and the kernel
 900 being able to stop and record the event.
 901 Smaller skid is
 902 better and allows more accurate reporting of which events
 903 correspond to which instructions, but hardware is often limited
 904 with how small this can be.
 905
 906 The values of this are the following:
 907 .RS
 908 .TP
 909 0 -
 910 .B SAMPLE_IP
 911 can have arbitrary skid.
 912 .TP
 913 1 -
 914 .B SAMPLE_IP
 915 must have constant skid.
 916 .TP
 917 2 -
 918 .B SAMPLE_IP
 919 requested to have 0 skid.
 920 .TP
 921 3 -
 922 .B SAMPLE_IP
 923 must have 0 skid.
 924 See also
 925 .BR PERF_RECORD_MISC_EXACT_IP .
 926 .RE
 927 .TP
 928 .IR "mmap_data" " (since Linux 2.6.36)"
 929 The counterpart of the
 930 .I mmap
 931 field.
 932 This enables generation of
 933 .B PERF_RECORD_MMAP
 934 samples for
 935 .BR mmap (2)
 936 calls that do not have
 937 .B PROT_EXEC
 938 set (for example data and SysV shared memory).
 939 .TP
 940 .IR "sample_id_all" " (since Linux 2.6.38)"
 941 If set, then TID, TIME, ID, STREAM_ID, and CPU can
 942 additionally be included in
 943 .RB non- PERF_RECORD_SAMPLE s
 944 if the corresponding
 945 .I sample_type
 946 is selected.
 947
 948 If
 949 .B PERF_SAMPLE_IDENTIFIER
 950 is specified, then an additional ID value is included
 951 as the last value to ease parsing the record stream.
 952 This may lead to the
 953 .I id
 954 value appearing twice.
 955
 956 The layout is described by this pseudo-structure:
 957 .in +4n
 958 .nf
 959 struct sample_id {
 960     { u32 pid, tid; } /* if PERF_SAMPLE_TID set        */
 961     { u64 time;     } /* if PERF_SAMPLE_TIME set       */
 962     { u64 id;       } /* if PERF_SAMPLE_ID set         */
 963     { u64 stream_id;} /* if PERF_SAMPLE_STREAM_ID set  */
 964     { u32 cpu, res; } /* if PERF_SAMPLE_CPU set        */
 965     { u64 id;       } /* if PERF_SAMPLE_IDENTIFIER set */
 966 };
 967 .fi
 968 .TP
 969 .IR "exclude_host" " (since Linux 3.2)"
 970 Do not measure time spent in VM host.
 971 .TP
 972 .IR "exclude_guest" " (since Linux 3.2)"
 973 Do not measure time spent in VM guest.
 974 .TP
 975 .IR "exclude_callchain_kernel" " (since Linux 3.7)"
 976 Do not include kernel callchains.
 977 .TP
 978 .IR "exclude_callchain_user" " (since Linux 3.7)"
 979 Do not include user callchains.
 980 .TP
 981 .IR "wakeup_events" ", " "wakeup_watermark"
 982 This union sets how many samples
 983 .RI ( wakeup_events )
 984 or bytes
 985 .RI ( wakeup_watermark )
 986 happen before an overflow signal happens.
 987 Which one is used is selected by the
 988 .I watermark
 989 bitflag.
 990
 991 .I wakeup_events
 992 only counts
 993 .B PERF_RECORD_SAMPLE
 994 record types.
 995 To receive a signal for every incoming
 996 .B PERF_RECORD
 997 type set
 998 .I wakeup_watermark
 999 to 1.
1000 .TP
1001 .IR "bp_type" " (since Linux 2.6.33)"
1002 This chooses the breakpoint type.
1003 It is one of:
1004 .RS
1005 .TP
1006 .BR HW_BREAKPOINT_EMPTY
1007 No breakpoint.
1008 .TP
1009 .BR HW_BREAKPOINT_R
1010 Count when we read the memory location.
1011 .TP
1012 .BR HW_BREAKPOINT_W
1013 Count when we write the memory location.
1014 .TP
1015 .BR HW_BREAKPOINT_RW
1016 Count when we read or write the memory location.
1017 .TP
1018 .BR HW_BREAKPOINT_X
1019 Count when we execute code at the memory location.
1020 .LP
1021 The values can be combined via a bitwise or, but the
1022 combination of
1023 .B HW_BREAKPOINT_R
1024 or
1025 .B HW_BREAKPOINT_W
1026 with
1027 .B HW_BREAKPOINT_X
1028 is not allowed.
1029 .RE
1030 .TP
1031 .IR "bp_addr" " (since Linux 2.6.33)"
1032 .I bp_addr
1033 is the address of the breakpoint.
1034 For execution breakpoints this is the memory address of the instruction
1035 of interest; for read and write breakpoints it is the memory address
1036 of the memory location of interest.
1037 .TP
1038 .IR "config1" " (since Linux 2.6.39)"
1039 .I config1
1040 is used for setting events that need an extra register or otherwise
1041 do not fit in the regular config field.
1042 Raw OFFCORE_EVENTS on Nehalem/Westmere/SandyBridge use this field
1043 on 3.3 and later kernels.
1044 .TP
1045 .IR "bp_len" " (since Linux 2.6.33)"
1046 .I bp_len
1047 is the length of the breakpoint being measured if
1048 .I type
1049 is
1050 .BR PERF_TYPE_BREAKPOINT .
1051 Options are
1052 .BR HW_BREAKPOINT_LEN_1 ,
1053 .BR HW_BREAKPOINT_LEN_2 ,
1054 .BR HW_BREAKPOINT_LEN_4 ,
1055 .BR HW_BREAKPOINT_LEN_8 .
1056 For an execution breakpoint, set this to
1057 .IR sizeof(long) .
1058 .TP
1059 .IR "config2" " (since Linux 2.6.39)"
1060
1061 .I config2
1062 is a further extension of the
1063 .I config1
1064 field.
1065 .TP
1066 .IR "branch_sample_type" " (since Linux 3.4)"
1067 If
1068 .B PERF_SAMPLE_BRANCH_STACK
1069 is enabled, then this specifies what branches to include
1070 in the branch record.
1071
1072 The first part of the value is the privilege level, which
1073 is a combination of one or more of the following values.
1074 If the user does not set privilege level explicitly, the kernel
1075 will use the event's privilege level.
1076 Event and branch privilege levels do not have to match.
1077 .RS
1078 .TP
1079 .B PERF_SAMPLE_BRANCH_USER
1080 Branch target is in user space.
1081 .TP
1082 .B PERF_SAMPLE_BRANCH_KERNEL
1083 Branch target is in kernel space.
1084 .TP
1085 .B PERF_SAMPLE_BRANCH_HV
1086 Branch target is in hypervisor.
1087 .TP
1088 .B PERF_SAMPLE_BRANCH_PLM_ALL
1089 A convenience value that is the three preceding values ORed together.
1090
1091 .P
1092 In addition to the privilege value, at least one of the
1093 following bits must be set.
1094
1095 .TP
1096 .B PERF_SAMPLE_BRANCH_ANY
1097 Any branch type.
1098 .TP
1099 .B PERF_SAMPLE_BRANCH_ANY_CALL
1100 Any call branch.
1101 .TP
1102 .B PERF_SAMPLE_BRANCH_ANY_RETURN
1103 Any return branch.
1104 .TP
1105 .B PERF_SAMPLE_BRANCH_IND_CALL
1106 Indirect calls.
1107 .TP
1108 .BR PERF_SAMPLE_BRANCH_ABORT_TX " (since Linux 3.11)"
1109 Transactional memory aborts.
1110 .TP
1111 .BR PERF_SAMPLE_BRANCH_IN_TX " (since Linux 3.11)"
1112 Branch in transactional memory transaction.
1113 .TP
1114 .BR PERF_SAMPLE_BRANCH_NO_TX " (since Linux 3.11)"
1115 Branch not in transactional memory transaction.
1116 .RE
1117
1118 .TP
1119 .IR "sample_regs_user" " (since Linux 3.7)"
1120 This bit mask defines the set of user CPU registers to dump on samples.
1121 The layout of the register mask is architecture-specific and
1122 described in the kernel header
1123 .IR arch/ARCH/include/uapi/asm/perf_regs.h .
1124 .TP
1125 .IR "sample_stack_user" " (since Linux 3.7)"
1126 This defines the size of the user stack to dump if
1127 .B PERF_SAMPLE_STACK_USER
1128 is specified.
1129 .SS Reading results
1130 Once a
1131 .BR perf_event_open ()
1132 file descriptor has been opened, the values
1133 of the events can be read from the file descriptor.
1134 The values that are there are specified by the
1135 .I read_format
1136 field in the
1137 .I attr
1138 structure at open time.
1139
1140 If you attempt to read into a buffer that is not big enough to hold the
1141 data,
1142 .B ENOSPC
1143 is returned.
1144
1145 Here is the layout of the data returned by a read:
1146 .IP * 2
1147 If
1148 .B PERF_FORMAT_GROUP
1149 was specified to allow reading all events in a group at once:
1150
1151 .in +4n
1152 .nf
1153 struct read_format {
1154     u64 nr;            /* The number of events */
1155     u64 time_enabled;  /* if PERF_FORMAT_TOTAL_TIME_ENABLED */
1156     u64 time_running;  /* if PERF_FORMAT_TOTAL_TIME_RUNNING */
1157     struct {
1158         u64 value;     /* The value of the event */
1159         u64 id;        /* if PERF_FORMAT_ID */
1160     } values[nr];
1161 };
1162 .fi
1163 .in
1164 .IP *
1165 If
1166 .B PERF_FORMAT_GROUP
1167 was
1168 .I not
1169 specified:
1170
1171 .in +4n
1172 .nf
1173 struct read_format {
1174     u64 value;         /* The value of the event */
1175     u64 time_enabled;  /* if PERF_FORMAT_TOTAL_TIME_ENABLED */
1176     u64 time_running;  /* if PERF_FORMAT_TOTAL_TIME_RUNNING */
1177     u64 id;            /* if PERF_FORMAT_ID */
1178 };
1179 .fi
1180 .in
1181 .PP
1182 The values read are as follows:
1183 .TP
1184 .I nr
1185 The number of events in this file descriptor.
1186 Only available if
1187 .B PERF_FORMAT_GROUP
1188 was specified.
1189 .TP
1190 .IR time_enabled ", " time_running
1191 Total time the event was enabled and running.
1192 Normally these are the same.
1193 If more events are started
1194 than available counter slots on the PMU, then multiplexing
1195 happens and events run only part of the time.
1196 In that case, the
1197 .I time_enabled
1198 and
1199 .I time_running
1200 values can be used to scale an estimated value for the count.
1201 .TP
1202 .I value
1203 An unsigned 64-bit value containing the counter result.
1204 .TP
1205 .I id
1206 A globally unique value for this particular event, only there if
1207 .B PERF_FORMAT_ID
1208 was specified in
1209 .IR read_format .
1210 .SS MMAP layout
1211 When using
1212 .BR perf_event_open ()
1213 in sampled mode, asynchronous events
1214 (like counter overflow or
1215 .B PROT_EXEC
1216 mmap tracking)
1217 are logged into a ring-buffer.
1218 This ring-buffer is created and accessed through
1219 .BR mmap (2).
1220
1221 The mmap size should be 1+2^n pages, where the first page is a
1222 metadata page
1223 .RI ( "struct perf_event_mmap_page" )
1224 that contains various
1225 bits of information such as where the ring-buffer head is.
1226
1227 Before kernel 2.6.39, there is a bug that means you must allocate a mmap
1228 ring buffer when sampling even if you do not plan to access it.
1229
1230 The structure of the first metadata mmap page is as follows:
1231
1232 .in +4n
1233 .nf
1234 struct perf_event_mmap_page {
1235     __u32 version;          /* version number of this structure */
1236     __u32 compat_version;   /* lowest version this is compat with */
1237     __u32 lock;             /* seqlock for synchronization */
1238     __u32 index;            /* hardware counter identifier */
1239     __s64 offset;           /* add to hardware counter value */
1240     __u64 time_enabled;     /* time event active */
1241     __u64 time_running;     /* time event on CPU */
1242     union {
1243         __u64   capabilities;
1244         struct {
1245             __u64   cap_usr_time / cap_usr_rdpmc / cap_bit0 : 1,
1246                     cap_bit0_is_deprecated : 1,
1247                     cap_user_rdpmc         : 1,
1248                     cap_user_time          : 1,
1249                     cap_user_time_zero     : 1,
1250         };
1251     };
1252     __u16   pmc_width;
1253     __u16   time_shift;
1254     __u32   time_mult;
1255     __u64   time_offset;
1256     __u64   __reserved[120];   /* Pad to 1k */
1257     __u64   data_head;         /* head in the data section */
1258     __u64   data_tail;         /* user-space written tail */
1259 };
1260 .fi
1261 .in
1262
1263 The following looks at the fields in the
1264 .I perf_event_mmap_page
1265 structure in more detail:
1266 .TP
1267 .I version
1268 Version number of this structure.
1269 .TP
1270 .I compat_version
1271 The lowest version this is compatible with.
1272 .TP
1273 .I lock
1274 A seqlock for synchronization.
1275 .TP
1276 .I index
1277 A unique hardware counter identifier.
1278 .TP
1279 .I offset
1280 When using rdpmc for reads this offset value
1281 must be added to the one returned by rdpmc to get
1282 the current total event count.
1283 .TP
1284 .I time_enabled
1285 Time the event was active.
1286 .TP
1287 .I time_running
1288 Time the event was running.
1289 .TP
1290 .IR cap_usr_time " / " cap_usr_rdpmc " / " cap_bit0 " (since Linux 3.4)"
1291 There was a bug in the definition of
1292 .I cap_usr_time
1293 and
1294 .I cap_usr_rdpmc
1295 from Linux 3.4 until Linux 3.11.
1296 Both bits were defined to point to the same location, so it was
1297 impossible to know if
1298 .I cap_usr_time
1299 or
1300 .I cap_usr_rdpmc
1301 were actually set.
1302
1303 Starting with Linux 3.12, these are renamed to
1304 .I cap_bit0
1305 and you should use the new
1306 .I cap_user_time
1307 and
1308 .I cap_user_rdpmc
1309 fields instead.
1310
1311 .TP
1312 .IR cap_bit0_is_deprecated " (since Linux 3.12)"
1313 If set, this bit indicates that the kernel supports
1314 the properly separated
1315 .I cap_user_time
1316 and
1317 .I cap_user_rdpmc
1318 bits.
1319
1320 If not set, it indicates an older kernel where
1321 .I cap_usr_time
1322 and
1323 .I cap_usr_rdpmc
1324 map to the same bit and thus both features should
1325 be used with caution.
1326
1327 .TP
1328 .IR cap_user_rdpmc " (since Linux 3.12)"
1329 If the hardware supports user-space read of performance counters
1330 without syscall (this is the "rdpmc" instruction on x86), then
1331 the following code can be used to do a read:
1332
1333 .in +4n
1334 .nf
1335 u32 seq, time_mult, time_shift, idx, width;
1336 u64 count, enabled, running;
1337 u64 cyc, time_offset;
1338
1339 do {
1340     seq = pc\->lock;
1341     barrier();
1342     enabled = pc\->time_enabled;
1343     running = pc\->time_running;
1344
1345     if (pc\->cap_usr_time && enabled != running) {
1346         cyc = rdtsc();
1347         time_offset = pc\->time_offset;
1348         time_mult   = pc\->time_mult;
1349         time_shift  = pc\->time_shift;
1350     }
1351
1352     idx = pc\->index;
1353     count = pc\->offset;
1354
1355     if (pc\->cap_usr_rdpmc && idx) {
1356         width = pc\->pmc_width;
1357         count += rdpmc(idx \- 1);
1358     }
1359
1360     barrier();
1361 } while (pc\->lock != seq);
1362 .fi
1363 .in
1364 .TP
1365 .IR cap_user_time " (since Linux 3.12)"
1366 This bit indicates the hardware has a constant, nonstop
1367 timestamp counter (TSC on x86).
1368 .TP
1369 .IR cap_user_time_zero " (since Linux 3.12)"
1370 Indicates the presence of
1371 .I time_zero
1372 which allows mapping timestamp values to
1373 the hardware clock.
1374 .TP
1375 .I pmc_width
1376 If
1377 .IR cap_usr_rdpmc ,
1378 this field provides the bit-width of the value
1379 read using the rdpmc or equivalent instruction.
1380 This can be used to sign extend the result like:
1381
1382 .in +4n
1383 .nf
1384 pmc <<= 64 \- pmc_width;
1385 pmc >>= 64 \- pmc_width; // signed shift right
1386 count += pmc;
1387 .fi
1388 .in
1389 .TP
1390 .IR time_shift ", " time_mult ", " time_offset
1391
1392 If
1393 .IR cap_usr_time ,
1394 these fields can be used to compute the time
1395 delta since time_enabled (in nanoseconds) using rdtsc or similar.
1396 .nf
1397
1398     u64 quot, rem;
1399     u64 delta;
1400     quot = (cyc >> time_shift);
1401     rem = cyc & ((1 << time_shift) \- 1);
1402     delta = time_offset + quot * time_mult +
1403             ((rem * time_mult) >> time_shift);
1404 .fi
1405
1406 Where
1407 .IR time_offset ,
1408 .IR time_mult ,
1409 .IR time_shift ,
1410 and
1411 .IR cyc
1412 are read in the
1413 seqcount loop described above.
1414 This delta can then be added to
1415 enabled and possibly running (if idx), improving the scaling:
1416 .nf
1417
1418     enabled += delta;
1419     if (idx)
1420         running += delta;
1421     quot = count / running;
1422     rem  = count % running;
1423     count = quot * enabled + (rem * enabled) / running;
1424 .fi
1425 .TP
1426 .IR time_zero " (since Linux 3.12)"
1427
1428 If
1429 .I cap_user_time_zero
1430 is set, then the hardware clock (the TSC timestamp counter on x86)
1431 can be calculated from the
1432 .IR time_zero ", " time_mult ", and " time_shift " values:"
1433 .nf
1434     time = timestamp \- time_zero;
1435     quot = time / time_mult;
1436     rem  = time % time_mult;
1437     cyc = (quot << time_shift) + (rem << time_shift) / time_mult;
1438 .fi
1439 And vice versa:
1440 .nf
1441     quot = cyc >> time_shift;
1442     rem  = cyc & ((1 << time_shift) \- 1);
1443     timestamp = time_zero + quot * time_mult +
1444         ((rem * time_mult) >> time_shift);
1445 .fi
1446 .TP
1447 .I data_head
1448 This points to the head of the data section.
1449 The value continuously increases; it does not wrap.
1450 The value needs to be manually wrapped by the size of the mmap buffer
1451 before accessing the samples.
1452
1453 On SMP-capable platforms, after reading the data_head value,
1454 user space should issue an rmb().
1455 .TP
1456 .I data_tail
1457 When the mapping is
1458 .BR PROT_WRITE ,
1459 the
1460 .I data_tail
1461 value should be written by user space to reflect the last read data.
1462 In this case, the kernel will not overwrite unread data.
1463 .PP
1464 The following 2^n ring-buffer pages have the layout described below.
1465
1466 If
1467 .I perf_event_attr.sample_id_all
1468 is set, then all event types will
1469 have the sample_type selected fields related to where/when (identity)
1470 an event took place (TID, TIME, ID, CPU, STREAM_ID) described in
1471 .B PERF_RECORD_SAMPLE
1472 below; they will be stashed just after the
1473 .I perf_event_header
1474 and the fields already present for the existing
1475 fields, that is, at the end of the payload.
1476 That way a newer perf.data
1477 file will be supported by older perf tools, with these new optional
1478 fields being ignored.
1479
1480 The mmap values start with a header:
1481
1482 .in +4n
1483 .nf
1484 struct perf_event_header {
1485     __u32   type;
1486     __u16   misc;
1487     __u16   size;
1488 };
1489 .fi
1490 .in
1491
1492 Below, we describe the
1493 .I perf_event_header
1494 fields in more detail.
1495 For ease of reading,
1496 the fields with shorter descriptions are presented first.
1497 .TP
1498 .I size
1499 This indicates the size of the record.
1500 .TP
1501 .I misc
1502 The
1503 .I misc
1504 field contains additional information about the sample.
1505
1506 The CPU mode can be determined from this value by masking with
1507 .B PERF_RECORD_MISC_CPUMODE_MASK
1508 and looking for one of the following (note these are not
1509 bit masks, only one can be set at a time):
1510 .RS
1511 .TP
1512 .B PERF_RECORD_MISC_CPUMODE_UNKNOWN
1513 Unknown CPU mode.
1514 .TP
1515 .B PERF_RECORD_MISC_KERNEL
1516 Sample happened in the kernel.
1517 .TP
1518 .B PERF_RECORD_MISC_USER
1519 Sample happened in user code.
1520 .TP
1521 .B PERF_RECORD_MISC_HYPERVISOR
1522 Sample happened in the hypervisor.
1523 .TP
1524 .B PERF_RECORD_MISC_GUEST_KERNEL
1525 Sample happened in the guest kernel.
1526 .TP
1527 .B PERF_RECORD_MISC_GUEST_USER
1528 Sample happened in guest user code.
1529 .RE
1530
1531 .RS
1532 In addition, one of the following bits can be set:
1533 .TP
1534 .B PERF_RECORD_MISC_MMAP_DATA
1535 This is set when the mapping is not executable;
1536 otherwise the mapping is executable.
1537 .TP
1538 .B PERF_RECORD_MISC_EXACT_IP
1539 This indicates that the content of
1540 .B PERF_SAMPLE_IP
1541 points
1542 to the actual instruction that triggered the event.
1543 See also
1544 .IR perf_event_attr.precise_ip .
1545 .TP
1546 .B PERF_RECORD_MISC_EXT_RESERVED
1547 This indicates there is extended data available (currently not used).
1548 .RE
1549 .TP
1550 .I type
1551 The
1552 .I type
1553 value is one of the below.
1554 The values in the corresponding record (that follows the header)
1555 depend on the
1556 .I type
1557 selected as shown.
1558
1559 .RS
1560 .TP 4
1561 .B PERF_RECORD_MMAP
1562 The MMAP events record the
1563 .B PROT_EXEC
1564 mappings so that we can correlate
1565 user-space IPs to code.
1566 They have the following structure:
1567
1568 .in +4n
1569 .nf
1570 struct {
1571     struct perf_event_header header;
1572     u32    pid, tid;
1573     u64    addr;
1574     u64    len;
1575     u64    pgoff;
1576     char   filename[];
1577 };
1578 .fi
1579 .in
1580 .TP
1581 .B PERF_RECORD_LOST
1582 This record indicates when events are lost.
1583
1584 .in +4n
1585 .nf
1586 struct {
1587     struct perf_event_header header;
1588     u64 id;
1589     u64 lost;
1590     struct sample_id sample_id;
1591 };
1592 .fi
1593 .in
1594 .RS
1595 .TP
1596 .I id
1597 is the unique event ID for the samples that were lost.
1598 .TP
1599 .I lost
1600 is the number of events that were lost.
1601 .RE
1602 .TP
1603 .B PERF_RECORD_COMM
1604 This record indicates a change in the process name.
1605
1606 .in +4n
1607 .nf
1608 struct {
1609     struct perf_event_header header;
1610     u32 pid, tid;
1611     char comm[];
1612     struct sample_id sample_id;
1613 };
1614 .fi
1615 .in
1616 .TP
1617 .B PERF_RECORD_EXIT
1618 This record indicates a process exit event.
1619
1620 .in +4n
1621 .nf
1622 struct {
1623     struct perf_event_header header;
1624     u32 pid, ppid;
1625     u32 tid, ptid;
1626     u64 time;
1627     struct sample_id sample_id;
1628 };
1629 .fi
1630 .in
1631 .TP
1632 .BR PERF_RECORD_THROTTLE ", " PERF_RECORD_UNTHROTTLE
1633 This record indicates a throttle/unthrottle event.
1634
1635 .in +4n
1636 .nf
1637 struct {
1638     struct perf_event_header header;
1639     u64 time;
1640     u64 id;
1641     u64 stream_id;
1642     struct sample_id sample_id;
1643 };
1644 .fi
1645 .in
1646 .TP
1647 .B PERF_RECORD_FORK
1648 This record indicates a fork event.
1649
1650 .in +4n
1651 .nf
1652 struct {
1653     struct perf_event_header header;
1654     u32 pid, ppid;
1655     u32 tid, ptid;
1656     u64 time;
1657     struct sample_id sample_id;
1658 };
1659 .fi
1660 .in
1661 .TP
1662 .B PERF_RECORD_READ
1663 This record indicates a read event.
1664
1665 .in +4n
1666 .nf
1667 struct {
1668     struct perf_event_header header;
1669     u32 pid, tid;
1670     struct read_format values;
1671     struct sample_id sample_id;
1672 };
1673 .fi
1674 .in
1675 .TP
1676 .B PERF_RECORD_SAMPLE
1677 This record indicates a sample.
1678
1679 .in +4n
1680 .nf
1681 struct {
1682     struct perf_event_header header;
1683     u64   sample_id;  /* if PERF_SAMPLE_IDENTIFIER */
1684     u64   ip;         /* if PERF_SAMPLE_IP */
1685     u32   pid, tid;   /* if PERF_SAMPLE_TID */
1686     u64   time;       /* if PERF_SAMPLE_TIME */
1687     u64   addr;       /* if PERF_SAMPLE_ADDR */
1688     u64   id;         /* if PERF_SAMPLE_ID */
1689     u64   stream_id;  /* if PERF_SAMPLE_STREAM_ID */
1690     u32   cpu, res;   /* if PERF_SAMPLE_CPU */
1691     u64   period;     /* if PERF_SAMPLE_PERIOD */
1692     struct read_format v; /* if PERF_SAMPLE_READ */
1693     u64   nr;         /* if PERF_SAMPLE_CALLCHAIN */
1694     u64   ips[nr];    /* if PERF_SAMPLE_CALLCHAIN */
1695     u32   size;       /* if PERF_SAMPLE_RAW */
1696     char  data[size]; /* if PERF_SAMPLE_RAW */
1697     u64   bnr;        /* if PERF_SAMPLE_BRANCH_STACK */
1698     struct perf_branch_entry lbr[bnr];
1699                       /* if PERF_SAMPLE_BRANCH_STACK */
1700     u64   abi;        /* if PERF_SAMPLE_REGS_USER */
1701     u64   regs[weight(mask)];
1702                       /* if PERF_SAMPLE_REGS_USER */
1703     u64   size;       /* if PERF_SAMPLE_STACK_USER */
1704     char  data[size]; /* if PERF_SAMPLE_STACK_USER */
1705     u64   dyn_size;   /* if PERF_SAMPLE_STACK_USER */
1706     u64   weight;     /* if PERF_SAMPLE_WEIGHT */
1707     u64   data_src;   /* if PERF_SAMPLE_DATA_SRC */
1708     u64   transaction;/* if PERF_SAMPLE_TRANSACTION */
1709 };
1710 .fi
1711 .RS 4
1712 .TP 4
1713 .I sample_id
1714 If
1715 .B PERF_SAMPLE_IDENTIFIER
1716 is enabled, a 64-bit unique ID is included.
1717 This is a duplication of the
1718 .B PERF_SAMPLE_ID
1719 .I id
1720 value, but included at the beginning of the sample
1721 so parsers can easily obtain the value.
1722 .TP
1723 .I ip
1724 If
1725 .B PERF_SAMPLE_IP
1726 is enabled, then a 64-bit instruction
1727 pointer value is included.
1728 .TP
1729 .IR pid ", " tid
1730 If
1731 .B PERF_SAMPLE_TID
1732 is enabled, then a 32-bit process ID
1733 and 32-bit thread ID are included.
1734 .TP
1735 .I time
1736 If
1737 .B PERF_SAMPLE_TIME
1738 is enabled, then a 64-bit timestamp
1739 is included.
1740 This is obtained via local_clock() which is a hardware timestamp
1741 if available and the jiffies value if not.
1742 .TP
1743 .I addr
1744 If
1745 .B PERF_SAMPLE_ADDR
1746 is enabled, then a 64-bit address is included.
1747 This is usually the address of a tracepoint,
1748 breakpoint, or software event; otherwise the value is 0.
1749 .TP
1750 .I id
1751 If
1752 .B PERF_SAMPLE_ID
1753 is enabled, a 64-bit unique ID is included.
1754 If the event is a member of an event group, the group leader ID is returned.
1755 This ID is the same as the one returned by
1756 .BR PERF_FORMAT_ID .
1757 .TP
1758 .I stream_id
1759 If
1760 .B PERF_SAMPLE_STREAM_ID
1761 is enabled, a 64-bit unique ID is included.
1762 Unlike
1763 .B PERF_SAMPLE_ID
1764 the actual ID is returned, not the group leader.
1765 This ID is the same as the one returned by
1766 .BR PERF_FORMAT_ID .
1767 .TP
1768 .IR cpu ", " res
1769 If
1770 .B PERF_SAMPLE_CPU
1771 is enabled, this is a 32-bit value indicating
1772 which CPU was being used, in addition to a reserved (unused)
1773 32-bit value.
1774 .TP
1775 .I period
1776 If
1777 .B PERF_SAMPLE_PERIOD
1778 is enabled, a 64-bit value indicating
1779 the current sampling period is written.
1780 .TP
1781 .I v
1782 If
1783 .B PERF_SAMPLE_READ
1784 is enabled, a structure of type read_format
1785 is included which has values for all events in the event group.
1786 The values included depend on the
1787 .I read_format
1788 value used at
1789 .BR perf_event_open ()
1790 time.
1791 .TP
1792 .IR nr ", " ips[nr]
1793 If
1794 .B PERF_SAMPLE_CALLCHAIN
1795 is enabled, then a 64-bit number is included
1796 which indicates how many 64-bit instruction pointers will
1797 follow.
1798 This is the current callchain.
1799 .TP
1800 .IR size ", " data[size]
1801 If
1802 .B PERF_SAMPLE_RAW
1803 is enabled, then a 32-bit value indicating size
1804 is included followed by an array of 8-bit values of length size.
1805 The values are padded with 0 to have 64-bit alignment.
1806
1807 This RAW record data is opaque with respect to the ABI.
1808 The ABI doesn't make any promises with respect to the stability
1809 of its content; it may vary depending
1810 on event, hardware, and kernel version.
1811 .TP
1812 .IR bnr ", " lbr[bnr]
1813 If
1814 .B PERF_SAMPLE_BRANCH_STACK
1815 is enabled, then a 64-bit value indicating
1816 the number of records is included, followed by
1817 .I bnr
1818 .I perf_branch_entry
1819 structures which each include the fields:
1820 .RS
1821 .TP
1822 .I from
1823 This indicates the source instruction (may not be a branch).
1824 .TP
1825 .I to
1826 The branch target.
1827 .TP
1828 .I mispred
1829 The branch target was mispredicted.
1830 .TP
1831 .I predicted
1832 The branch target was predicted.
1833 .TP
1834 .IR in_tx " (since Linux 3.11)"
1835 The branch was in a transactional memory transaction.
1836 .TP
1837 .IR abort " (since Linux 3.11)"
1838 The branch was in an aborted transactional memory transaction.
1839
1840 .P
1841 The entries are from most to least recent, so the first entry
1842 has the most recent branch.
1843
1844 Support for
1845 .I mispred
1846 and
1847 .I predicted
1848 is optional; if not supported, both
1849 values will be 0.
1850
1851 The type of branches recorded is specified by the
1852 .I branch_sample_type
1853 field.
1854 .RE
1855
1856 .TP
1857 .IR abi ", " regs[weight(mask)]
1858 If
1859 .B PERF_SAMPLE_REGS_USER
1860 is enabled, then the user CPU registers are recorded.
1861
1862 The
1863 .I abi
1864 field is one of
1865 .BR PERF_SAMPLE_REGS_ABI_NONE ", " PERF_SAMPLE_REGS_ABI_32 " or "
1866 .BR PERF_SAMPLE_REGS_ABI_64 .
1867
1868 The
1869 .I regs
1870 field is an array of the CPU registers that were specified by
1871 the
1872 .I sample_regs_user
1873 attr field.
1874 The number of values is the number of bits set in the
1875 .I sample_regs_user
1876 bit mask.
1877 .TP
1878 .IR size ", " data[size] ", " dyn_size
1879 If
1880 .B PERF_SAMPLE_STACK_USER
1881 is enabled, then record the user stack to enable backtracing.
1882 .I size
1883 is the size requested by the user in
1884 .I sample_stack_user
1885 or else the maximum record size.
1886 .I data
1887 is the stack data.
1888 .I dyn_size
1889 is the amount of data actually dumped (can be less than
1890 .IR size ).
1891 .TP
1892 .I weight
1893 If
1894 .B PERF_SAMPLE_WEIGHT
1895 is enabled, then a 64-bit value provided by the hardware
1896 is recorded that indicates how costly the event was.
1897 This allows expensive events to stand out more clearly
1898 in profiles.
1899 .TP
1900 .I data_src
1901 If
1902 .B PERF_SAMPLE_DATA_SRC
1903 is enabled, then a 64-bit value is recorded that is made up of
1904 the following fields:
1905 .RS
1906 .TP 4
1907 .I mem_op
1908 Type of opcode, a bitwise combination of:
1909
1910 .PD 0
1911 .RS
1912 .TP 24
1913 .B PERF_MEM_OP_NA
1914 Not available
1915 .TP
1916 .B PERF_MEM_OP_LOAD
1917 Load instruction
1918 .TP
1919 .B PERF_MEM_OP_STORE
1920 Store instruction
1921 .TP
1922 .B PERF_MEM_OP_PFETCH
1923 Prefetch
1924 .TP
1925 .B PERF_MEM_OP_EXEC
1926 Executable code
1927 .RE
1928 .PD
1929 .TP
1930 .I mem_lvl
1931 Memory hierarchy level hit or miss, a bitwise combination of:
1932
1933 .PD 0
1934 .RS
1935 .TP 24
1936 .B PERF_MEM_LVL_NA
1937 Not available
1938 .TP
1939 .B PERF_MEM_LVL_HIT
1940 Hit
1941 .TP
1942 .B PERF_MEM_LVL_MISS
1943 Miss
1944 .TP
1945 .B PERF_MEM_LVL_L1
1946 Level 1 cache
1947 .TP
1948 .B PERF_MEM_LVL_LFB
1949 Line fill buffer
1950 .TP
1951 .B PERF_MEM_LVL_L2
1952 Level 2 cache
1953 .TP
1954 .B PERF_MEM_LVL_L3
1955 Level 3 cache
1956 .TP
1957 .B PERF_MEM_LVL_LOC_RAM
1958 Local DRAM
1959 .TP
1960 .B PERF_MEM_LVL_REM_RAM1
1961 Remote DRAM 1 hop
1962 .TP
1963 .B PERF_MEM_LVL_REM_RAM2
1964 Remote DRAM 2 hops
1965 .TP
1966 .B PERF_MEM_LVL_REM_CCE1
1967 Remote cache 1 hop
1968 .TP
1969 .B PERF_MEM_LVL_REM_CCE2
1970 Remote cache 2 hops
1971 .TP
1972 .B PERF_MEM_LVL_IO
1973 I/O memory
1974 .TP
1975 .B PERF_MEM_LVL_UNC
1976 Uncached memory
1977 .RE
1978 .PD
1979 .TP
1980 .I mem_snoop
1981 Snoop mode, a bitwise combination of:
1982
1983 .PD 0
1984 .RS
1985 .TP 24
1986 .B PERF_MEM_SNOOP_NA
1987 Not available
1988 .TP
1989 .B PERF_MEM_SNOOP_NONE
1990 No snoop
1991 .TP
1992 .B PERF_MEM_SNOOP_HIT
1993 Snoop hit
1994 .TP
1995 .B PERF_MEM_SNOOP_MISS
1996 Snoop miss
1997 .TP
1998 .B PERF_MEM_SNOOP_HITM
1999 Snoop hit modified
2000 .RE
2001 .PD
2002 .TP
2003 .I mem_lock
2004 Lock instruction, a bitwise combination of:
2005
2006 .PD 0
2007 .RS
2008 .TP 24
2009 .B PERF_MEM_LOCK_NA
2010 Not available
2011 .TP
2012 .B PERF_MEM_LOCK_LOCKED
2013 Locked transaction
2014 .RE
2015 .PD
2016 .TP
2017 .I mem_dtlb
2018 TLB access hit or miss, a bitwise combination of:
2019
2020 .PD 0
2021 .RS
2022 .TP 24
2023 .B PERF_MEM_TLB_NA
2024 Not available
2025 .TP
2026 .B PERF_MEM_TLB_HIT
2027 Hit
2028 .TP
2029 .B PERF_MEM_TLB_MISS
2030 Miss
2031 .TP
2032 .B PERF_MEM_TLB_L1
2033 Level 1 TLB
2034 .TP
2035 .B PERF_MEM_TLB_L2
2036 Level 2 TLB
2037 .TP
2038 .B PERF_MEM_TLB_WK
2039 Hardware walker
2040 .TP
2041 .B PERF_MEM_TLB_OS
2042 OS fault handler
2043 .RE
2044 .PD
2045 .RE
2046 .TP
2047 .I transaction
2048 If the
2049 .B PERF_SAMPLE_TRANSACTION
2050 flag is set, then a 64-bit field is recorded describing
2051 the sources of any transactional memory aborts.
2052
2053 The field is a bitwise combination of the following values:
2054 .RS
2055 .TP
2056 .B PERF_TXN_ELISION
2057 Abort from an elision type transaction (Intel-CPU-specific).
2058 .TP
2059 .B PERF_TXN_TRANSACTION
2060 Abort from a generic transaction.
2061 .TP
2062 .B PERF_TXN_SYNC
2063 Synchronous abort (related to the reported instruction).
2064 .TP
2065 .B PERF_TXN_ASYNC
2066 Asynchronous abort (not related to the reported instruction).
2067 .TP
2068 .B PERF_TXN_RETRY
2069 Retryable abort (retrying the transaction may have succeeded).
2070 .TP
2071 .B PERF_TXN_CONFLICT
2072 Abort due to memory conflicts with other threads.
2073 .TP
2074 .B PERF_TXN_CAPACITY_WRITE
2075 Abort due to write capacity overflow.
2076 .TP
2077 .B PERF_TXN_CAPACITY_READ
2078 Abort due to read capacity overflow.
2079 .RE
2080 .IP
2081 In addition, a user-specified abort code can be obtained from
2082 the high 32 bits of the field by shifting right by
2083 .B PERF_TXN_ABORT_SHIFT
2084 and masking with
2085 .BR PERF_TXN_ABORT_MASK .
2086 .RE
2087 .RE
2088 .SS Signal overflow
2089 Events can be set to deliver a signal when a threshold is crossed.
2090 The signal handler is set up using the
2091 .BR poll (2),
2092 .BR select (2),
2093 .BR epoll (7),
2094 and
2095 .BR fcntl (2)
2096 system calls.
2097
2098 To generate signals, sampling must be enabled
2099 .RI ( sample_period
2100 must have a nonzero value).
2101
2102 There are two ways to generate signals.
2103
2104 The first is to set a
2105 .I wakeup_events
2106 or
2107 .I wakeup_watermark
2108 value that will generate a signal if a certain number of samples
2109 or bytes have been written to the mmap ring buffer.
2110 In this case, a signal of type
2111 .B POLL_IN
2112 is sent.
2113
2114 The other way is by use of the
2115 .B PERF_EVENT_IOC_REFRESH
2116 ioctl.
2117 This ioctl adds to a counter that decrements each time the event overflows.
2118 When nonzero, a
2119 .B POLL_IN
2120 signal is sent on overflow, but
2121 once the value reaches 0, a signal is sent of type
2122 .B POLL_HUP
2123 and
2124 the underlying event is disabled.
2125
2126 Note: on newer kernels (definitely noticed with 3.2)
2127 .\" FIXME(Vince) : Find out when this was introduced
2128 a signal is provided for every overflow, even if
2129 .I wakeup_events
2130 is not set.
2131 .SS rdpmc instruction
2132 Starting with Linux 3.4 on x86, you can use the
2133 .I rdpmc
2134 instruction to get low-latency reads without having to enter the kernel.
2135 Note that using
2136 .I rdpmc
2137 is not necessarily faster than other methods for reading event values.
2138
2139 Support for this can be detected with the
2140 .I cap_usr_rdpmc
2141 field in the mmap page; documentation on how
2142 to calculate event values can be found in that section.
2143 .SS perf_event ioctl calls
2144 .PP
2145 Various ioctls act on
2146 .BR perf_event_open ()
2147 file descriptors:
2148 .TP
2149 .B PERF_EVENT_IOC_ENABLE
2150 Enables the individual event or event group specified by the
2151 file descriptor argument.
2152
2153 If the
2154 .B PERF_IOC_FLAG_GROUP
2155 bit is set in the ioctl argument, then all events in a group are
2156 enabled, even if the event specified is not the group leader
2157 (but see BUGS).
2158 .TP
2159 .B PERF_EVENT_IOC_DISABLE
2160 Disables the individual counter or event group specified by the
2161 file descriptor argument.
2162
2163 Enabling or disabling the leader of a group enables or disables the
2164 entire group; that is, while the group leader is disabled, none of the
2165 counters in the group will count.
2166 Enabling or disabling a member of a group other than the leader
2167 affects only that counter; disabling a non-leader
2168 stops that counter from counting but doesn't affect any other counter.
2169
2170 If the
2171 .B PERF_IOC_FLAG_GROUP
2172 bit is set in the ioctl argument, then all events in a group are
2173 disabled, even if the event specified is not the group leader
2174 (but see BUGS).
2175 .TP
2176 .B PERF_EVENT_IOC_REFRESH
2177 Non-inherited overflow counters can use this
2178 to enable a counter for a number of overflows specified by the argument,
2179 after which it is disabled.
2180 Subsequent calls of this ioctl add the argument value to the current
2181 count.
2182 A signal with
2183 .B POLL_IN
2184 set will happen on each overflow until the
2185 count reaches 0; when that happens a signal with
2186 .B POLL_HUP
2187 set is sent and the event is disabled.
2188 Using an argument of 0 is considered undefined behavior.
2189 .TP
2190 .B PERF_EVENT_IOC_RESET
2191 Reset the event count specified by the
2192 file descriptor argument to zero.
2193 This resets only the counts; there is no way to reset the
2194 multiplexing
2195 .I time_enabled
2196 or
2197 .I time_running
2198 values.
2199
2200 If the
2201 .B PERF_IOC_FLAG_GROUP
2202 bit is set in the ioctl argument, then all events in a group are
2203 reset, even if the event specified is not the group leader
2204 (but see BUGS).
2205 .TP
2206 .B PERF_EVENT_IOC_PERIOD
2207 This updates the overflow period for the event.
2208
2209 Since Linux 3.7 (on ARM) and Linux 3.14 (all other architectures),
2210 the new period takes effect immediately.
2211 On older kernels, the new period did not take effect until
2212 after the next overflow.
2213
2214 The argument is a pointer to a 64-bit value containing the
2215 desired new period.
2216
2217 Prior to Linux 2.6.36 this ioctl always failed due to a bug
2218 in the kernel.
2219
2220 .TP
2221 .B PERF_EVENT_IOC_SET_OUTPUT
2222 This tells the kernel to report event notifications to the specified
2223 file descriptor rather than the default one.
2224 The file descriptors must all be on the same CPU.
2225
2226 The argument specifies the desired file descriptor, or \-1 if
2227 output should be ignored.
2228 .TP
2229 .BR PERF_EVENT_IOC_SET_FILTER " (since Linux 2.6.33)"
2230 This adds an ftrace filter to this event.
2231
2232 The argument is a pointer to the desired ftrace filter.
2233 .TP
2234 .BR PERF_EVENT_IOC_ID " (since Linux 3.12)"
2235 Returns the event ID value for the given event fd.
2236
2237 The argument is a pointer to a 64-bit unsigned integer
2238 to hold the result.
2239 .SS Using prctl
2240 A process can enable or disable all the event groups that are
2241 attached to it using the
2242 .BR prctl (2)
2243 .B PR_TASK_PERF_EVENTS_ENABLE
2244 and
2245 .B PR_TASK_PERF_EVENTS_DISABLE
2246 operations.
2247 This applies to all counters on the current process, whether created by
2248 this process or by another, and does not affect any counters that this
2249 process has created on other processes.
2250 It enables or disables only
2251 the group leaders, not any other members in the groups.
2252 .SS perf_event related configuration files
2253 Files in
2254 .I /proc/sys/kernel/
2255 .RS 4
2256 .TP
2257 .I /proc/sys/kernel/perf_event_paranoid
2258
2259 The
2260 .I perf_event_paranoid
2261 file can be set to restrict access to the performance counters.
2262 .RS
2263 .IP 2 4
2264 only allow user-space measurements.
2265 .IP 1
2266 allow both kernel and user measurements (default).
2267 .IP 0
2268 allow access to CPU-specific data but not raw tracepoint samples.
2269 .IP \-1
2270 no restrictions.
2271 .RE
2272 .IP
2273 The existence of the
2274 .I perf_event_paranoid
2275 file is the official method for determining if a kernel supports
2276 .BR perf_event_open ().
2277 .TP
2278 .I /proc/sys/kernel/perf_event_max_sample_rate
2279
2280 This sets the maximum sample rate.
2281 Setting this too high can allow
2282 users to sample at a rate that impacts overall machine performance
2283 and potentially lock up the machine.
2284 The default value is
2285 100000 (samples per second).
2286 .TP
2287 .I /proc/sys/kernel/perf_event_mlock_kb
2288
2289 Maximum number of pages an unprivileged user can \fBmlock\fP(2).
2290 The default is 516 (kB).
2291
2292 .RE
2293 Files in
2294 .I /sys/bus/event_source/devices/
2295 .RS 4
2296 Since Linux 2.6.34 the kernel supports having multiple PMUs
2297 available for monitoring.
2298 Information on how to program these PMUs can be found under
2299 .IR /sys/bus/event_source/devices/ .
2300 Each subdirectory corresponds to a different PMU.
2301 .TP
2302 .IR /sys/bus/event_source/devices/*/type " (since Linux 2.6.38)"
2303 This contains an integer that can be used in the
2304 .I type
2305 field of perf_event_attr to indicate you wish to use this PMU.
2306 .TP
2307 .IR /sys/bus/event_source/devices/*/rdpmc " (since Linux 3.4)"
2308 If this file is 1, then direct user-space access to the
2309 performance counter registers is allowed via the rdpmc instruction.
2310 This can be disabled by echoing 0 to the file.
2311 .TP
2312 .IR /sys/bus/event_source/devices/*/format/ " (since Linux 3.4)"
2313 This subdirectory contains information on the architecture-specific
2314 subfields available for programming the various
2315 .I config
2316 fields in the perf_event_attr struct.
2317
2318 The content of each file is the name of the config field, followed
2319 by a colon, followed by a series of integer bit ranges separated by
2320 commas.
2321 For example, the file
2322 .I event
2323 may contain the value
2324 .I config1:1,6-10,44
2325 which indicates that event is an attribute that occupies bits 1, 6-10, and 44
2326 of perf_event_attr::config1.
2327 .TP
2328 .IR /sys/bus/event_source/devices/*/events/ " (since Linux 3.4)"
2329 This subdirectory contains files with predefined events.
2330 The contents are strings describing the event settings
2331 expressed in terms of the fields found in the previously mentioned
2332 .I ./format/
2333 directory.
2334 These are not necessarily complete lists of all events supported by
2335 a PMU, but usually a subset of events deemed useful or interesting.
2336
2337 The content of each file is a list of attribute names
2338 separated by commas.
2339 Each entry has an optional value (either hex or decimal).
2340 If no value is specified, then it is assumed to be a single-bit
2341 field with a value of 1.
2342 An example entry may look like this:
2343 .IR event=0x2,inv,ldlat=3 .
2344 .TP
2345 .I /sys/bus/event_source/devices/*/uevent
2346 This file is the standard kernel device interface
2347 for injecting hotplug events.
2348 .TP
2349 .IR /sys/bus/event_source/devices/*/cpumask " (since Linux 3.7)"
2350 The
2351 .I cpumask
2352 file contains a comma-separated list of integers that
2353 indicate a representative CPU number for each socket (package)
2354 on the motherboard.
2355 This is needed when setting up uncore or northbridge events, as
2356 those PMUs present socket-wide events.
2357 .RE
2358 .SH RETURN VALUE
2359 .BR perf_event_open ()
2360 returns the new file descriptor, or \-1 if an error occurred
2361 (in which case,
2362 .I errno
2363 is set appropriately).
2364 .SH ERRORS
2365 .TP
2366 .B E2BIG
2367 Returned if the perf_event_attr
2368 .I size
2369 value is too small
2370 (smaller than
2371 .BR PERF_ATTR_SIZE_VER0 ),
2372 too big (larger than the page size),
2373 or larger than the kernel supports and the extra bytes are not zero.
2374 When
2375 .B E2BIG
2376 is returned, the perf_event_attr
2377 .I size
2378 field is overwritten by the kernel to be the size of the structure
2379 it was expecting.
2380 .TP
2381 .B EINVAL
2382 Returned if the specified event is not available.
2383 .TP
2384 .B ENOSPC
2385 Prior to Linux 3.3, if there was not enough room for the event,
2386 .B ENOSPC
2387 was returned.
2388 Linus did not like this, and this was changed to
2389 .BR EINVAL .
2390 .B ENOSPC
2391 is still returned if you try to read results into
2392 a buffer that is too small.
2393 .SH VERSIONS
2394 .BR perf_event_open ()
2395 was introduced in Linux 2.6.31 but was called
2396 .BR perf_counter_open ().
2397 It was renamed in Linux 2.6.32.
2398 .SH CONFORMING TO
2399 This
2400 .BR perf_event_open ()
2401 system call is Linux-specific
2402 and should not be used in programs intended to be portable.
2403 .SH NOTES
2404 Glibc does not provide a wrapper for this system call; call it using
2405 .BR syscall (2).
2406 See the example below.
2407
2408 The official way of knowing if
2409 .BR perf_event_open ()
2410 support is enabled is checking
2411 for the existence of the file
2412 .IR /proc/sys/kernel/perf_event_paranoid .
2413 .SH BUGS
2414 The
2415 .B F_SETOWN_EX
2416 option to
2417 .BR fcntl (2)
2418 is needed to properly get overflow signals in threads.
2419 This was introduced in Linux 2.6.32.
2420
2421 Prior to Linux 2.6.33 (at least for x86) the kernel did not check
2422 if events could be scheduled together until read time.
2423 The same happens on all known kernels if the NMI watchdog is enabled.
2424 This means that, to see if a given set of events works, you have to call
2425 .BR perf_event_open (),
2426 start, then read before you know for sure you
2427 can get valid measurements.
2428
2429 Prior to Linux 2.6.34 event constraints were not enforced by the kernel.
2430 In that case, some events would silently return "0" if the kernel
2431 scheduled them in an improper counter slot.
2432
2433 Prior to Linux 2.6.34 there was a bug when multiplexing where the
2434 wrong results could be returned.
2435
2436 From Linux 2.6.35 to Linux 2.6.39, the kernel can quickly crash if
2437 "inherit" is enabled and many threads are started.
2438
2439 Prior to Linux 2.6.35,
2440 .B PERF_FORMAT_GROUP
2441 did not work with attached processes.
2442
2443 In older Linux 2.6 versions,
2444 refreshing an event group leader refreshed all siblings,
2445 and refreshing with a parameter of 0 enabled infinite refresh.
2446 This behavior is unsupported and should not be relied on.
2447
2448 There is a bug in the kernel code between
2449 Linux 2.6.36 and Linux 3.0 that ignores the
2450 "watermark" field and acts as if a wakeup_event
2451 was chosen if the union has a
2452 nonzero value in it.
2453
2454 From Linux 2.6.31 to Linux 3.4, the
2455 .B PERF_IOC_FLAG_GROUP
2456 ioctl argument was broken and would repeatedly operate
2457 on the event specified rather than iterating across
2458 all sibling events in a group.
2459
2460 From Linux 3.4 to Linux 3.11, the mmap
2461 .I cap_usr_rdpmc
2462 and
2463 .I cap_usr_time
2464 bits mapped to the same location.
2465 Code should migrate to the new
2466 .I cap_user_rdpmc
2467 and
2468 .I cap_user_time
2469 fields instead.
2470
2471 Always double-check your results!
2472 Various generalized events have had wrong values.
2473 For example, retired branches measured
2474 the wrong thing on AMD machines until Linux 2.6.35.
2475 .SH EXAMPLE
2476 The following is a short example that measures the total
2477 instruction count of a call to
2478 .BR printf (3).
2479 .nf
2480
2481 #include <stdlib.h>
2482 #include <stdio.h>
2483 #include <unistd.h>
2484 #include <string.h>
2485 #include <sys/ioctl.h>
2486 #include <linux/perf_event.h>
2487 #include <asm/unistd.h>
2488
2489 static long
2490 perf_event_open(struct perf_event_attr *hw_event, pid_t pid,
2491                 int cpu, int group_fd, unsigned long flags)
2492 {
2493     int ret;
2494
2495     ret = syscall(__NR_perf_event_open, hw_event, pid, cpu,
2496                    group_fd, flags);
2497     return ret;
2498 }
2499
2500 int
2501 main(int argc, char **argv)
2502 {
2503     struct perf_event_attr pe;
2504     long long count;
2505     int fd;
2506
2507     memset(&pe, 0, sizeof(struct perf_event_attr));
2508     pe.type = PERF_TYPE_HARDWARE;
2509     pe.size = sizeof(struct perf_event_attr);
2510     pe.config = PERF_COUNT_HW_INSTRUCTIONS;
2511     pe.disabled = 1;
2512     pe.exclude_kernel = 1;
2513     pe.exclude_hv = 1;
2514
2515     fd = perf_event_open(&pe, 0, \-1, \-1, 0);
2516     if (fd == \-1) {
2517        fprintf(stderr, "Error opening leader %llx\\n", pe.config);
2518        exit(EXIT_FAILURE);
2519     }
2520
2521     ioctl(fd, PERF_EVENT_IOC_RESET, 0);
2522     ioctl(fd, PERF_EVENT_IOC_ENABLE, 0);
2523
2524     printf("Measuring instruction count for this printf\\n");
2525
2526     ioctl(fd, PERF_EVENT_IOC_DISABLE, 0);
2527     read(fd, &count, sizeof(long long));
2528
2529     printf("Used %lld instructions\\n", count);
2530
2531     close(fd);
2532 }
2533 .fi
2534 .SH SEE ALSO
2535 .BR fcntl (2),
2536 .BR mmap (2),
2537 .BR open (2),
2538 .BR prctl (2),
2539 .BR read (2)
2540 .SH COLOPHON
2541 This page is part of release 3.64 of the Linux
2542 .I man-pages
2543 project.
2544 A description of the project,
2545 and information about reporting bugs,
2546 can be found at
2547 \%http://www.kernel.org/doc/man\-pages/.