original/man2/perf_event_open.2

   1 .\" Copyright (c) 2012, Vincent Weaver
   2 .\"
   3 .\" %%%LICENSE_START(GPLv2+_DOC_FULL)
   4 .\" This is free documentation; you can redistribute it and/or
   5 .\" modify it under the terms of the GNU General Public License as
   6 .\" published by the Free Software Foundation; either version 2 of
   7 .\" the License, or (at your option) any later version.
   8 .\"
   9 .\" The GNU General Public License's references to "object code"
  10 .\" and "executables" are to be interpreted as the output of any
  11 .\" document formatting or typesetting system, including
  12 .\" intermediate and printed output.
  13 .\"
  14 .\" This manual is distributed in the hope that it will be useful,
  15 .\" but WITHOUT ANY WARRANTY; without even the implied warranty of
  16 .\" MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  17 .\" GNU General Public License for more details.
  18 .\"
  19 .\" You should have received a copy of the GNU General Public
  20 .\" License along with this manual; if not, see
  21 .\" <http://www.gnu.org/licenses/>.
  22 .\" %%%LICENSE_END
  23 .\"
  24 .\" This document is based on the perf_event.h header file, the
  25 .\" tools/perf/design.txt file, and a lot of bitter experience.
  26 .\"
  27 .TH PERF_EVENT_OPEN 2 2013-02-04 "Linux" "Linux Programmer's Manual"
  28 .SH NAME
  29 perf_event_open \- set up performance monitoring
  30 .SH SYNOPSIS
  31 .nf
  32 .B #include <linux/perf_event.h>
  33 .B #include <linux/hw_breakpoint.h>
  34 .sp
  35 .BI "int perf_event_open(struct perf_event_attr *" attr ,
  36 .BI "                    pid_t " pid ", int " cpu ", int " group_fd ,
  37 .BI "                    unsigned long " flags  );
  38 .fi
  39
  40 .IR Note :
  41 There is no glibc wrapper for this system call; see NOTES.
  42 .SH DESCRIPTION
  43 Given a list of parameters,
  44 .BR perf_event_open ()
  45 returns a file descriptor, for use in subsequent system calls
  46 .RB ( read "(2), " mmap "(2), " prctl "(2), " fcntl "(2), etc.)."
  47 .PP
  48 A call to
  49 .BR perf_event_open ()
  50 creates a file descriptor that allows measuring performance
  51 information.
  52 Each file descriptor corresponds to one
  53 event that is measured; these can be grouped together
  54 to measure multiple events simultaneously.
  55 .PP
  56 Events can be enabled and disabled in two ways: via
  57 .BR ioctl (2)
  58 and via
  59 .BR prctl (2) .
  60 When an event is disabled it does not count or generate overflows but does
  61 continue to exist and maintain its count value.
  62 .PP
  63 Events come in two flavors: counting and sampled.
  64 A
  65 .I counting
  66 event is one that is used for counting the aggregate number of events
  67 that occur.
  68 In general, counting event results are gathered with a
  69 .BR read (2)
  70 call.
  71 A
  72 .I sampling
  73 event periodically writes measurements to a buffer that can then
  74 be accessed via
  75 .BR  mmap (2) .
  76 .SS Arguments
  77 .P
  78 The argument
  79 .I pid
  80 allows events to be attached to processes in various ways.
  81 If
  82 .I pid
  83 is 0, measurements happen on the current thread; if
  84 .I pid
  85 is greater than 0, the process indicated by
  86 .I pid
  87 is measured, and if
  88 .I pid
  89 is \-1, all processes are counted.
  90
  91 The
  92 .I cpu
  93 argument allows measurements to be specific to a CPU.
  94 If
  95 .I cpu
  96 is greater than or equal to 0,
  97 measurements are restricted to the specified CPU;
  98 if
  99 .I cpu
 100 is \-1, the events are measured on all CPUs.
 101 .P
 102 Note that the combination of
 103 .IR pid " == \-1"
 104 and
 105 .IR cpu " == \-1"
 106 is not valid.
 107 .P
 108 A
 109 .IR pid " > 0"
 110 and
 111 .IR cpu " == \-1"
 112 setting measures per-process and follows that process to whatever CPU the
 113 process gets scheduled to.
 114 Per-process events can be created by any user.
 115 .P
 116 A
 117 .IR pid " == \-1"
 118 and
 119 .IR cpu " >= 0"
 120 setting is per-CPU and measures all processes on the specified CPU.
 121 Per-CPU events need the
 122 .B CAP_SYS_ADMIN
 123 capability or a
 124 .I /proc/sys/kernel/perf_event_paranoid
 125 value of less than 1.
 126 .P
 127 The
 128 .I group_fd
 129 argument allows event groups to be created.
 130 An event group has one event which is the group leader.
 131 The leader is created first, with
 132 .IR group_fd " = \-1."
 133 The rest of the group members are created with subsequent
 134 .BR perf_event_open ()
 135 calls with
 136 .IR group_fd
 137 being set to the fd of the group leader.
 138 (A single event on its own is created with
 139 .IR group_fd " = \-1"
 140 and is considered to be a group with only 1 member.)
 141 An event group is scheduled onto the CPU as a unit: it will only
 142 be put onto the CPU if all of the events in the group can be put onto
 143 the CPU.
 144 This means that the values of the member events can be
 145 meaningfully compared, added, divided (to get ratios), etc., with each
 146 other, since they have counted events for the same set of executed
 147 instructions.
 148 .P
 149 The
 150 .I flags
 151 argument takes one of the following values:
 152 .TP
 153 .BR PERF_FLAG_FD_NO_GROUP
 154 .\" FIXME The following sentence is unclear
 155 This flag allows creating an event as part of an event group but
 156 having no group leader.
 157 It is unclear why this is useful.
 158 .\" FIXME So, why is it useful?
 159 .TP
 160 .BR PERF_FLAG_FD_OUTPUT
 161 This flag re-routes the output from an event to the group leader.
 162 .TP
 163 .BR PERF_FLAG_PID_CGROUP " (Since Linux 2.6.39)"
 164 This flag activates per-container system-wide monitoring.
 165 A container
 166 is an abstraction that isolates a set of resources for finer grain
 167 control (CPUs, memory, etc.).
 168 In this mode, the event is measured
 169 only if the thread running on the monitored CPU belongs to the designated
 170 container (cgroup).
 171 The cgroup is identified by passing a file descriptor
 172 opened on its directory in the cgroupfs filesystem.
 173 For instance, if the
 174 cgroup to monitor is called
 175 .IR test ,
 176 then a file descriptor opened on
 177 .I /dev/cgroup/test
 178 (assuming cgroupfs is mounted on
 179 .IR /dev/cgroup )
 180 must be passed as the
 181 .I pid
 182 parameter.
 183 cgroup monitoring is only available
 184 for system-wide events and may therefore require extra permissions.
 185 .P
 186 The
 187 .I perf_event_attr
 188 structure provides detailed configuration information
 189 for the event being created.
 190
 191 .in +4n
 192 .nf
 193 struct perf_event_attr {
 194     __u32     type;         /* Type of event */
 195     __u32     size;         /* Size of attribute structure */
 196     __u64     config;       /* Type-specific configuration */
 197
 198     union {
 199         __u64 sample_period;    /* Period of sampling */
 200         __u64 sample_freq;      /* Frequency of sampling */
 201     };
 202
 203     __u64     sample_type;  /* Specifies values included in sample */
 204     __u64     read_format;  /* Specifies values returned in read */
 205
 206     __u64     disabled       : 1,   /* off by default */
 207               inherit        : 1,   /* children inherit it */
 208               pinned         : 1,   /* must always be on PMU */
 209               exclusive      : 1,   /* only group on PMU */
 210               exclude_user   : 1,   /* don't count user */
 211               exclude_kernel : 1,   /* don't count kernel */
 212               exclude_hv     : 1,   /* don't count hypervisor */
 213               exclude_idle   : 1,   /* don't count when idle */
 214               mmap           : 1,   /* include mmap data */
 215               comm           : 1,   /* include comm data */
 216               freq           : 1,   /* use freq, not period */
 217               inherit_stat   : 1,   /* per task counts */
 218               enable_on_exec : 1,   /* next exec enables */
 219               task           : 1,   /* trace fork/exit */
 220               watermark      : 1,   /* wakeup_watermark */
 221               precise_ip     : 2,   /* skid constraint */
 222               mmap_data      : 1,   /* non-exec mmap data */
 223               sample_id_all  : 1,   /* sample_type all events */
 224               exclude_host   : 1,   /* don't count in host */
 225               exclude_guest  : 1,   /* don't count in guest */
 226               exclude_callchain_kernel : 1,
 227                                     /* exclude kernel callchains */
 228               exclude_callchain_user   : 1,
 229                                     /* exclude user callchains */
 230               __reserved_1   : 41;
 231
 232     union {
 233         __u32 wakeup_events;    /* wakeup every n events */
 234         __u32 wakeup_watermark; /* bytes before wakeup */
 235     };
 236
 237     __u32     bp_type;          /* breakpoint type */
 238
 239     union {
 240         __u64 bp_addr;          /* breakpoint address */
 241         __u64 config1;          /* extension of config */
 242     };
 243
 244     union {
 245         __u64 bp_len;           /* breakpoint length */
 246         __u64 config2;          /* extension of config1 */
 247     };
 248     __u64   branch_sample_type; /* enum perf_branch_sample_type */
 249     __u64   sample_regs_user;   /* user regs to dump on samples */
 250     __u32   sample_stack_user;  /* size of stack to dump on
 251                                    samples */
 252     __u32   __reserved_2;       /* Align to u64 */
 253
 254 };
 255 .fi
 256 .in
 257
 258 The fields of the
 259 .I perf_event_attr
 260 structure are described in more detail below:
 261 .TP
 262 .I type
 263 This field specifies the overall event type.
 264 It has one of the following values:
 265 .RS
 266 .TP
 267 .B PERF_TYPE_HARDWARE
 268 This indicates one of the "generalized" hardware events provided
 269 by the kernel.
 270 See the
 271 .I config
 272 field definition for more details.
 273 .TP
 274 .B PERF_TYPE_SOFTWARE
 275 This indicates one of the software-defined events provided by the kernel
 276 (even if no hardware support is available).
 277 .TP
 278 .B PERF_TYPE_TRACEPOINT
 279 This indicates a tracepoint
 280 provided by the kernel tracepoint infrastructure.
 281 .TP
 282 .B PERF_TYPE_HW_CACHE
 283 This indicates a hardware cache event.
 284 This has a special encoding, described in the
 285 .I config
 286 field definition.
 287 .TP
 288 .B PERF_TYPE_RAW
 289 This indicates a "raw" implementation-specific event in the
 290 .IR config " field."
 291 .TP
 292 .BR PERF_TYPE_BREAKPOINT " (Since Linux 2.6.33)"
 293 This indicates a hardware breakpoint as provided by the CPU.
 294 Breakpoints can be read/write accesses to an address as well as
 295 execution of an instruction address.
 296 .TP
 297 .RB "dynamic PMU"
 298 Since Linux 2.6.39,
 299 .BR perf_event_open ()
 300 can support multiple PMUs.
 301 To enable this, a value exported by the kernel can be used in the
 302 .I type
 303 field to indicate which PMU to use.
 304 The value to use can be found in the sysfs filesystem:
 305 there is a subdirectory per PMU instance under
 306 .IR /sys/bus/event_source/devices .
 307 In each sub-directory there is a
 308 .I type
 309 file whose content is an integer that can be used in the
 310 .I type
 311 field.
 312 For instance,
 313 .I /sys/bus/event_source/devices/cpu/type
 314 contains the value for the core CPU PMU, which is usually 4.
 315 .RE
 316 .TP
 317 .I "size"
 318 The size of the
 319 .I perf_event_attr
 320 structure for forward/backward compatibility.
 321 Set this using
 322 .I sizeof(struct perf_event_attr)
 323 to allow the kernel to see
 324 the struct size at the time of compilation.
 325
 326 The related define
 327 .B PERF_ATTR_SIZE_VER0
 328 is set to 64; this was the size of the first published struct.
 329 .B PERF_ATTR_SIZE_VER1
 330 is 72, corresponding to the addition of breakpoints in Linux 2.6.33.
 331 .B PERF_ATTR_SIZE_VER2
 332 is 80, corresponding to the addition of branch sampling in Linux 3.4.
 333 .B PERF_ATTR_SIZE_VER3
 334 is 96, corresponding to the addition
 335 of
 336 .I sample_regs_user
 337 and
 338 .I sample_stack_user
 339 in Linux 3.7.
 340 .TP
 341 .I "config"
 342 This specifies which event you want, in conjunction with
 343 the
 344 .I type
 345 field.
 346 The
 347 .IR config1 " and " config2
 348 fields are also taken into account in cases where 64 bits is not
 349 enough to fully specify the event.
 350 The encoding of these fields is event dependent.
 351
 352 The most significant bit (bit 63) of
 353 .I config
 354 signifies CPU-specific (raw) counter configuration data;
 355 if the most significant bit is unset, the next 7 bits are an event
 356 type and the rest of the bits are the event identifier.
 357
 358 There are various ways to set the
 359 .I config
 360 field that are dependent on the value of the previously
 361 described
 362 .I type
 363 field.
 364 What follows are various possible settings for
 365 .I config
 366 separated out by
 367 .IR type .
 368
 369 If
 370 .I type
 371 is
 372 .BR PERF_TYPE_HARDWARE ,
 373 we are measuring one of the generalized hardware CPU events.
 374 Not all of these are available on all platforms.
 375 Set
 376 .I config
 377 to one of the following:
 378 .RS 12
 379 .TP
 380 .B PERF_COUNT_HW_CPU_CYCLES
 381 Total cycles.
 382 Be wary of what happens during CPU frequency scaling.
 383 .TP
 384 .B PERF_COUNT_HW_INSTRUCTIONS
 385 Retired instructions.
 386 Be careful, these can be affected by various
 387 issues, most notably hardware interrupt counts.
 388 .TP
 389 .B PERF_COUNT_HW_CACHE_REFERENCES
 390 Cache accesses.
 391 Usually this indicates Last Level Cache accesses but this may
 392 vary depending on your CPU.
 393 This may include prefetches and coherency messages; again this
 394 depends on the design of your CPU.
 395 .TP
 396 .B PERF_COUNT_HW_CACHE_MISSES
 397 Cache misses.
 398 Usually this indicates Last Level Cache misses; this is intended to be
 399 used in conjunction with the
 400 .B PERF_COUNT_HW_CACHE_REFERENCES
 401 event to calculate cache miss rates.
 402 .TP
 403 .B PERF_COUNT_HW_BRANCH_INSTRUCTIONS
 404 Retired branch instructions.
 405 Prior to Linux 2.6.34, this used
 406 the wrong event on AMD processors.
 407 .TP
 408 .B PERF_COUNT_HW_BRANCH_MISSES
 409 Mispredicted branch instructions.
 410 .TP
 411 .B PERF_COUNT_HW_BUS_CYCLES
 412 Bus cycles, which can be different from total cycles.
 413 .TP
 414 .BR PERF_COUNT_HW_STALLED_CYCLES_FRONTEND " (Since Linux 3.0)"
 415 Stalled cycles during issue.
 416 .TP
 417 .BR PERF_COUNT_HW_STALLED_CYCLES_BACKEND  " (Since Linux 3.0)"
 418 Stalled cycles during retirement.
 419 .TP
 420 .BR PERF_COUNT_HW_REF_CPU_CYCLES  " (Since Linux 3.3)"
 421 Total cycles; not affected by CPU frequency scaling.
 422 .RE
 423 .IP
 424 If
 425 .I type
 426 is
 427 .BR PERF_TYPE_SOFTWARE ,
 428 we are measuring software events provided by the kernel.
 429 Set
 430 .I config
 431 to one of the following:
 432 .RS 12
 433 .TP
 434 .B PERF_COUNT_SW_CPU_CLOCK
 435 This reports the CPU clock, a high-resolution per-CPU timer.
 436 .TP
 437 .B PERF_COUNT_SW_TASK_CLOCK
 438 This reports a clock count specific to the task that is running.
 439 .TP
 440 .B PERF_COUNT_SW_PAGE_FAULTS
 441 This reports the number of page faults.
 442 .TP
 443 .B PERF_COUNT_SW_CONTEXT_SWITCHES
 444 This counts context switches.
 445 Until Linux 2.6.34, these were all reported as user-space
 446 events; after that they are reported as happening in the kernel.
 447 .TP
 448 .B PERF_COUNT_SW_CPU_MIGRATIONS
 449 This reports the number of times the process
 450 has migrated to a new CPU.
 451 .TP
 452 .B PERF_COUNT_SW_PAGE_FAULTS_MIN
 453 This counts the number of minor page faults.
 454 These did not require disk I/O to handle.
 455 .TP
 456 .B PERF_COUNT_SW_PAGE_FAULTS_MAJ
 457 This counts the number of major page faults.
 458 These required disk I/O to handle.
 459 .TP
 460 .BR PERF_COUNT_SW_ALIGNMENT_FAULTS " (Since Linux 2.6.33)"
 461 This counts the number of alignment faults.
 462 These happen when unaligned memory accesses happen; the kernel
 463 can handle these but it reduces performance.
 464 This only happens on some architectures (never on x86).
 465 .TP
 466 .BR PERF_COUNT_SW_EMULATION_FAULTS " (Since Linux 2.6.33)"
 467 This counts the number of emulation faults.
 468 The kernel sometimes traps on unimplemented instructions
 469 and emulates them for user space.
 470 This can negatively impact performance.
 471 .RE
 472
 473 .RS
 474 If
 475 .I type
 476 is
 477 .BR PERF_TYPE_TRACEPOINT ,
 478 then we are measuring kernel tracepoints.
 479 The value to use in
 480 .I config
 481 can be obtained from under debugfs
 482 .I tracing/events/*/*/id
 483 if ftrace is enabled in the kernel.
 484 .RE
 485
 486 .RS
 487 If
 488 .I type
 489 is
 490 .BR PERF_TYPE_HW_CACHE ,
 491 then we are measuring a hardware CPU cache event.
 492 To calculate the appropriate
 493 .I config
 494 value use the following equation:
 495 .RS 4
 496 .nf
 497
 498     (perf_hw_cache_id) | (perf_hw_cache_op_id << 8) |
 499     (perf_hw_cache_op_result_id << 16)
 500 .fi
 501 .P
 502 where
 503 .I perf_hw_cache_id
 504 is one of:
 505 .RS 4
 506 .TP
 507 .B PERF_COUNT_HW_CACHE_L1D
 508 for measuring Level 1 Data Cache
 509 .TP
 510 .B PERF_COUNT_HW_CACHE_L1I
 511 for measuring Level 1 Instruction Cache
 512 .TP
 513 .B PERF_COUNT_HW_CACHE_LL
 514 for measuring Last-Level Cache
 515 .TP
 516 .B PERF_COUNT_HW_CACHE_DTLB
 517 for measuring the Data TLB
 518 .TP
 519 .B PERF_COUNT_HW_CACHE_ITLB
 520 for measuring the Instruction TLB
 521 .TP
 522 .B PERF_COUNT_HW_CACHE_BPU
 523 for measuring the branch prediction unit
 524 .TP
 525 .BR PERF_COUNT_HW_CACHE_NODE " (Since Linux 3.0)"
 526 for measuring local memory accesses
 527 .RE
 528 .P
 529 and
 530 .I perf_hw_cache_op_id
 531 is one of:
 532 .RS 4
 533 .TP
 534 .B PERF_COUNT_HW_CACHE_OP_READ
 535 for read accesses
 536 .TP
 537 .B PERF_COUNT_HW_CACHE_OP_WRITE
 538 for write accesses
 539 .TP
 540 .B PERF_COUNT_HW_CACHE_OP_PREFETCH
 541 for prefetch accesses
 542 .RE
 543 .P
 544 and
 545 .I perf_hw_cache_op_result_id
 546 is one of:
 547 .RS 4
 548 .TP
 549 .B PERF_COUNT_HW_CACHE_RESULT_ACCESS
 550 to measure accesses
 551 .TP
 552 .B PERF_COUNT_HW_CACHE_RESULT_MISS
 553 to measure misses
 554 .RE
 555 .RE
 556
 557 If
 558 .I type
 559 is
 560 .BR PERF_TYPE_RAW ,
 561 then a custom "raw"
 562 .I config
 563 value is needed.
 564 Most CPUs support events that are not covered by the "generalized" events.
 565 These are implementation defined; see your CPU manual (for example
 566 the Intel Volume 3B documentation or the AMD BIOS and Kernel Developer
 567 Guide).
 568 The libpfm4 library can be used to translate from the name in the
 569 architectural manuals to the raw hex value
 570 .BR perf_event_open ()
 571 expects in this field.
 572
 573 If
 574 .I type
 575 is
 576 .BR PERF_TYPE_BREAKPOINT ,
 577 then leave
 578 .I config
 579 set to zero.
 580 Its parameters are set in other places.
 581 .RE
 582 .TP
 583 .IR sample_period ", " sample_freq
 584 A "sampling" counter is one that generates an interrupt
 585 every N events, where N is given by
 586 .IR sample_period .
 587 A sampling counter has
 588 .IR sample_period " > 0."
 589 When an overflow interrupt occurs, requested data is recorded
 590 in the mmap buffer.
 591 The
 592 .I sample_type
 593 field controls what data is recorded on each interrupt.
 594
 595 .I sample_freq
 596 can be used if you wish to use frequency rather than period.
 597 In this case you set the
 598 .I freq
 599 flag.
 600 The kernel will adjust the sampling period
 601 to try and achieve the desired rate.
 602 The rate of adjustment is a
 603 timer tick.
 604 .TP
 605 .I "sample_type"
 606 The various bits in this field specify which values to include
 607 in the sample.
 608 They will be recorded in a ring-buffer,
 609 which is available to user space using
 610 .BR mmap (2).
 611 The order in which the values are saved in the
 612 sample is documented in the MMAP Layout subsection below;
 613 it is not the
 614 .I "enum perf_event_sample_format"
 615 order.
 616 .RS
 617 .TP
 618 .B PERF_SAMPLE_IP
 619 Records instruction pointer.
 620 .TP
 621 .B PERF_SAMPLE_TID
 622 Records the process and thread IDs.
 623 .TP
 624 .B PERF_SAMPLE_TIME
 625 Records a timestamp.
 626 .TP
 627 .B PERF_SAMPLE_ADDR
 628 Records an address, if applicable.
 629 .TP
 630 .B PERF_SAMPLE_READ
 631 Records counter values for all events in a group, not just the group leader.
 632 .TP
 633 .B PERF_SAMPLE_CALLCHAIN
 634 Records the callchain (stack backtrace).
 635 .TP
 636 .B PERF_SAMPLE_ID
 637 Records a unique ID for the opened event's group leader.
 638 .TP
 639 .B PERF_SAMPLE_CPU
 640 Records CPU number.
 641 .TP
 642 .B PERF_SAMPLE_PERIOD
 643 Records the current sampling period.
 644 .TP
 645 .B PERF_SAMPLE_STREAM_ID
 646 Records a unique ID for the opened event.
 647 Unlike
 648 .B PERF_SAMPLE_ID
 649 the actual ID is returned, not the group leader's.
 650 This ID is the same as the one returned by PERF_FORMAT_ID.
 651 .TP
 652 .B PERF_SAMPLE_RAW
 653 Records additional data, if applicable.
 654 Usually returned by tracepoint events.
 655 .TP
 656 .BR PERF_SAMPLE_BRANCH_STACK " (Since Linux 3.4)"
 657 Records the branch stack.
 658 See branch_sample_type.
 659 .TP
 660 .BR PERF_SAMPLE_REGS_USER " (Since Linux 3.7)"
 661 Records the current register state.
 662 .TP
 663 .BR PERF_SAMPLE_STACK_USER " (Since Linux 3.7)"
 664 [To be documented]
 665 .RE
 666 .TP
 667 .IR "read_format"
 668 This field specifies the format of the data returned by
 669 .BR read (2)
 670 on a
 671 .BR perf_event_open ()
 672 file descriptor.
 673 .RS
 674 .TP
 675 .B PERF_FORMAT_TOTAL_TIME_ENABLED
 676 Adds the 64-bit
 677 .I time_enabled
 678 field.
 679 This can be used to calculate estimated totals if
 680 the PMU is overcommitted and multiplexing is happening.
 681 .TP
 682 .B PERF_FORMAT_TOTAL_TIME_RUNNING
 683 Adds the 64-bit
 684 .I time_running
 685 field.
 686 This can be used to calculate estimated totals if
 687 the PMU is overcommitted and multiplexing is happening.
 688 .TP
 689 .B PERF_FORMAT_ID
 690 Adds a 64-bit unique value that corresponds to the event group.
 691 .TP
 692 .B PERF_FORMAT_GROUP
 693 Allows all counter values in an event group to be read with one read.
 694 .RE
 695 .TP
 696 .IR "disabled"
 697 The
 698 .I disabled
 699 bit specifies whether the counter starts out disabled or enabled.
 700 If disabled, the event can later be enabled by
 701 .BR ioctl (2),
 702 .BR prctl (2),
 703 or
 704 .IR enable_on_exec .
 705 .TP
 706 .IR "inherit"
 707 The
 708 .I inherit
 709 bit specifies that this counter should count events of child
 710 tasks as well as the task specified.
 711 This only applies to new children, not to any existing children at
 712 the time the counter is created (nor to any new children of
 713 existing children).
 714
 715 Inherit does not work for some combinations of
 716 .IR read_format s,
 717 such as
 718 .BR PERF_FORMAT_GROUP .
 719 .TP
 720 .IR "pinned"
 721 The
 722 .I pinned
 723 bit specifies that the counter should always be on the CPU if at all
 724 possible.
 725 It only applies to hardware counters and only to group leaders.
 726 If a pinned counter cannot be put onto the CPU (e.g., because there are
 727 not enough hardware counters or because of a conflict with some other
 728 event), then the counter goes into an 'error' state, where reads
 729 return end-of-file (i.e.,
 730 .BR read (2)
 731 returns 0) until the counter is subsequently enabled or disabled.
 732 .TP
 733 .IR "exclusive"
 734 The
 735 .I exclusive
 736 bit specifies that when this counter's group is on the CPU,
 737 it should be the only group using the CPU's counters.
 738 In the future this may allow monitoring programs to
 739 support PMU features that need to run alone so that they do not
 740 disrupt other hardware counters.
 741 .TP
 742 .IR "exclude_user"
 743 If this bit is set, the count excludes events that happen in user space.
 744 .TP
 745 .IR "exclude_kernel"
 746 If this bit is set, the count excludes events that happen in kernel-space.
 747 .TP
 748 .IR "exclude_hv"
 749 If this bit is set, the count excludes events that happen in the
 750 hypervisor.
 751 This is mainly for PMUs that have built-in support for handling this
 752 (such as POWER).
 753 Extra support is needed for handling hypervisor measurements on most
 754 machines.
 755 .TP
 756 .IR "exclude_idle"
 757 If set, don't count when the CPU is idle.
 758 .TP
 759 .IR "mmap"
 760 The
 761 .I mmap
 762 bit enables recording of exec mmap events.
 763 .TP
 764 .IR "comm"
 765 The
 766 .I comm
 767 bit enables tracking of process command name as modified by the
 768 .BR exec (2)
 769 and
 770 .BR prctl (PR_SET_NAME)
 771 system calls.
 772 Unfortunately for tools,
 773 there is no way to distinguish one system call versus the other.
 774 .TP
 775 .IR "freq"
 776 If this bit is set, then
 777 .I sample_freq
 778 not
 779 .I sample_period
 780 is used when setting up the sampling interval.
 781 .TP
 782 .IR "inherit_stat"
 783 This bit enables saving of event counts on context switch for
 784 inherited tasks.
 785 This is only meaningful if the
 786 .I inherit
 787 field is set.
 788 .TP
 789 .IR "enable_on_exec"
 790 If this bit is set, a counter is automatically
 791 enabled after a call to
 792 .BR exec (2).
 793 .TP
 794 .IR "task"
 795 If this bit is set, then
 796 fork/exit notifications are included in the ring buffer.
 797 .TP
 798 .IR "watermark"
 799 If set, have a sampling interrupt happen when we cross the
 800 .I wakeup_watermark
 801 boundary.
 802 Otherwise interrupts happen after
 803 .I wakeup_events
 804 samples.
 805 .TP
 806 .IR "precise_ip" " (Since Linux 2.6.35)"
 807 This controls the amount of skid.
 808 Skid is how many instructions
 809 execute between an event of interest happening and the kernel
 810 being able to stop and record the event.
 811 Smaller skid is
 812 better and allows more accurate reporting of which events
 813 correspond to which instructions, but hardware is often limited
 814 with how small this can be.
 815
 816 The values of this are the following:
 817 .RS
 818 .TP
 819 0 -
 820 .B SAMPLE_IP
 821 can have arbitrary skid
 822 .TP
 823 1 -
 824 .B SAMPLE_IP
 825 must have constant skid
 826 .TP
 827 2 -
 828 .B SAMPLE_IP
 829 requested to have 0 skid
 830 .TP
 831 3 -
 832 .B SAMPLE_IP
 833 must have 0 skid.
 834 See also
 835 .BR PERF_RECORD_MISC_EXACT_IP .
 836 .RE
 837 .TP
 838 .IR "mmap_data" " (Since Linux 2.6.36)"
 839 The counterpart of the
 840 .I mmap
 841 field, but enables including data mmap events
 842 in the ring-buffer.
 843 .TP
 844 .IR "sample_id_all" " (Since Linux 2.6.38)"
 845 If set, then TID, TIME, ID, CPU, and STREAM_ID can
 846 additionally be included in
 847 .RB non- PERF_RECORD_SAMPLE s
 848 if the corresponding
 849 .I sample_type
 850 is selected.
 851 .TP
 852 .IR "exclude_host" " (Since Linux 3.2)"
 853 Do not measure time spent in VM host.
 854 .TP
 855 .IR "exclude_guest" " (Since Linux 3.2)"
 856 Do not measure time spent in VM guest.
 857 .TP
 858 .IR "exclude_callchain_kernel" " (Since Linux 3.7)"
 859 Do not include kernel callchains.
 860 .TP
 861 .IR "exclude_callchain_user" " (Since Linux 3.7)"
 862 Do not include user callchains.
 863 .TP
 864 .IR "wakeup_events" ", " "wakeup_watermark"
 865 This union sets how many samples
 866 .RI ( wakeup_events )
 867 or bytes
 868 .RI ( wakeup_watermark )
 869 happen before an overflow signal happens.
 870 Which one is used is selected by the
 871 .I watermark
 872 bitflag.
 873 .TP
 874 .IR "bp_type" " (Since Linux 2.6.33)"
 875 This chooses the breakpoint type.
 876 It is one of:
 877 .RS
 878 .TP
 879 .BR HW_BREAKPOINT_EMPTY
 880 no breakpoint
 881 .TP
 882 .BR HW_BREAKPOINT_R
 883 count when we read the memory location
 884 .TP
 885 .BR HW_BREAKPOINT_W
 886 count when we write the memory location
 887 .TP
 888 .BR HW_BREAKPOINT_RW
 889 count when we read or write the memory location
 890 .TP
 891 .BR HW_BREAKPOINT_X
 892 count when we execute code at the memory location
 893 .LP
 894 The values can be combined via a bitwise or, but the
 895 combination of
 896 .B HW_BREAKPOINT_R
 897 or
 898 .B HW_BREAKPOINT_W
 899 with
 900 .B HW_BREAKPOINT_X
 901 is not allowed.
 902 .RE
 903 .TP
 904 .IR "bp_addr" " (Since Linux 2.6.33)"
 905 .I bp_addr
 906 is the address of the breakpoint.
 907 For execution breakpoints this is the memory address of the instruction
 908 of interest; for read and write breakpoints it is the memory address
 909 of the memory location of interest.
 910 .TP
 911 .IR "config1" " (Since Linux 2.6.39)"
 912 .I config1
 913 is used for setting events that need an extra register or otherwise
 914 do not fit in the regular config field.
 915 Raw OFFCORE_EVENTS on Nehalem/Westmere/SandyBridge use this field
 916 on 3.3 and later kernels.
 917 .TP
 918 .IR "bp_len" " (Since Linux 2.6.33)"
 919 .I bp_len
 920 is the length of the breakpoint being measured if
 921 .I type
 922 is
 923 .BR PERF_TYPE_BREAKPOINT .
 924 Options are
 925 .BR HW_BREAKPOINT_LEN_1 ,
 926 .BR HW_BREAKPOINT_LEN_2 ,
 927 .BR HW_BREAKPOINT_LEN_4 ,
 928 .BR HW_BREAKPOINT_LEN_8 .
 929 For an execution breakpoint, set this to
 930 .IR sizeof(long) .
 931 .TP
 932 .IR "config2" " (Since Linux 2.6.39)"
 933
 934 .I config2
 935 is a further extension of the
 936 .I config1
 937 field.
 938 .TP
 939 .IR "branch_sample_type" " (Since Linux 3.4)"
 940 This is used with the CPU's hardware branch sampling, if available.
 941 It can have one of the following values:
 942 .RS
 943 .TP
 944 .B PERF_SAMPLE_BRANCH_USER
 945 Branch target is in user space
 946 .TP
 947 .B PERF_SAMPLE_BRANCH_KERNEL
 948 Branch target is in kernel space
 949 .TP
 950 .B PERF_SAMPLE_BRANCH_HV
 951 Branch target is in hypervisor
 952 .TP
 953 .B PERF_SAMPLE_BRANCH_ANY
 954 Any branch type.
 955 .TP
 956 .B PERF_SAMPLE_BRANCH_ANY_CALL
 957 Any call branch
 958 .TP
 959 .B PERF_SAMPLE_BRANCH_ANY_RETURN
 960 Any return branch
 961 .TP
 962 .BR PERF_SAMPLE_BRANCH_IND_CALL
 963 Indirect calls
 964 .TP
 965 .BR PERF_SAMPLE_BRANCH_PLM_ALL
 966 User, kernel, and hv
 967 .RE
 968 .TP
 969 .IR "sample_regs_user" " (Since Linux 3.7)"
 970 This defines the set of user registers to dump on samples.
 971 See
 972 .\" FIXME: The following reference seems to be not quite right:
 973 .IR asm/perf_regs.h .
 974 .TP
 975 .IR "sample_stack_user" " (Since Linux 3.7)"
 976 This defines the size of the user stack to dump on samples.
 977 .SS Reading results
 978 Once a
 979 .BR perf_event_open ()
 980 file descriptor has been opened, the values
 981 of the events can be read from the file descriptor.
 982 The values that are there are specified by the
 983 .I read_format
 984 field in the
 985 .I attr
 986 structure at open time.
 987
 988 If you attempt to read into a buffer that is not big enough to hold the
 989 data,
 990 .B ENOSPC
 991 is returned.
 992
 993 Here is the layout of the data returned by a read:
 994 .IP * 2
 995 If
 996 .B PERF_FORMAT_GROUP
 997 was specified to allow reading all events in a group at once:
 998
 999 .in +4n
1000 .nf
1001 struct read_format {
1002     u64 nr;            /* The number of events */
1003     u64 time_enabled;  /* if PERF_FORMAT_TOTAL_TIME_ENABLED */
1004     u64 time_running;  /* if PERF_FORMAT_TOTAL_TIME_RUNNING */
1005     struct {
1006         u64 value;     /* The value of the event */
1007         u64 id;        /* if PERF_FORMAT_ID */
1008     } values[nr];
1009 };
1010 .fi
1011 .in
1012 .IP *
1013 If
1014 .B PERF_FORMAT_GROUP
1015 was
1016 .I not
1017 specified:
1018
1019 .in +4n
1020 .nf
1021 struct read_format {
1022     u64 value;         /* The value of the event */
1023     u64 time_enabled;  /* if PERF_FORMAT_TOTAL_TIME_ENABLED */
1024     u64 time_running;  /* if PERF_FORMAT_TOTAL_TIME_RUNNING */
1025     u64 id;            /* if PERF_FORMAT_ID */
1026 };
1027 .fi
1028 .in
1029 .PP
1030 The values read are as follows:
1031 .TP
1032 .I nr
1033 The number of events in this file descriptor.
1034 Only available if
1035 .B PERF_FORMAT_GROUP
1036 was specified.
1037 .TP
1038 .IR time_enabled ", " time_running
1039 Total time the event was enabled and running.
1040 Normally these are the same.
1041 If more events are started
1042 than available counter slots on the PMU, then multiplexing
1043 happens and events only run part of the time.
1044 In that case the
1045 .I time_enabled
1046 and
1047 .I time_running
1048 values can be used to scale an estimated value for the count.
1049 .TP
1050 .I value
1051 An unsigned 64-bit value containing the counter result.
1052 .TP
1053 .I id
1054 A globally unique value for this particular event, only there if
1055 .B PERF_FORMAT_ID
1056 was specified in
1057 .IR read_format .
1058 .SS MMAP layout
1059 When using
1060 .BR perf_event_open ()
1061 in sampled mode, asynchronous events
1062 (like counter overflow or
1063 .B PROT_EXEC
1064 mmap tracking)
1065 are logged into a ring-buffer.
1066 This ring-buffer is created and accessed through
1067 .BR mmap (2).
1068
1069 The mmap size should be 1+2^n pages, where the first page is a
1070 metadata page
1071 .RI ( "struct perf_event_mmap_page" )
1072 that contains various
1073 bits of information such as where the ring-buffer head is.
1074
1075 Before kernel 2.6.39, there is a bug that means you must allocate a mmap
1076 ring buffer when sampling even if you do not plan to access it.
1077
1078 The structure of the first metadata mmap page is as follows:
1079
1080 .in +4n
1081 .nf
1082 struct perf_event_mmap_page {
1083     __u32 version;          /* version number of this structure */
1084     __u32 compat_version;   /* lowest version this is compat with */
1085     __u32 lock;             /* seqlock for synchronization */
1086     __u32 index;            /* hardware counter identifier */
1087     __s64 offset;           /* add to hardware counter value */
1088     __u64 time_enabled;     /* time event active */
1089     __u64 time_running;     /* time event on CPU */
1090     union {
1091         __u64   capabilities;
1092         __u64   cap_usr_time  : 1,
1093                 cap_usr_rdpmc : 1;
1094     };
1095     __u16   pmc_width;
1096     __u16   time_shift;
1097     __u32   time_mult;
1098     __u64   time_offset;
1099     __u64   __reserved[120];   /* Pad to 1k */
1100     __u64   data_head;         /* head in the data section */
1101     __u64   data_tail;         /* user-space written tail */
1102 };
1103 .fi
1104 .in
1105
1106 The following looks at the fields in the
1107 .I perf_event_mmap_page
1108 structure in more detail:
1109 .TP
1110 .I version
1111 Version number of this structure.
1112 .TP
1113 .I compat_version
1114 The lowest version this is compatible with.
1115 .TP
1116 .I lock
1117 A seqlock for synchronization.
1118 .TP
1119 .I index
1120 A unique hardware counter identifier.
1121 .TP
1122 .I offset
1123 .\" FIXME clarify
1124 Add this to hardware counter value??
1125 .TP
1126 .I time_enabled
1127 Time the event was active.
1128 .TP
1129 .I time_running
1130 Time the event was running.
1131 .TP
1132 .I cap_usr_time
1133 User time capability
1134 .TP
1135 .I cap_usr_rdpmc
1136 If the hardware supports user-space read of performance counters
1137 without syscall (this is the "rdpmc" instruction on x86), then
1138 the following code can be used to do a read:
1139
1140 .in +4n
1141 .nf
1142 u32 seq, time_mult, time_shift, idx, width;
1143 u64 count, enabled, running;
1144 u64 cyc, time_offset;
1145 s64 pmc = 0;
1146
1147 do {
1148     seq = pc\->lock;
1149     barrier();
1150     enabled = pc\->time_enabled;
1151     running = pc\->time_running;
1152
1153     if (pc\->cap_usr_time && enabled != running) {
1154         cyc = rdtsc();
1155         time_offset = pc\->time_offset;
1156         time_mult   = pc\->time_mult;
1157         time_shift  = pc\->time_shift;
1158     }
1159
1160     idx = pc\->index;
1161     count = pc\->offset;
1162
1163     if (pc\->cap_usr_rdpmc && idx) {
1164         width = pc\->pmc_width;
1165         pmc = rdpmc(idx \- 1);
1166     }
1167
1168     barrier();
1169 } while (pc\->lock != seq);
1170 .fi
1171 .in
1172 .TP
1173 .I pmc_width
1174 If
1175 .IR cap_usr_rdpmc ,
1176 this field provides the bit-width of the value
1177 read using the rdpmc or equivalent instruction.
1178 This can be used to sign extend the result like:
1179
1180 .in +4n
1181 .nf
1182 pmc <<= 64 \- pmc_width;
1183 pmc >>= 64 \- pmc_width; // signed shift right
1184 count += pmc;
1185 .fi
1186 .in
1187 .TP
1188 .IR time_shift ", " time_mult ", " time_offset
1189
1190 If
1191 .IR cap_usr_time ,
1192 these fields can be used to compute the time
1193 delta since time_enabled (in nanoseconds) using rdtsc or similar.
1194 .nf
1195
1196     u64 quot, rem;
1197     u64 delta;
1198     quot = (cyc >> time_shift);
1199     rem = cyc & ((1 << time_shift) \- 1);
1200     delta = time_offset + quot * time_mult +
1201             ((rem * time_mult) >> time_shift);
1202 .fi
1203
1204 Where
1205 .IR time_offset ,
1206 .IR time_mult ,
1207 .IR time_shift ,
1208 and
1209 .IR cyc
1210 are read in the
1211 seqcount loop described above.
1212 This delta can then be added to
1213 enabled and possibly running (if idx), improving the scaling:
1214 .nf
1215
1216     enabled += delta;
1217     if (idx)
1218         running += delta;
1219     quot = count / running;
1220     rem  = count % running;
1221     count = quot * enabled + (rem * enabled) / running;
1222 .fi
1223 .TP
1224 .I data_head
1225 This points to the head of the data section.
1226 The value continuously increases; it does not wrap.
1227 The value needs to be manually wrapped by the size of the mmap buffer
1228 before accessing the samples.
1229
1230 On SMP-capable platforms, after reading the data_head value,
1231 user space should issue an rmb().
1232 .TP
1233 .I data_tail
1234 When the mapping is
1235 .BR PROT_WRITE ,
1236 the
1237 .I data_tail
1238 value should be written by user space to reflect the last read data.
1239 In this case the kernel will not over-write unread data.
1240 .PP
1241 The following 2^n ring-buffer pages have the layout described below.
1242
1243 If
1244 .I perf_event_attr.sample_id_all
1245 is set, then all event types will
1246 have the sample_type selected fields related to where/when (identity)
1247 an event took place (TID, TIME, ID, CPU, STREAM_ID) described in
1248 .B PERF_RECORD_SAMPLE
1249 below; these fields will be stashed just after the
1250 .I perf_event_header
1251 and the fields already present for the existing
1252 fields, i.e., at the end of the payload.
1253 That way a newer perf.data
1254 file will be supported by older perf tools, with these new optional
1255 fields being ignored.
1256
1257 The mmap values start with a header:
1258
1259 .in +4n
1260 .nf
1261 struct perf_event_header {
1262     __u32   type;
1263     __u16   misc;
1264     __u16   size;
1265 };
1266 .fi
1267 .in
1268
1269 Below, we describe the
1270 .I perf_event_header
1271 fields in more detail.
1272 .TP
1273 .I type
1274 The
1275 .I type
1276 value is one of the below.
1277 The values in the corresponding record (that follows the header)
1278 depend on the
1279 .I type
1280 selected as shown.
1281 .RS
1282 .TP 4
1283 .B PERF_RECORD_MMAP
1284 The MMAP events record the
1285 .B PROT_EXEC
1286 mappings so that we can correlate
1287 user-space IPs to code.
1288 They have the following structure:
1289
1290 .in +4n
1291 .nf
1292 struct {
1293     struct perf_event_header header;
1294     u32    pid, tid;
1295     u64    addr;
1296     u64    len;
1297     u64    pgoff;
1298     char   filename[];
1299 };
1300 .fi
1301 .in
1302 .TP
1303 .B PERF_RECORD_LOST
1304 This record indicates when events are lost.
1305
1306 .in +4n
1307 .nf
1308 struct {
1309     struct perf_event_header header;
1310     u64 id;
1311     u64 lost;
1312 };
1313 .fi
1314 .in
1315 .RS
1316 .TP
1317 .I id
1318 is the unique event ID for the samples that were lost.
1319 .TP
1320 .I lost
1321 is the number of events that were lost.
1322 .RE
1323 .TP
1324 .B PERF_RECORD_COMM
1325 This record indicates a change in the process name.
1326
1327 .in +4n
1328 .nf
1329 struct {
1330     struct perf_event_header header;
1331     u32 pid, tid;
1332     char comm[];
1333 };
1334 .fi
1335 .in
1336 .TP
1337 .B PERF_RECORD_EXIT
1338 This record indicates a process exit event.
1339
1340 .in +4n
1341 .nf
1342 struct {
1343     struct perf_event_header header;
1344     u32 pid, ppid;
1345     u32 tid, ptid;
1346     u64 time;
1347 };
1348 .fi
1349 .in
1350 .TP
1351 .BR PERF_RECORD_THROTTLE ", " PERF_RECORD_UNTHROTTLE
1352 This record indicates a throttle/unthrottle event.
1353
1354 .in +4n
1355 .nf
1356 struct {
1357     struct perf_event_header header;
1358     u64 time;
1359     u64 id;
1360     u64 stream_id;
1361 };
1362 .fi
1363 .in
1364 .TP
1365 .B PERF_RECORD_FORK
1366 This record indicates a fork event.
1367
1368 .in +4n
1369 .nf
1370 struct {
1371     struct perf_event_header header;
1372     u32 pid, ppid;
1373     u32 tid, ptid;
1374     u64 time;
1375 };
1376 .fi
1377 .in
1378 .TP
1379 .B PERF_RECORD_READ
1380 This record indicates a read event.
1381
1382 .in +4n
1383 .nf
1384 struct {
1385     struct perf_event_header header;
1386     u32 pid, tid;
1387     struct read_format values;
1388 };
1389 .fi
1390 .in
1391 .TP
1392 .B PERF_RECORD_SAMPLE
1393 This record indicates a sample.
1394
1395 .in +4n
1396 .nf
1397 struct {
1398     struct perf_event_header header;
1399     u64   ip;         /* if PERF_SAMPLE_IP */
1400     u32   pid, tid;   /* if PERF_SAMPLE_TID */
1401     u64   time;       /* if PERF_SAMPLE_TIME */
1402     u64   addr;       /* if PERF_SAMPLE_ADDR */
1403     u64   id;         /* if PERF_SAMPLE_ID */
1404     u64   stream_id;  /* if PERF_SAMPLE_STREAM_ID */
1405     u32   cpu, res;   /* if PERF_SAMPLE_CPU */
1406     u64   period;     /* if PERF_SAMPLE_PERIOD */
1407     struct read_format v; /* if PERF_SAMPLE_READ */
1408     u64   nr;         /* if PERF_SAMPLE_CALLCHAIN */
1409     u64   ips[nr];    /* if PERF_SAMPLE_CALLCHAIN */
1410     u32   size;       /* if PERF_SAMPLE_RAW */
1411     char  data[size]; /* if PERF_SAMPLE_RAW */
1412     u64   bnr;        /* if PERF_SAMPLE_BRANCH_STACK */
1413     struct perf_branch_entry lbr[bnr];
1414                       /* if PERF_SAMPLE_BRANCH_STACK */
1415     u64   abi;        /* if PERF_SAMPLE_REGS_USER */
1416     u64   regs[weight(mask)];
1417                       /* if PERF_SAMPLE_REGS_USER */
1418     u64   size;       /* if PERF_SAMPLE_STACK_USER */
1419     char  data[size]; /* if PERF_SAMPLE_STACK_USER */
1420     u64   dyn_size;   /* if PERF_SAMPLE_STACK_USER */
1421 };
1422 .fi
1423 .RS
1424 .TP
1425 .I ip
1426 If
1427 .B PERF_SAMPLE_IP
1428 is enabled, then a 64-bit instruction
1429 pointer value is included.
1430 .TP
1431 .IR pid ", " tid
1432 If
1433 .B PERF_SAMPLE_TID
1434 is enabled, then a 32-bit process ID
1435 and 32-bit thread ID are included.
1436 .TP
1437 .I time
1438 If
1439 .B PERF_SAMPLE_TIME
1440 is enabled, then a 64-bit timestamp
1441 is included.
1442 This is obtained via local_clock() which is a hardware timestamp
1443 if available and the jiffies value if not.
1444 .TP
1445 .I addr
1446 If
1447 .B PERF_SAMPLE_ADDR
1448 is enabled, then a 64-bit address is included.
1449 This is usually the address of a tracepoint,
1450 breakpoint, or software event; otherwise the value is 0.
1451 .TP
1452 .I id
1453 If
1454 .B PERF_SAMPLE_ID
1455 is enabled, a 64-bit unique ID is included.
1456 If the event is a member of an event group, the group leader ID is returned.
1457 This ID is the same as the one returned by
1458 .BR PERF_FORMAT_ID .
1459 .TP
1460 .I stream_id
1461 If
1462 .B PERF_SAMPLE_STREAM_ID
1463 is enabled, a 64-bit unique ID is included.
1464 Unlike
1465 .B PERF_SAMPLE_ID
1466 the actual ID is returned, not the group leader.
1467 This ID is the same as the one returned by
1468 .BR PERF_FORMAT_ID .
1469 .TP
1470 .IR cpu ", " res
1471 If
1472 .B PERF_SAMPLE_CPU
1473 is enabled, this is a 32-bit value indicating
1474 which CPU was being used, in addition to a reserved (unused)
1475 32-bit value.
1476 .TP
1477 .I period
1478 If
1479 .B PERF_SAMPLE_PERIOD
1480 is enabled, a 64-bit value indicating
1481 the current sampling period is written.
1482 .TP
1483 .I v
1484 If
1485 .B PERF_SAMPLE_READ
1486 is enabled, a structure of type read_format
1487 is included which has values for all events in the event group.
1488 The values included depend on the
1489 .I read_format
1490 value used at
1491 .BR perf_event_open ()
1492 time.
1493 .TP
1494 .IR nr ", " ips[nr]
1495 If
1496 .B PERF_SAMPLE_CALLCHAIN
1497 is enabled, then a 64-bit number is included
1498 which indicates how many 64-bit instruction pointers will
1499 follow.
1500 This is the current callchain.
1501 .TP
1502 .IR size ", " data[size]
1503 If
1504 .B PERF_SAMPLE_RAW
1505 is enabled, then a 32-bit value indicating size
1506 is included followed by an array of 8-bit values of length size.
1507 The values are padded with 0 to have 64-bit alignment.
1508
1509 This RAW record data is opaque with respect to the ABI.
1510 The ABI doesn't make any promises with respect to the stability
1511 of its content; it may vary depending
1512 on event, hardware, and kernel version.
1513 .TP
1514 .IR bnr ", " lbr[bnr]
1515 If
1516 .B PERF_SAMPLE_BRANCH_STACK
1517 is enabled, then a 64-bit value indicating
1518 the number of records is included, followed by
1519 .I bnr
1520 .I perf_branch_entry
1521 structures.
1522 These structures have from, to, and flags values indicating
1523 the from and to addresses from the branches on the callstack.
1524 .TP
1525 .IR abi ", " regs[weight(mask)]
1526 If
1527 .B PERF_SAMPLE_REGS_USER
1528 is enabled, then
1529 [to be documented].
1530
1531 The
1532 .I abi
1533 field is one of
1534 .BR PERF_SAMPLE_REGS_ABI_NONE ", " PERF_SAMPLE_REGS_ABI_32 " or "
1535 .BR PERF_SAMPLE_REGS_ABI_64 .
1536 .TP
1537 .IR size ", " data[size] ", " dyn_size
1538 If
1539 .B PERF_SAMPLE_STACK_USER
1540 is enabled, then
1541 [to be documented].
1542 .RE
1543 .RE
1544 .TP
1545 .I misc
1546 The
1547 .I misc
1548 field contains additional information about the sample.
1549
1550 The CPU mode can be determined from this value by masking with
1551 .B PERF_RECORD_MISC_CPUMODE_MASK
1552 and looking for one of the following (note these are not
1553 bit masks, only one can be set at a time):
1554 .RS
1555 .TP
1556 .B PERF_RECORD_MISC_CPUMODE_UNKNOWN
1557 Unknown CPU mode.
1558 .TP
1559 .B PERF_RECORD_MISC_KERNEL
1560 Sample happened in the kernel.
1561 .TP
1562 .B PERF_RECORD_MISC_USER
1563 Sample happened in user code.
1564 .TP
1565 .B PERF_RECORD_MISC_HYPERVISOR
1566 Sample happened in the hypervisor.
1567 .TP
1568 .B PERF_RECORD_MISC_GUEST_KERNEL
1569 Sample happened in the guest kernel.
1570 .TP
1571 .B PERF_RECORD_MISC_GUEST_USER
1572 Sample happened in guest user code.
1573 .RE
1574
1575 In addition, one of the following bits can be set:
1576 .RS
1577 .TP
1578 .B PERF_RECORD_MISC_EXACT_IP
1579 This indicates that the content of
1580 .B PERF_SAMPLE_IP
1581 points
1582 to the actual instruction that triggered the event.
1583 See also
1584 .IR perf_event_attr.precise_ip .
1585 .TP
1586 .B PERF_RECORD_MISC_EXT_RESERVED
1587 This indicates there is extended data available (currently not used).
1588 .RE
1589 .TP
1590 .I size
1591 This indicates the size of the record.
1592 .SS Signal overflow
1593 Events can be set to deliver a signal when a threshold is crossed.
1594 The signal handler is set up using the
1595 .BR poll (2),
1596 .BR select (2),
1597 .BR epoll (7)
1598 and
1599 .BR fcntl (2)
1600 system calls.
1601
1602 To generate signals, sampling must be enabled
1603 .RI ( sample_period
1604 must have a non-zero value).
1605
1606 There are two ways to generate signals.
1607
1608 The first is to set a
1609 .I wakeup_events
1610 or
1611 .I wakeup_watermark
1612 value that will generate a signal if a certain number of samples
1613 or bytes have been written to the mmap ring buffer.
1614 In this case a signal of type
1615 .B POLL_IN
1616 is sent.
1617
1618 The other way is by use of the
1619 .B PERF_EVENT_IOC_REFRESH
1620 ioctl.
1621 This ioctl adds to a counter that decrements each time the event overflows.
1622 When non-zero, a
1623 .B POLL_IN
1624 signal is sent on overflow, but
1625 once the value reaches 0, a signal is sent of type
1626 .B POLL_HUP
1627 and
1628 the underlying event is disabled.
1629
1630 Note: on newer kernels (definitely noticed with 3.2)
1631 .\" FIXME(Vince) : Find out when this was introduced
1632 a signal is provided for every overflow, even if
1633 .I wakeup_events
1634 is not set.
1635 .SS rdpmc instruction
1636 Starting with Linux 3.4 on x86, you can use the
1637 .I rdpmc
1638 instruction to get low-latency reads without having to enter the kernel.
1639 Note that using
1640 .I rdpmc
1641 is not necessarily faster than other methods for reading event values.
1642
1643 Support for this can be detected with the
1644 .I cap_usr_rdpmc
1645 field in the mmap page; documentation on how
1646 to calculate event values can be found in that section.
1647 .SS perf_event ioctl calls
1648 .PP
1649 Various ioctls act on
1650 .BR perf_event_open ()
1651 file descriptors:
1652 .TP
1653 .B PERF_EVENT_IOC_ENABLE
1654 Enables the individual event or event group specified by the
1655 file descriptor argument.
1656
1657 The ioctl argument is ignored.
1658 .TP
1659 .B PERF_EVENT_IOC_DISABLE
1660 Disables the individual counter or event group specified by the
1661 file descriptor argument.
1662
1663 Enabling or disabling the leader of a group enables or disables the
1664 entire group; that is, while the group leader is disabled, none of the
1665 counters in the group will count.
1666 Enabling or disabling a member of a group other than the leader only
1667 affects that counter; disabling a non-leader
1668 stops that counter from counting but doesn't affect any other counter.
1669
1670 The ioctl argument is ignored.
1671 .TP
1672 .B PERF_EVENT_IOC_REFRESH
1673 Non-inherited overflow counters can use this
1674 to enable a counter for a number of overflows specified by the argument,
1675 after which it is disabled.
1676 Subsequent calls of this ioctl add the argument value to the current
1677 count.
1678 A signal with
1679 .B POLL_IN
1680 set will happen on each overflow until the
1681 count reaches 0; when that happens a signal with
1682 .B POLL_HUP
1683 set is sent and the event is disabled.
1684 Using an argument of 0 is considered undefined behavior.
1685 .TP
1686 .B PERF_EVENT_IOC_RESET
1687 Reset the event count specified by the
1688 file descriptor argument to zero.
1689 This only resets the counts; there is no way to reset the
1690 multiplexing
1691 .I time_enabled
1692 or
1693 .I time_running
1694 values.
1695 When sent to a group leader, only
1696 the leader is reset (child events are not).
1697
1698 The ioctl argument is ignored.
1699 .TP
1700 .B PERF_EVENT_IOC_PERIOD
1701 IOC_PERIOD is the command to update the period; it
1702 does not update the current period but instead defers until the next one.
1703
1704 The argument is a pointer to a 64-bit value containing the
1705 desired new period.
1706 .TP
1707 .B PERF_EVENT_IOC_SET_OUTPUT
1708 This tells the kernel to report event notifications to the specified
1709 file descriptor rather than the default one.
1710 The file descriptors must all be on the same CPU.
1711
1712 The argument specifies the desired file descriptor, or \-1 if
1713 output should be ignored.
1714 .TP
1715 .BR PERF_EVENT_IOC_SET_FILTER " (Since Linux 2.6.33)"
1716 This adds an ftrace filter to this event.
1717
1718 The argument is a pointer to the desired ftrace filter.
1719 .SS Using prctl
1720 A process can enable or disable all the event groups that are
1721 attached to it using the
1722 .BR prctl (2)
1723 .B PR_TASK_PERF_EVENTS_ENABLE
1724 and
1725 .B PR_TASK_PERF_EVENTS_DISABLE
1726 operations.
1727 This applies to all counters on the current process, whether created by
1728 this process or by another, and does not affect any counters that this
1729 process has created on other processes.
1730 It only enables or disables
1731 the group leaders, not any other members in the groups.
1732 .SS perf_event related configuration files
1733 Files in
1734 .I /proc/sys/kernel/
1735 .RS 4
1736 .TP
1737 .I /proc/sys/kernel/perf_event_paranoid
1738
1739 The
1740 .I perf_event_paranoid
1741 file can be set to restrict access to the performance counters.
1742
1743 2 - only allow user-space measurements
1744
1745 1 - (default) allow both kernel and user measurements
1746
1747 0 - allow access to CPU-specific data but not raw tracepoint samples
1748
1749 \-1 - no restrictions
1750
1751 The existence of the
1752 .I perf_event_paranoid
1753 file is the official method for determining if a kernel supports
1754 .BR perf_event_open ().
1755 .TP
1756 .I /proc/sys/kernel/perf_event_max_sample_rate
1757
1758 This sets the maximum sample rate.
1759 Setting this too high can allow
1760 users to sample at a rate that impacts overall machine performance
1761 and potentially lock up the machine.
1762 The default value is
1763 100000 (samples per second).
1764 .TP
1765 .I /proc/sys/kernel/perf_event_mlock_kb
1766
1767 Maximum number of pages an unprivileged user can mlock(2).
1768 The default is 516 (kB).
1769 .RE
1770 Files in
1771 .I /sys/bus/event_source/devices/
1772 .RS 4
1773 Since Linux 2.6.34 the kernel supports having multiple PMUs
1774 available for monitoring.
1775 Information on how to program these PMUs can be found under
1776 .IR /sys/bus/event_source/devices/ .
1777 Each subdirectory corresponds to a different PMU.
1778 .TP
1779 .I /sys/bus/event_source/devices/*/type
1780 This contains an integer that can be used in the
1781 .I type
1782 field of perf_event_attr to indicate you wish to use this PMU.
1783 .TP
1784 .I /sys/bus/event_source/devices/*/rdpmc
1785 [To be documented]
1786 .TP
1787 .I /sys/bus/event_source/devices/*/format/
1788 This sub-directory contains information on what bits in the
1789 .I config
1790 field of perf_event_attr correspond to.
1791 .TP
1792 .I /sys/bus/event_source/devices/*/events/
1793 This sub-directory contains files with pre-defined events.
1794 The contents are strings describing the event settings
1795 expressed in terms of the fields found in the
1796 .I ./format/
1797 directory.
1798 These are not necessarily complete lists of all events supported by
1799 a PMU, but usually a subset of events deemed useful or interesting.
1800 .TP
1801 .I /sys/bus/event_source/devices/*/uevent
1802 [To be documented]
1803 .RE
1804 .SH RETURN VALUE
1805 .BR perf_event_open ()
1806 returns the new file descriptor, or \-1 if an error occurred
1807 (in which case,
1808 .I errno
1809 is set appropriately).
1810 .SH ERRORS
1811 .TP
1812 .B EINVAL
1813 Returned if the specified event is not available.
1814 .TP
1815 .B ENOSPC
1816 Prior to Linux 3.3, if there was not enough room for the event,
1817 .B ENOSPC
1818 was returned.
1819 Linus did not like this, and this was changed to
1820 .BR EINVAL .
1821 .B ENOSPC
1822 is still returned if you try to read results into
1823 too small of a buffer.
1824 .SH VERSION
1825 .BR perf_event_open ()
1826 was introduced in Linux 2.6.31 but was called
1827 .BR perf_counter_open ().
1828 It was renamed in Linux 2.6.32.
1829 .SH CONFORMING TO
1830 This
1831 .BR perf_event_open ()
1832 system call is Linux-specific
1833 and should not be used in programs intended to be portable.
1834 .SH NOTES
1835 Glibc does not provide a wrapper for this system call; call it using
1836 .BR syscall (2).
1837 See the example below.
1838
1839 The official way of knowing if
1840 .BR perf_event_open ()
1841 support is enabled is checking
1842 for the existence of the file
1843 .IR /proc/sys/kernel/perf_event_paranoid .
1844 .SH BUGS
1845 The
1846 .B F_SETOWN_EX
1847 option to
1848 .BR fcntl (2)
1849 is needed to properly get overflow signals in threads.
1850 This was introduced in Linux 2.6.32.
1851
1852 Prior to Linux 2.6.33 (at least for x86) the kernel did not check
1853 if events could be scheduled together until read time.
1854 The same happens on all known kernels if the NMI watchdog is enabled.
1855 This means to see if a given set of events works you have to
1856 .BR perf_event_open (),
1857 start, then read before you know for sure you
1858 can get valid measurements.
1859
1860 Prior to Linux 2.6.34 event constraints were not enforced by the kernel.
1861 In that case, some events would silently return "0" if the kernel
1862 scheduled them in an improper counter slot.
1863
1864 Prior to Linux 2.6.34 there was a bug when multiplexing where the
1865 wrong results could be returned.
1866
1867 Kernels from Linux 2.6.35 to Linux 2.6.39 can quickly crash the kernel if
1868 "inherit" is enabled and many threads are started.
1869
1870 Prior to Linux 2.6.35,
1871 .B PERF_FORMAT_GROUP
1872 did not work with attached processes.
1873
1874 In older Linux 2.6 versions,
1875 refreshing an event group leader refreshed all siblings,
1876 and refreshing with a parameter of 0 enabled infinite refresh.
1877 This behavior is unsupported and should not be relied on.
1878
1879 There is a bug in the kernel code between
1880 Linux 2.6.36 and Linux 3.0 that ignores the
1881 "watermark" field and acts as if a wakeup_event
1882 was chosen if the union has a
1883 non-zero value in it.
1884
1885 Always double-check your results!
1886 Various generalized events have had wrong values.
1887 For example, retired branches measured
1888 the wrong thing on AMD machines until Linux 2.6.35.
1889 .SH EXAMPLE
1890 The following is a short example that measures the total
1891 instruction count of a call to
1892 .BR printf (3).
1893 .nf
1894
1895 #include <stdlib.h>
1896 #include <stdio.h>
1897 #include <unistd.h>
1898 #include <string.h>
1899 #include <sys/ioctl.h>
1900 #include <linux/perf_event.h>
1901 #include <asm/unistd.h>
1902
1903 long
1904 perf_event_open(struct perf_event_attr *hw_event, pid_t pid,
1905                 int cpu, int group_fd, unsigned long flags)
1906 {
1907     int ret;
1908
1909     ret = syscall(__NR_perf_event_open, hw_event, pid, cpu,
1910                    group_fd, flags);
1911     return ret;
1912 }
1913
1914 int
1915 main(int argc, char **argv)
1916 {
1917     struct perf_event_attr pe;
1918     long long count;
1919     int fd;
1920
1921     memset(&pe, 0, sizeof(struct perf_event_attr));
1922     pe.type = PERF_TYPE_HARDWARE;
1923     pe.size = sizeof(struct perf_event_attr);
1924     pe.config = PERF_COUNT_HW_INSTRUCTIONS;
1925     pe.disabled = 1;
1926     pe.exclude_kernel = 1;
1927     pe.exclude_hv = 1;
1928
1929     fd = perf_event_open(&pe, 0, \-1, \-1, 0);
1930     if (fd == \-1) {
1931        fprintf(stderr, "Error opening leader %llx\\n", pe.config);
1932        exit(EXIT_FAILURE);
1933     }
1934
1935     ioctl(fd, PERF_EVENT_IOC_RESET, 0);
1936     ioctl(fd, PERF_EVENT_IOC_ENABLE, 0);
1937
1938     printf("Measuring instruction count for this printf\\n");
1939
1940     ioctl(fd, PERF_EVENT_IOC_DISABLE, 0);
1941     read(fd, &count, sizeof(long long));
1942
1943     printf("Used %lld instructions\\n", count);
1944
1945     close(fd);
1946 }
1947 .fi
1948 .SH SEE ALSO
1949 .BR fcntl (2),
1950 .BR mmap (2),
1951 .BR open (2),
1952 .BR prctl (2),
1953 .BR read (2)