OSDN Git Service

tracing: Have hwlat ts be first instance and record count of instances
authorSteven Rostedt (VMware) <rostedt@goodmis.org>
Wed, 12 Feb 2020 17:21:03 +0000 (12:21 -0500)
committerSteven Rostedt (VMware) <rostedt@goodmis.org>
Tue, 3 Mar 2020 22:33:43 +0000 (17:33 -0500)
The hwlat tracer runs a loop of width time during a given window. It then
reports the max latency over a given threshold and records a timestamp. But
this timestamp is the time after the width has finished, and not the time it
actually triggered.

Record the actual time when the latency was greater than the threshold as
well as the number of times it was greater in a given width per window.

Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
Documentation/trace/ftrace.rst
kernel/trace/trace_entries.h
kernel/trace/trace_hwlat.c
kernel/trace/trace_output.c

index ff658e2..99a0890 100644 (file)
@@ -2126,6 +2126,8 @@ periodically make a CPU constantly busy with interrupts disabled.
   # cat trace
   # tracer: hwlat
   #
+  # entries-in-buffer/entries-written: 13/13   #P:8
+  #
   #                              _-----=> irqs-off
   #                             / _----=> need-resched
   #                            | / _---=> hardirq/softirq
@@ -2133,12 +2135,18 @@ periodically make a CPU constantly busy with interrupts disabled.
   #                            ||| /     delay
   #           TASK-PID   CPU#  ||||    TIMESTAMP  FUNCTION
   #              | |       |   ||||       |         |
-             <...>-3638  [001] d... 19452.055471: #1     inner/outer(us):   12/14    ts:1499801089.066141940
-             <...>-3638  [003] d... 19454.071354: #2     inner/outer(us):   11/9     ts:1499801091.082164365
-             <...>-3638  [002] dn.. 19461.126852: #3     inner/outer(us):   12/9     ts:1499801098.138150062
-             <...>-3638  [001] d... 19488.340960: #4     inner/outer(us):    8/12    ts:1499801125.354139633
-             <...>-3638  [003] d... 19494.388553: #5     inner/outer(us):    8/12    ts:1499801131.402150961
-             <...>-3638  [003] d... 19501.283419: #6     inner/outer(us):    0/12    ts:1499801138.297435289 nmi-total:4 nmi-count:1
+             <...>-1729  [001] d...   678.473449: #1     inner/outer(us):   11/12    ts:1581527483.343962693 count:6
+             <...>-1729  [004] d...   689.556542: #2     inner/outer(us):   16/9     ts:1581527494.889008092 count:1
+             <...>-1729  [005] d...   714.756290: #3     inner/outer(us):   16/16    ts:1581527519.678961629 count:5
+             <...>-1729  [001] d...   718.788247: #4     inner/outer(us):    9/17    ts:1581527523.889012713 count:1
+             <...>-1729  [002] d...   719.796341: #5     inner/outer(us):   13/9     ts:1581527524.912872606 count:1
+             <...>-1729  [006] d...   844.787091: #6     inner/outer(us):    9/12    ts:1581527649.889048502 count:2
+             <...>-1729  [003] d...   849.827033: #7     inner/outer(us):   18/9     ts:1581527654.889013793 count:1
+             <...>-1729  [007] d...   853.859002: #8     inner/outer(us):    9/12    ts:1581527658.889065736 count:1
+             <...>-1729  [001] d...   855.874978: #9     inner/outer(us):    9/11    ts:1581527660.861991877 count:1
+             <...>-1729  [001] d...   863.938932: #10    inner/outer(us):    9/11    ts:1581527668.970010500 count:1 nmi-total:7 nmi-count:1
+             <...>-1729  [007] d...   878.050780: #11    inner/outer(us):    9/12    ts:1581527683.385002600 count:1 nmi-total:5 nmi-count:1
+             <...>-1729  [007] d...   886.114702: #12    inner/outer(us):    9/12    ts:1581527691.385001600 count:1
 
 
 The above output is somewhat the same in the header. All events will have
@@ -2148,7 +2156,7 @@ interrupts disabled 'd'. Under the FUNCTION title there is:
        This is the count of events recorded that were greater than the
        tracing_threshold (See below).
 
- inner/outer(us):   12/14
+ inner/outer(us):   11/11
 
       This shows two numbers as "inner latency" and "outer latency". The test
       runs in a loop checking a timestamp twice. The latency detected within
@@ -2156,11 +2164,15 @@ interrupts disabled 'd'. Under the FUNCTION title there is:
       after the previous timestamp and the next timestamp in the loop is
       the "outer latency".
 
- ts:1499801089.066141940
+ ts:1581527483.343962693
+
+      The absolute timestamp that the first latency was recorded in the window.
+
+ count:6
 
-      The absolute timestamp that the event happened.
+      The number of times a latency was detected during the window.
 
- nmi-total:4 nmi-count:1
+ nmi-total:7 nmi-count:1
 
       On architectures that support it, if an NMI comes in during the
       test, the time spent in NMI is reported in "nmi-total" (in
index f22746f..a523da0 100644 (file)
@@ -325,14 +325,16 @@ FTRACE_ENTRY(hwlat, hwlat_entry,
                __field_desc(   long,   timestamp,      tv_nsec         )
                __field(        unsigned int,           nmi_count       )
                __field(        unsigned int,           seqnum          )
+               __field(        unsigned int,           count           )
        ),
 
-       F_printk("cnt:%u\tts:%010llu.%010lu\tinner:%llu\touter:%llu\tnmi-ts:%llu\tnmi-count:%u\n",
+       F_printk("cnt:%u\tts:%010llu.%010lu\tinner:%llu\touter:%llu\tcount:%d\tnmi-ts:%llu\tnmi-count:%u\n",
                 __entry->seqnum,
                 __entry->tv_sec,
                 __entry->tv_nsec,
                 __entry->duration,
                 __entry->outer_duration,
+                __entry->count,
                 __entry->nmi_total_ts,
                 __entry->nmi_count)
 );
index a48808c..e2be7bb 100644 (file)
@@ -83,6 +83,7 @@ struct hwlat_sample {
        u64                     nmi_total_ts;   /* Total time spent in NMIs */
        struct timespec64       timestamp;      /* wall time */
        int                     nmi_count;      /* # NMIs during this sample */
+       int                     count;          /* # of iteratons over threash */
 };
 
 /* keep the global state somewhere. */
@@ -124,6 +125,7 @@ static void trace_hwlat_sample(struct hwlat_sample *sample)
        entry->timestamp                = sample->timestamp;
        entry->nmi_total_ts             = sample->nmi_total_ts;
        entry->nmi_count                = sample->nmi_count;
+       entry->count                    = sample->count;
 
        if (!call_filter_check_discard(call, entry, buffer, event))
                trace_buffer_unlock_commit_nostack(buffer, event);
@@ -167,12 +169,14 @@ void trace_hwlat_callback(bool enter)
 static int get_sample(void)
 {
        struct trace_array *tr = hwlat_trace;
+       struct hwlat_sample s;
        time_type start, t1, t2, last_t2;
-       s64 diff, total, last_total = 0;
+       s64 diff, outer_diff, total, last_total = 0;
        u64 sample = 0;
        u64 thresh = tracing_thresh;
        u64 outer_sample = 0;
        int ret = -1;
+       unsigned int count = 0;
 
        do_div(thresh, NSEC_PER_USEC); /* modifies interval value */
 
@@ -186,6 +190,7 @@ static int get_sample(void)
 
        init_time(last_t2, 0);
        start = time_get(); /* start timestamp */
+       outer_diff = 0;
 
        do {
 
@@ -194,14 +199,14 @@ static int get_sample(void)
 
                if (time_u64(last_t2)) {
                        /* Check the delta from outer loop (t2 to next t1) */
-                       diff = time_to_us(time_sub(t1, last_t2));
+                       outer_diff = time_to_us(time_sub(t1, last_t2));
                        /* This shouldn't happen */
-                       if (diff < 0) {
+                       if (outer_diff < 0) {
                                pr_err(BANNER "time running backwards\n");
                                goto out;
                        }
-                       if (diff > outer_sample)
-                               outer_sample = diff;
+                       if (outer_diff > outer_sample)
+                               outer_sample = outer_diff;
                }
                last_t2 = t2;
 
@@ -217,6 +222,12 @@ static int get_sample(void)
                /* This checks the inner loop (t1 to t2) */
                diff = time_to_us(time_sub(t2, t1));     /* current diff */
 
+               if (diff > thresh || outer_diff > thresh) {
+                       if (!count)
+                               ktime_get_real_ts64(&s.timestamp);
+                       count++;
+               }
+
                /* This shouldn't happen */
                if (diff < 0) {
                        pr_err(BANNER "time running backwards\n");
@@ -236,7 +247,6 @@ static int get_sample(void)
 
        /* If we exceed the threshold value, we have found a hardware latency */
        if (sample > thresh || outer_sample > thresh) {
-               struct hwlat_sample s;
                u64 latency;
 
                ret = 1;
@@ -249,9 +259,9 @@ static int get_sample(void)
                s.seqnum = hwlat_data.count;
                s.duration = sample;
                s.outer_duration = outer_sample;
-               ktime_get_real_ts64(&s.timestamp);
                s.nmi_total_ts = nmi_total_ts;
                s.nmi_count = nmi_count;
+               s.count = count;
                trace_hwlat_sample(&s);
 
                latency = max(sample, outer_sample);
index b490908..e25a7da 100644 (file)
@@ -1158,12 +1158,12 @@ trace_hwlat_print(struct trace_iterator *iter, int flags,
 
        trace_assign_type(field, entry);
 
-       trace_seq_printf(s, "#%-5u inner/outer(us): %4llu/%-5llu ts:%lld.%09ld",
+       trace_seq_printf(s, "#%-5u inner/outer(us): %4llu/%-5llu ts:%lld.%09ld count:%d",
                         field->seqnum,
                         field->duration,
                         field->outer_duration,
                         (long long)field->timestamp.tv_sec,
-                        field->timestamp.tv_nsec);
+                        field->timestamp.tv_nsec, field->count);
 
        if (field->nmi_count) {
                /*