/* drivers/md/dm-stats.c */

#include <linux/errno.h>
#include <linux/numa.h>
#include <linux/slab.h>
#include <linux/rculist.h>
#include <linux/threads.h>
#include <linux/preempt.h>
#include <linux/irqflags.h>
#include <linux/vmalloc.h>
#include <linux/mm.h>
#include <linux/module.h>
#include <linux/device-mapper.h>

#include "dm.h"
#include "dm-stats.h"

#define DM_MSG_PREFIX "stats"

static int dm_stat_need_rcu_barrier;

/*
 * Using 64-bit values to avoid overflow (which is a
 * problem that block/genhd.c's IO accounting has).
 */
struct dm_stat_percpu {
        unsigned long long sectors[2];
        unsigned long long ios[2];
        unsigned long long merges[2];
        unsigned long long ticks[2];
        unsigned long long io_ticks[2];
        unsigned long long io_ticks_total;
        unsigned long long time_in_queue;
        unsigned long long *histogram;
};

struct dm_stat_shared {
        atomic_t in_flight[2];
        unsigned long long stamp;
        struct dm_stat_percpu tmp;
};

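/*
 * Layout (descriptive note): stat_shared[] below is a variable-length
 * array with one entry per area, allocated together with struct dm_stat
 * itself (shared_alloc_size), while stat_percpu[cpu] points to a
 * separately allocated per-CPU array of n_entries counters
 * (percpu_alloc_size); see dm_stats_create().
 */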
struct dm_stat {
        struct list_head list_entry;
        int id;
        unsigned stat_flags;
        size_t n_entries;
        sector_t start;
        sector_t end;
        sector_t step;
        unsigned n_histogram_entries;
        unsigned long long *histogram_boundaries;
        const char *program_id;
        const char *aux_data;
        struct rcu_head rcu_head;
        size_t shared_alloc_size;
        size_t percpu_alloc_size;
        size_t histogram_alloc_size;
        struct dm_stat_percpu *stat_percpu[NR_CPUS];
        struct dm_stat_shared stat_shared[0];
};

#define STAT_PRECISE_TIMESTAMPS         1

struct dm_stats_last_position {
        sector_t last_sector;
        unsigned last_rw;
};

/*
 * A typo on the command line could possibly make the kernel run out of memory
 * and crash. To prevent the crash we account all used memory. We fail if we
 * exhaust 1/4 of all memory or 1/2 of vmalloc space.
 */
#define DM_STATS_MEMORY_FACTOR          4
#define DM_STATS_VMALLOC_FACTOR         2
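/*
 * Illustrative example: on a machine with 8 GiB of RAM, __check_shared_memory()
 * refuses any request that would push the accounted total past roughly
 * 2 GiB (1/4 of RAM), and on CONFIG_MMU systems also past half of the
 * vmalloc address range.
 */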

static DEFINE_SPINLOCK(shared_memory_lock);

static unsigned long shared_memory_amount;

static bool __check_shared_memory(size_t alloc_size)
{
        size_t a;

        a = shared_memory_amount + alloc_size;
        if (a < shared_memory_amount)
                return false;
        if (a >> PAGE_SHIFT > totalram_pages / DM_STATS_MEMORY_FACTOR)
                return false;
#ifdef CONFIG_MMU
        if (a > (VMALLOC_END - VMALLOC_START) / DM_STATS_VMALLOC_FACTOR)
                return false;
#endif
        return true;
}

static bool check_shared_memory(size_t alloc_size)
{
        bool ret;

        spin_lock_irq(&shared_memory_lock);

        ret = __check_shared_memory(alloc_size);

        spin_unlock_irq(&shared_memory_lock);

        return ret;
}

static bool claim_shared_memory(size_t alloc_size)
{
        spin_lock_irq(&shared_memory_lock);

        if (!__check_shared_memory(alloc_size)) {
                spin_unlock_irq(&shared_memory_lock);
                return false;
        }

        shared_memory_amount += alloc_size;

        spin_unlock_irq(&shared_memory_lock);

        return true;
}

static void free_shared_memory(size_t alloc_size)
{
        unsigned long flags;

        spin_lock_irqsave(&shared_memory_lock, flags);

        if (WARN_ON_ONCE(shared_memory_amount < alloc_size)) {
                spin_unlock_irqrestore(&shared_memory_lock, flags);
                DMCRIT("Memory usage accounting bug.");
                return;
        }

        shared_memory_amount -= alloc_size;

        spin_unlock_irqrestore(&shared_memory_lock, flags);
}

static void *dm_kvzalloc(size_t alloc_size, int node)
{
        void *p;

        if (!claim_shared_memory(alloc_size))
                return NULL;

        if (alloc_size <= KMALLOC_MAX_SIZE) {
                p = kzalloc_node(alloc_size, GFP_KERNEL | __GFP_NORETRY | __GFP_NOMEMALLOC | __GFP_NOWARN, node);
                if (p)
                        return p;
        }
        p = vzalloc_node(alloc_size, node);
        if (p)
                return p;

        free_shared_memory(alloc_size);

        return NULL;
}

static void dm_kvfree(void *ptr, size_t alloc_size)
{
        if (!ptr)
                return;

        free_shared_memory(alloc_size);

        kvfree(ptr);
}

static void dm_stat_free(struct rcu_head *head)
{
        int cpu;
        struct dm_stat *s = container_of(head, struct dm_stat, rcu_head);

        kfree(s->program_id);
        kfree(s->aux_data);
        kfree(s->histogram_boundaries); /* allocated with kmemdup() in dm_stats_create() */
        for_each_possible_cpu(cpu) {
                dm_kvfree(s->stat_percpu[cpu][0].histogram, s->histogram_alloc_size);
                dm_kvfree(s->stat_percpu[cpu], s->percpu_alloc_size);
        }
        dm_kvfree(s->stat_shared[0].tmp.histogram, s->histogram_alloc_size);
        dm_kvfree(s, s->shared_alloc_size);
}

static int dm_stat_in_flight(struct dm_stat_shared *shared)
{
        return atomic_read(&shared->in_flight[READ]) +
               atomic_read(&shared->in_flight[WRITE]);
}

void dm_stats_init(struct dm_stats *stats)
{
        int cpu;
        struct dm_stats_last_position *last;

        mutex_init(&stats->mutex);
        INIT_LIST_HEAD(&stats->list);
        stats->last = alloc_percpu(struct dm_stats_last_position);
        for_each_possible_cpu(cpu) {
                last = per_cpu_ptr(stats->last, cpu);
                last->last_sector = (sector_t)ULLONG_MAX;
                last->last_rw = UINT_MAX;
        }
}

void dm_stats_cleanup(struct dm_stats *stats)
{
        size_t ni;
        struct dm_stat *s;
        struct dm_stat_shared *shared;

        while (!list_empty(&stats->list)) {
                s = container_of(stats->list.next, struct dm_stat, list_entry);
                list_del(&s->list_entry);
                for (ni = 0; ni < s->n_entries; ni++) {
                        shared = &s->stat_shared[ni];
                        if (WARN_ON(dm_stat_in_flight(shared))) {
                                DMCRIT("leaked in-flight counter at index %lu "
                                       "(start %llu, end %llu, step %llu): reads %d, writes %d",
                                       (unsigned long)ni,
                                       (unsigned long long)s->start,
                                       (unsigned long long)s->end,
                                       (unsigned long long)s->step,
                                       atomic_read(&shared->in_flight[READ]),
                                       atomic_read(&shared->in_flight[WRITE]));
                        }
                }
                dm_stat_free(&s->rcu_head);
        }
        free_percpu(stats->last);
}

static int dm_stats_create(struct dm_stats *stats, sector_t start, sector_t end,
                           sector_t step, unsigned stat_flags,
                           unsigned n_histogram_entries,
                           unsigned long long *histogram_boundaries,
                           const char *program_id, const char *aux_data,
                           void (*suspend_callback)(struct mapped_device *),
                           void (*resume_callback)(struct mapped_device *),
                           struct mapped_device *md)
{
        struct list_head *l;
        struct dm_stat *s, *tmp_s;
        sector_t n_entries;
        size_t ni;
        size_t shared_alloc_size;
        size_t percpu_alloc_size;
        size_t histogram_alloc_size;
        struct dm_stat_percpu *p;
        int cpu;
        int ret_id;
        int r;

        if (end < start || !step)
                return -EINVAL;

        n_entries = end - start;
        if (dm_sector_div64(n_entries, step))
                n_entries++;

        if (n_entries != (size_t)n_entries || !(size_t)(n_entries + 1))
                return -EOVERFLOW;

        shared_alloc_size = sizeof(struct dm_stat) + (size_t)n_entries * sizeof(struct dm_stat_shared);
        if ((shared_alloc_size - sizeof(struct dm_stat)) / sizeof(struct dm_stat_shared) != n_entries)
                return -EOVERFLOW;

        percpu_alloc_size = (size_t)n_entries * sizeof(struct dm_stat_percpu);
        if (percpu_alloc_size / sizeof(struct dm_stat_percpu) != n_entries)
                return -EOVERFLOW;

        histogram_alloc_size = (n_histogram_entries + 1) * (size_t)n_entries * sizeof(unsigned long long);
        if (histogram_alloc_size / (n_histogram_entries + 1) != (size_t)n_entries * sizeof(unsigned long long))
                return -EOVERFLOW;

        if (!check_shared_memory(shared_alloc_size + histogram_alloc_size +
                                 num_possible_cpus() * (percpu_alloc_size + histogram_alloc_size)))
                return -ENOMEM;

        s = dm_kvzalloc(shared_alloc_size, NUMA_NO_NODE);
        if (!s)
                return -ENOMEM;

        s->stat_flags = stat_flags;
        s->n_entries = n_entries;
        s->start = start;
        s->end = end;
        s->step = step;
        s->shared_alloc_size = shared_alloc_size;
        s->percpu_alloc_size = percpu_alloc_size;
        s->histogram_alloc_size = histogram_alloc_size;

        s->n_histogram_entries = n_histogram_entries;
        s->histogram_boundaries = kmemdup(histogram_boundaries,
                                          s->n_histogram_entries * sizeof(unsigned long long), GFP_KERNEL);
        if (!s->histogram_boundaries) {
                r = -ENOMEM;
                goto out;
        }

        s->program_id = kstrdup(program_id, GFP_KERNEL);
        if (!s->program_id) {
                r = -ENOMEM;
                goto out;
        }
        s->aux_data = kstrdup(aux_data, GFP_KERNEL);
        if (!s->aux_data) {
                r = -ENOMEM;
                goto out;
        }

        for (ni = 0; ni < n_entries; ni++) {
                atomic_set(&s->stat_shared[ni].in_flight[READ], 0);
                atomic_set(&s->stat_shared[ni].in_flight[WRITE], 0);
        }

        if (s->n_histogram_entries) {
                unsigned long long *hi;
                hi = dm_kvzalloc(s->histogram_alloc_size, NUMA_NO_NODE);
                if (!hi) {
                        r = -ENOMEM;
                        goto out;
                }
                for (ni = 0; ni < n_entries; ni++) {
                        s->stat_shared[ni].tmp.histogram = hi;
                        hi += s->n_histogram_entries + 1;
                }
        }

        for_each_possible_cpu(cpu) {
                p = dm_kvzalloc(percpu_alloc_size, cpu_to_node(cpu));
                if (!p) {
                        r = -ENOMEM;
                        goto out;
                }
                s->stat_percpu[cpu] = p;
                if (s->n_histogram_entries) {
                        unsigned long long *hi;
                        hi = dm_kvzalloc(s->histogram_alloc_size, cpu_to_node(cpu));
                        if (!hi) {
                                r = -ENOMEM;
                                goto out;
                        }
                        for (ni = 0; ni < n_entries; ni++) {
                                p[ni].histogram = hi;
                                hi += s->n_histogram_entries + 1;
                        }
                }
        }

        /*
         * Suspend/resume to make sure there is no i/o in flight,
         * so that newly created statistics will be exact.
         *
         * (note: we couldn't suspend earlier because we must not
         * allocate memory while suspended)
         */
        suspend_callback(md);

        mutex_lock(&stats->mutex);
        s->id = 0;
        list_for_each(l, &stats->list) {
                tmp_s = container_of(l, struct dm_stat, list_entry);
                if (WARN_ON(tmp_s->id < s->id)) {
                        r = -EINVAL;
                        goto out_unlock_resume;
                }
                if (tmp_s->id > s->id)
                        break;
                if (unlikely(s->id == INT_MAX)) {
                        r = -ENFILE;
                        goto out_unlock_resume;
                }
                s->id++;
        }
        ret_id = s->id;
        list_add_tail_rcu(&s->list_entry, l);
        mutex_unlock(&stats->mutex);

        resume_callback(md);

        return ret_id;

out_unlock_resume:
        mutex_unlock(&stats->mutex);
        resume_callback(md);
out:
        dm_stat_free(&s->rcu_head);
        return r;
}

static struct dm_stat *__dm_stats_find(struct dm_stats *stats, int id)
{
        struct dm_stat *s;

        list_for_each_entry(s, &stats->list, list_entry) {
                if (s->id > id)
                        break;
                if (s->id == id)
                        return s;
        }

        return NULL;
}

static int dm_stats_delete(struct dm_stats *stats, int id)
{
        struct dm_stat *s;
        int cpu;

        mutex_lock(&stats->mutex);

        s = __dm_stats_find(stats, id);
        if (!s) {
                mutex_unlock(&stats->mutex);
                return -ENOENT;
        }

        list_del_rcu(&s->list_entry);
        mutex_unlock(&stats->mutex);

        /*
         * vfree can't be called from RCU callback
         */
        for_each_possible_cpu(cpu)
                if (is_vmalloc_addr(s->stat_percpu) ||
                    is_vmalloc_addr(s->stat_percpu[cpu][0].histogram))
                        goto do_sync_free;
        if (is_vmalloc_addr(s) ||
            is_vmalloc_addr(s->stat_shared[0].tmp.histogram)) {
do_sync_free:
                synchronize_rcu_expedited();
                dm_stat_free(&s->rcu_head);
        } else {
                ACCESS_ONCE(dm_stat_need_rcu_barrier) = 1;
                call_rcu(&s->rcu_head, dm_stat_free);
        }
        return 0;
}

static int dm_stats_list(struct dm_stats *stats, const char *program,
                         char *result, unsigned maxlen)
{
        struct dm_stat *s;
        sector_t len;
        unsigned sz = 0;

        /*
         * Output format:
         *   <region_id>: <start_sector>+<length> <step> <program_id> <aux_data>
         */
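        /*
         * For example (illustrative values), a region covering the first
         * 2097152 sectors with a 1024-sector step would be listed as:
         *   0: 0+2097152 1024 my_program my_aux
         */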

        mutex_lock(&stats->mutex);
        list_for_each_entry(s, &stats->list, list_entry) {
                if (!program || !strcmp(program, s->program_id)) {
                        len = s->end - s->start;
                        DMEMIT("%d: %llu+%llu %llu %s %s\n", s->id,
                                (unsigned long long)s->start,
                                (unsigned long long)len,
                                (unsigned long long)s->step,
                                s->program_id,
                                s->aux_data);
                }
        }
        mutex_unlock(&stats->mutex);

        return 1;
}

static void dm_stat_round(struct dm_stat *s, struct dm_stat_shared *shared,
                          struct dm_stat_percpu *p)
{
        /*
         * This is racy, but so is part_round_stats_single.
         */
        unsigned long long now, difference;
        unsigned in_flight_read, in_flight_write;

        if (likely(!(s->stat_flags & STAT_PRECISE_TIMESTAMPS)))
                now = jiffies;
        else
                now = ktime_to_ns(ktime_get());

        difference = now - shared->stamp;
        if (!difference)
                return;

        in_flight_read = (unsigned)atomic_read(&shared->in_flight[READ]);
        in_flight_write = (unsigned)atomic_read(&shared->in_flight[WRITE]);
        if (in_flight_read)
                p->io_ticks[READ] += difference;
        if (in_flight_write)
                p->io_ticks[WRITE] += difference;
        if (in_flight_read + in_flight_write) {
                p->io_ticks_total += difference;
                p->time_in_queue += (in_flight_read + in_flight_write) * difference;
        }
        shared->stamp = now;
}

static void dm_stat_for_entry(struct dm_stat *s, size_t entry,
                              unsigned long bi_rw, sector_t len,
                              struct dm_stats_aux *stats_aux, bool end,
                              unsigned long duration_jiffies)
{
        unsigned long idx = bi_rw & REQ_WRITE;
        struct dm_stat_shared *shared = &s->stat_shared[entry];
        struct dm_stat_percpu *p;

        /*
         * For strict correctness we should use local_irq_save/restore
         * instead of preempt_disable/enable.
         *
         * preempt_disable/enable is racy if the driver finishes bios
         * from non-interrupt context as well as from interrupt context
         * or from more different interrupts.
         *
         * On 64-bit architectures the race only results in not counting some
         * events, so it is acceptable.  On 32-bit architectures the race could
         * cause the counter going off by 2^32, so we need to do proper locking
         * there.
         *
         * part_stat_lock()/part_stat_unlock() have this race too.
         */
#if BITS_PER_LONG == 32
        unsigned long flags;
        local_irq_save(flags);
#else
        preempt_disable();
#endif
        p = &s->stat_percpu[smp_processor_id()][entry];

        if (!end) {
                dm_stat_round(s, shared, p);
                atomic_inc(&shared->in_flight[idx]);
        } else {
                unsigned long long duration;
                dm_stat_round(s, shared, p);
                atomic_dec(&shared->in_flight[idx]);
                p->sectors[idx] += len;
                p->ios[idx] += 1;
                p->merges[idx] += stats_aux->merged;
                if (!(s->stat_flags & STAT_PRECISE_TIMESTAMPS)) {
                        p->ticks[idx] += duration_jiffies;
                        duration = jiffies_to_msecs(duration_jiffies);
                } else {
                        p->ticks[idx] += stats_aux->duration_ns;
                        duration = stats_aux->duration_ns;
                }
                if (s->n_histogram_entries) {
                        unsigned lo = 0, hi = s->n_histogram_entries + 1;
                        while (lo + 1 < hi) {
                                unsigned mid = (lo + hi) / 2;
                                if (s->histogram_boundaries[mid - 1] > duration) {
                                        hi = mid;
                                } else {
                                        lo = mid;
                                }
                        }
                        p->histogram[lo]++;
                }
        }

#if BITS_PER_LONG == 32
        local_irq_restore(flags);
#else
        preempt_enable();
#endif
}

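/*
 * Example of how a bio is split across areas (illustrative numbers):
 * with start=0 and step=512, a bio covering sectors 900..1099 maps to
 * relative sector 900, i.e. entry 1 at offset 388; the first fragment
 * is 124 sectors (up to the end of entry 1) and the remaining 76
 * sectors are accounted to entry 2.
 */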
static void __dm_stat_bio(struct dm_stat *s, unsigned long bi_rw,
                          sector_t bi_sector, sector_t end_sector,
                          bool end, unsigned long duration_jiffies,
                          struct dm_stats_aux *stats_aux)
{
        sector_t rel_sector, offset, todo, fragment_len;
        size_t entry;

        if (end_sector <= s->start || bi_sector >= s->end)
                return;
        if (unlikely(bi_sector < s->start)) {
                rel_sector = 0;
                todo = end_sector - s->start;
        } else {
                rel_sector = bi_sector - s->start;
                todo = end_sector - bi_sector;
        }
        if (unlikely(end_sector > s->end))
                todo -= (end_sector - s->end);

        offset = dm_sector_div64(rel_sector, s->step);
        entry = rel_sector;
        do {
                if (WARN_ON_ONCE(entry >= s->n_entries)) {
                        DMCRIT("Invalid area access in region id %d", s->id);
                        return;
                }
                fragment_len = todo;
                if (fragment_len > s->step - offset)
                        fragment_len = s->step - offset;
                dm_stat_for_entry(s, entry, bi_rw, fragment_len,
                                  stats_aux, end, duration_jiffies);
                todo -= fragment_len;
                entry++;
                offset = 0;
        } while (unlikely(todo != 0));
}

void dm_stats_account_io(struct dm_stats *stats, unsigned long bi_rw,
                         sector_t bi_sector, unsigned bi_sectors, bool end,
                         unsigned long duration_jiffies,
                         struct dm_stats_aux *stats_aux)
{
        struct dm_stat *s;
        sector_t end_sector;
        struct dm_stats_last_position *last;
        bool got_precise_time;

        if (unlikely(!bi_sectors))
                return;

        end_sector = bi_sector + bi_sectors;

        if (!end) {
                /*
                 * A race condition can at worst result in the merged flag being
                 * misrepresented, so we don't have to disable preemption here.
                 */
                last = raw_cpu_ptr(stats->last);
                /* merged: starts where the previous request ended and has the same write/discard type */
                stats_aux->merged =
                        (bi_sector == ACCESS_ONCE(last->last_sector) &&
                         ((bi_rw & (REQ_WRITE | REQ_DISCARD)) ==
                          (ACCESS_ONCE(last->last_rw) & (REQ_WRITE | REQ_DISCARD))));
                ACCESS_ONCE(last->last_sector) = end_sector;
                ACCESS_ONCE(last->last_rw) = bi_rw;
        }

        rcu_read_lock();

        got_precise_time = false;
        list_for_each_entry_rcu(s, &stats->list, list_entry) {
                if (s->stat_flags & STAT_PRECISE_TIMESTAMPS && !got_precise_time) {
                        if (!end)
                                stats_aux->duration_ns = ktime_to_ns(ktime_get());
                        else
                                stats_aux->duration_ns = ktime_to_ns(ktime_get()) - stats_aux->duration_ns;
                        got_precise_time = true;
                }
                __dm_stat_bio(s, bi_rw, bi_sector, end_sector, end, duration_jiffies, stats_aux);
        }

        rcu_read_unlock();
}

static void __dm_stat_init_temporary_percpu_totals(struct dm_stat_shared *shared,
                                                   struct dm_stat *s, size_t x)
{
        int cpu;
        struct dm_stat_percpu *p;

        local_irq_disable();
        p = &s->stat_percpu[smp_processor_id()][x];
        dm_stat_round(s, shared, p);
        local_irq_enable();

        shared->tmp.sectors[READ] = 0;
        shared->tmp.sectors[WRITE] = 0;
        shared->tmp.ios[READ] = 0;
        shared->tmp.ios[WRITE] = 0;
        shared->tmp.merges[READ] = 0;
        shared->tmp.merges[WRITE] = 0;
        shared->tmp.ticks[READ] = 0;
        shared->tmp.ticks[WRITE] = 0;
        shared->tmp.io_ticks[READ] = 0;
        shared->tmp.io_ticks[WRITE] = 0;
        shared->tmp.io_ticks_total = 0;
        shared->tmp.time_in_queue = 0;

        if (s->n_histogram_entries)
                memset(shared->tmp.histogram, 0, (s->n_histogram_entries + 1) * sizeof(unsigned long long));

        for_each_possible_cpu(cpu) {
                p = &s->stat_percpu[cpu][x];
                shared->tmp.sectors[READ] += ACCESS_ONCE(p->sectors[READ]);
                shared->tmp.sectors[WRITE] += ACCESS_ONCE(p->sectors[WRITE]);
                shared->tmp.ios[READ] += ACCESS_ONCE(p->ios[READ]);
                shared->tmp.ios[WRITE] += ACCESS_ONCE(p->ios[WRITE]);
                shared->tmp.merges[READ] += ACCESS_ONCE(p->merges[READ]);
                shared->tmp.merges[WRITE] += ACCESS_ONCE(p->merges[WRITE]);
                shared->tmp.ticks[READ] += ACCESS_ONCE(p->ticks[READ]);
                shared->tmp.ticks[WRITE] += ACCESS_ONCE(p->ticks[WRITE]);
                shared->tmp.io_ticks[READ] += ACCESS_ONCE(p->io_ticks[READ]);
                shared->tmp.io_ticks[WRITE] += ACCESS_ONCE(p->io_ticks[WRITE]);
                shared->tmp.io_ticks_total += ACCESS_ONCE(p->io_ticks_total);
                shared->tmp.time_in_queue += ACCESS_ONCE(p->time_in_queue);
                if (s->n_histogram_entries) {
                        unsigned i;
                        for (i = 0; i < s->n_histogram_entries + 1; i++)
                                shared->tmp.histogram[i] += ACCESS_ONCE(p->histogram[i]);
                }
        }
}

static void __dm_stat_clear(struct dm_stat *s, size_t idx_start, size_t idx_end,
                            bool init_tmp_percpu_totals)
{
        size_t x;
        struct dm_stat_shared *shared;
        struct dm_stat_percpu *p;

        for (x = idx_start; x < idx_end; x++) {
                shared = &s->stat_shared[x];
                if (init_tmp_percpu_totals)
                        __dm_stat_init_temporary_percpu_totals(shared, s, x);
                local_irq_disable();
                p = &s->stat_percpu[smp_processor_id()][x];
                p->sectors[READ] -= shared->tmp.sectors[READ];
                p->sectors[WRITE] -= shared->tmp.sectors[WRITE];
                p->ios[READ] -= shared->tmp.ios[READ];
                p->ios[WRITE] -= shared->tmp.ios[WRITE];
                p->merges[READ] -= shared->tmp.merges[READ];
                p->merges[WRITE] -= shared->tmp.merges[WRITE];
                p->ticks[READ] -= shared->tmp.ticks[READ];
                p->ticks[WRITE] -= shared->tmp.ticks[WRITE];
                p->io_ticks[READ] -= shared->tmp.io_ticks[READ];
                p->io_ticks[WRITE] -= shared->tmp.io_ticks[WRITE];
                p->io_ticks_total -= shared->tmp.io_ticks_total;
                p->time_in_queue -= shared->tmp.time_in_queue;
                local_irq_enable();
                if (s->n_histogram_entries) {
                        unsigned i;
                        for (i = 0; i < s->n_histogram_entries + 1; i++) {
                                local_irq_disable();
                                p = &s->stat_percpu[smp_processor_id()][x];
                                p->histogram[i] -= shared->tmp.histogram[i];
                                local_irq_enable();
                        }
                }
        }
}

static int dm_stats_clear(struct dm_stats *stats, int id)
{
        struct dm_stat *s;

        mutex_lock(&stats->mutex);

        s = __dm_stats_find(stats, id);
        if (!s) {
                mutex_unlock(&stats->mutex);
                return -ENOENT;
        }

        __dm_stat_clear(s, 0, s->n_entries, true);

        mutex_unlock(&stats->mutex);

        return 1;
}

/*
 * This is like jiffies_to_msec, but works for 64-bit values.
 */
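/*
 * jiffies_to_msecs() only takes a 32-bit value, so the 64-bit tick counts
 * are converted in 22-bit chunks and recombined below. With precise
 * timestamps the counters are already in nanoseconds and are returned
 * unchanged.
 */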
static unsigned long long dm_jiffies_to_msec64(struct dm_stat *s, unsigned long long j)
{
        unsigned long long result;
        unsigned mult;

        if (s->stat_flags & STAT_PRECISE_TIMESTAMPS)
                return j;

        result = 0;
        if (j)
                result = jiffies_to_msecs(j & 0x3fffff);
        if (j >= 1 << 22) {
                mult = jiffies_to_msecs(1 << 22);
                result += (unsigned long long)mult * (unsigned long long)jiffies_to_msecs((j >> 22) & 0x3fffff);
        }
        if (j >= 1ULL << 44)
                result += (unsigned long long)mult * (unsigned long long)mult * (unsigned long long)jiffies_to_msecs(j >> 44);

        return result;
}

static int dm_stats_print(struct dm_stats *stats, int id,
                          size_t idx_start, size_t idx_len,
                          bool clear, char *result, unsigned maxlen)
{
        unsigned sz = 0;
        struct dm_stat *s;
        size_t x;
        sector_t start, end, step;
        size_t idx_end;
        struct dm_stat_shared *shared;

        /*
         * Output format:
         *   <start_sector>+<length> counters
         */
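        /*
         * The counters on each line, in order (see the DMEMIT below):
         * read ios, read merges, read sectors, read ticks, write ios,
         * write merges, write sectors, write ticks, in-flight ios,
         * io_ticks total, time in queue, read io_ticks, write io_ticks,
         * followed by the optional histogram buckets. Tick values are
         * reported in milliseconds (nanoseconds when precise_timestamps
         * is enabled).
         */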

        mutex_lock(&stats->mutex);

        s = __dm_stats_find(stats, id);
        if (!s) {
                mutex_unlock(&stats->mutex);
                return -ENOENT;
        }

        idx_end = idx_start + idx_len;
        if (idx_end < idx_start ||
            idx_end > s->n_entries)
                idx_end = s->n_entries;

        if (idx_start > idx_end)
                idx_start = idx_end;

        step = s->step;
        start = s->start + (step * idx_start);

        for (x = idx_start; x < idx_end; x++, start = end) {
                shared = &s->stat_shared[x];
                end = start + step;
                if (unlikely(end > s->end))
                        end = s->end;

                __dm_stat_init_temporary_percpu_totals(shared, s, x);

                DMEMIT("%llu+%llu %llu %llu %llu %llu %llu %llu %llu %llu %d %llu %llu %llu %llu",
                       (unsigned long long)start,
                       (unsigned long long)step,
                       shared->tmp.ios[READ],
                       shared->tmp.merges[READ],
                       shared->tmp.sectors[READ],
                       dm_jiffies_to_msec64(s, shared->tmp.ticks[READ]),
                       shared->tmp.ios[WRITE],
                       shared->tmp.merges[WRITE],
                       shared->tmp.sectors[WRITE],
                       dm_jiffies_to_msec64(s, shared->tmp.ticks[WRITE]),
                       dm_stat_in_flight(shared),
                       dm_jiffies_to_msec64(s, shared->tmp.io_ticks_total),
                       dm_jiffies_to_msec64(s, shared->tmp.time_in_queue),
                       dm_jiffies_to_msec64(s, shared->tmp.io_ticks[READ]),
                       dm_jiffies_to_msec64(s, shared->tmp.io_ticks[WRITE]));
                if (s->n_histogram_entries) {
                        unsigned i;
                        for (i = 0; i < s->n_histogram_entries + 1; i++) {
                                DMEMIT("%s%llu", !i ? " " : ":", shared->tmp.histogram[i]);
                        }
                }
                DMEMIT("\n");

                if (unlikely(sz + 1 >= maxlen))
                        goto buffer_overflow;
        }

        if (clear)
                __dm_stat_clear(s, idx_start, idx_end, false);

buffer_overflow:
        mutex_unlock(&stats->mutex);

        return 1;
}

static int dm_stats_set_aux(struct dm_stats *stats, int id, const char *aux_data)
{
        struct dm_stat *s;
        const char *new_aux_data;

        mutex_lock(&stats->mutex);

        s = __dm_stats_find(stats, id);
        if (!s) {
                mutex_unlock(&stats->mutex);
                return -ENOENT;
        }

        new_aux_data = kstrdup(aux_data, GFP_KERNEL);
        if (!new_aux_data) {
                mutex_unlock(&stats->mutex);
                return -ENOMEM;
        }

        kfree(s->aux_data);
        s->aux_data = new_aux_data;

        mutex_unlock(&stats->mutex);

        return 0;
}

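/*
 * Illustrative example: the boundary string "100,200,300" parses into
 * three strictly increasing boundaries, which define four histogram
 * buckets (<100, 100-199, 200-299, >=300, in milliseconds, or in
 * nanoseconds when precise_timestamps is used).
 */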
static int parse_histogram(const char *h, unsigned *n_histogram_entries,
                           unsigned long long **histogram_boundaries)
{
        const char *q;
        unsigned n;
        unsigned long long last;

        *n_histogram_entries = 1;
        for (q = h; *q; q++)
                if (*q == ',')
                        (*n_histogram_entries)++;

        *histogram_boundaries = kmalloc(*n_histogram_entries * sizeof(unsigned long long), GFP_KERNEL);
        if (!*histogram_boundaries)
                return -ENOMEM;

        n = 0;
        last = 0;
        while (1) {
                unsigned long long hi;
                int s;
                char ch;
                s = sscanf(h, "%llu%c", &hi, &ch);
                if (!s || (s == 2 && ch != ','))
                        return -EINVAL;
                if (hi <= last)
                        return -EINVAL;
                last = hi;
                (*histogram_boundaries)[n] = hi;
                if (s == 1)
                        return 0;
                h = strchr(h, ',') + 1;
                n++;
        }
}

static int message_stats_create(struct mapped_device *md,
                                unsigned argc, char **argv,
                                char *result, unsigned maxlen)
{
        int r;
        int id;
        char dummy;
        unsigned long long start, end, len, step;
        unsigned divisor;
        const char *program_id, *aux_data;
        unsigned stat_flags = 0;

        unsigned n_histogram_entries = 0;
        unsigned long long *histogram_boundaries = NULL;

        struct dm_arg_set as, as_backup;
        const char *a;
        unsigned feature_args;

        /*
         * Input format:
         *   <range> <step> [<extra_parameters> <parameters>] [<program_id> [<aux_data>]]
         */
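        /*
         * Illustrative examples of messages that reach this handler (sent
         * with "dmsetup message <device> 0 ..."; the names are arbitrary):
         *   @stats_create - /100 my_program my_aux
         *     - whole device, split into 100 equally sized areas
         *   @stats_create 0+1048576 4096 2 precise_timestamps histogram:100,200,300
         *     - the first 1048576 sectors, 4096-sector step, with both
         *       optional feature arguments
         */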

        if (argc < 3)
                goto ret_einval;

        as.argc = argc;
        as.argv = argv;
        dm_consume_args(&as, 1);

        a = dm_shift_arg(&as);
        if (!strcmp(a, "-")) {
                start = 0;
                len = dm_get_size(md);
                if (!len)
                        len = 1;
        } else if (sscanf(a, "%llu+%llu%c", &start, &len, &dummy) != 2 ||
                   start != (sector_t)start || len != (sector_t)len)
                goto ret_einval;

        end = start + len;
        if (start >= end)
                goto ret_einval;

        a = dm_shift_arg(&as);
        if (sscanf(a, "/%u%c", &divisor, &dummy) == 1) {
                if (!divisor)
                        return -EINVAL;
                step = end - start;
                if (do_div(step, divisor))
                        step++;
                if (!step)
                        step = 1;
        } else if (sscanf(a, "%llu%c", &step, &dummy) != 1 ||
                   step != (sector_t)step || !step)
                goto ret_einval;

        as_backup = as;
        a = dm_shift_arg(&as);
        if (a && sscanf(a, "%u%c", &feature_args, &dummy) == 1) {
                while (feature_args--) {
                        a = dm_shift_arg(&as);
                        if (!a)
                                goto ret_einval;
                        if (!strcasecmp(a, "precise_timestamps"))
                                stat_flags |= STAT_PRECISE_TIMESTAMPS;
                        else if (!strncasecmp(a, "histogram:", 10)) {
                                if (n_histogram_entries)
                                        goto ret_einval;
                                if ((r = parse_histogram(a + 10, &n_histogram_entries, &histogram_boundaries)))
                                        goto ret;
                        } else
                                goto ret_einval;
                }
        } else {
                as = as_backup;
        }

        program_id = "-";
        aux_data = "-";

        a = dm_shift_arg(&as);
        if (a)
                program_id = a;

        a = dm_shift_arg(&as);
        if (a)
                aux_data = a;

        if (as.argc)
                goto ret_einval;

        /*
         * If a buffer overflow happens after we created the region,
         * it's too late (the userspace would retry with a larger
         * buffer, but the region id that caused the overflow is already
         * leaked).  So we must detect buffer overflow in advance.
         */
        snprintf(result, maxlen, "%d", INT_MAX);
        if (dm_message_test_buffer_overflow(result, maxlen)) {
                r = 1;
                goto ret;
        }

        id = dm_stats_create(dm_get_stats(md), start, end, step, stat_flags,
                             n_histogram_entries, histogram_boundaries, program_id, aux_data,
                             dm_internal_suspend_fast, dm_internal_resume_fast, md);
        if (id < 0) {
                r = id;
                goto ret;
        }

        snprintf(result, maxlen, "%d", id);

        r = 1;
        goto ret;

ret_einval:
        r = -EINVAL;
ret:
        kfree(histogram_boundaries);
        return r;
}

static int message_stats_delete(struct mapped_device *md,
                                unsigned argc, char **argv)
{
        int id;
        char dummy;

        if (argc != 2)
                return -EINVAL;

        if (sscanf(argv[1], "%d%c", &id, &dummy) != 1 || id < 0)
                return -EINVAL;

        return dm_stats_delete(dm_get_stats(md), id);
}

static int message_stats_clear(struct mapped_device *md,
                               unsigned argc, char **argv)
{
        int id;
        char dummy;

        if (argc != 2)
                return -EINVAL;

        if (sscanf(argv[1], "%d%c", &id, &dummy) != 1 || id < 0)
                return -EINVAL;

        return dm_stats_clear(dm_get_stats(md), id);
}

static int message_stats_list(struct mapped_device *md,
                              unsigned argc, char **argv,
                              char *result, unsigned maxlen)
{
        int r;
        const char *program = NULL;

        if (argc < 1 || argc > 2)
                return -EINVAL;

        if (argc > 1) {
                program = kstrdup(argv[1], GFP_KERNEL);
                if (!program)
                        return -ENOMEM;
        }

        r = dm_stats_list(dm_get_stats(md), program, result, maxlen);

        kfree(program);

        return r;
}

static int message_stats_print(struct mapped_device *md,
                               unsigned argc, char **argv, bool clear,
                               char *result, unsigned maxlen)
{
        int id;
        char dummy;
        unsigned long idx_start = 0, idx_len = ULONG_MAX;

        if (argc != 2 && argc != 4)
                return -EINVAL;

        if (sscanf(argv[1], "%d%c", &id, &dummy) != 1 || id < 0)
                return -EINVAL;

        if (argc > 3) {
                if (strcmp(argv[2], "-") &&
                    sscanf(argv[2], "%lu%c", &idx_start, &dummy) != 1)
                        return -EINVAL;
                if (strcmp(argv[3], "-") &&
                    sscanf(argv[3], "%lu%c", &idx_len, &dummy) != 1)
                        return -EINVAL;
        }

        return dm_stats_print(dm_get_stats(md), id, idx_start, idx_len, clear,
                              result, maxlen);
}

static int message_stats_set_aux(struct mapped_device *md,
                                 unsigned argc, char **argv)
{
        int id;
        char dummy;

        if (argc != 3)
                return -EINVAL;

        if (sscanf(argv[1], "%d%c", &id, &dummy) != 1 || id < 0)
                return -EINVAL;

        return dm_stats_set_aux(dm_get_stats(md), id, argv[2]);
}

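/*
 * Typical lifecycle of a region through this interface (illustrative,
 * assuming the first region created gets id 0): @stats_create allocates
 * a region and returns its id, @stats_print (or @stats_print_clear)
 * reports the counters, @stats_clear resets them, and @stats_delete
 * removes the region:
 *   dmsetup message <device> 0 @stats_create - /1
 *   dmsetup message <device> 0 @stats_print 0
 *   dmsetup message <device> 0 @stats_delete 0
 */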
int dm_stats_message(struct mapped_device *md, unsigned argc, char **argv,
                     char *result, unsigned maxlen)
{
        int r;

        /* All messages here must start with '@' */
        if (!strcasecmp(argv[0], "@stats_create"))
                r = message_stats_create(md, argc, argv, result, maxlen);
        else if (!strcasecmp(argv[0], "@stats_delete"))
                r = message_stats_delete(md, argc, argv);
        else if (!strcasecmp(argv[0], "@stats_clear"))
                r = message_stats_clear(md, argc, argv);
        else if (!strcasecmp(argv[0], "@stats_list"))
                r = message_stats_list(md, argc, argv, result, maxlen);
        else if (!strcasecmp(argv[0], "@stats_print"))
                r = message_stats_print(md, argc, argv, false, result, maxlen);
        else if (!strcasecmp(argv[0], "@stats_print_clear"))
                r = message_stats_print(md, argc, argv, true, result, maxlen);
        else if (!strcasecmp(argv[0], "@stats_set_aux"))
                r = message_stats_set_aux(md, argc, argv);
        else
                return 2; /* this wasn't a stats message */

        if (r == -EINVAL)
                DMWARN("Invalid parameters for message %s", argv[0]);

        return r;
}

int __init dm_statistics_init(void)
{
        shared_memory_amount = 0;
        dm_stat_need_rcu_barrier = 0;
        return 0;
}

void dm_statistics_exit(void)
{
        if (dm_stat_need_rcu_barrier)
                rcu_barrier();
        if (WARN_ON(shared_memory_amount))
                DMCRIT("shared_memory_amount leaked: %lu", shared_memory_amount);
}

module_param_named(stats_current_allocated_bytes, shared_memory_amount, ulong, S_IRUGO);
MODULE_PARM_DESC(stats_current_allocated_bytes, "Memory currently used by statistics");