63b6ce78492aada84756b417293444c2c7b2b454
[uclinux-h8/linux.git] drivers/iommu/intel-iommu.c
1 /*
2  * Copyright © 2006-2014 Intel Corporation.
3  *
4  * This program is free software; you can redistribute it and/or modify it
5  * under the terms and conditions of the GNU General Public License,
6  * version 2, as published by the Free Software Foundation.
7  *
8  * This program is distributed in the hope it will be useful, but WITHOUT
9  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
10  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
11  * more details.
12  *
13  * Authors: David Woodhouse <dwmw2@infradead.org>,
14  *          Ashok Raj <ashok.raj@intel.com>,
15  *          Shaohua Li <shaohua.li@intel.com>,
16  *          Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>,
17  *          Fenghua Yu <fenghua.yu@intel.com>
18  *          Joerg Roedel <jroedel@suse.de>
19  */
20
21 #define pr_fmt(fmt)     "DMAR: " fmt
22
23 #include <linux/init.h>
24 #include <linux/bitmap.h>
25 #include <linux/debugfs.h>
26 #include <linux/export.h>
27 #include <linux/slab.h>
28 #include <linux/irq.h>
29 #include <linux/interrupt.h>
30 #include <linux/spinlock.h>
31 #include <linux/pci.h>
32 #include <linux/dmar.h>
33 #include <linux/dma-mapping.h>
34 #include <linux/mempool.h>
35 #include <linux/memory.h>
36 #include <linux/cpu.h>
37 #include <linux/timer.h>
38 #include <linux/io.h>
39 #include <linux/iova.h>
40 #include <linux/iommu.h>
41 #include <linux/intel-iommu.h>
42 #include <linux/syscore_ops.h>
43 #include <linux/tboot.h>
44 #include <linux/dmi.h>
45 #include <linux/pci-ats.h>
46 #include <linux/memblock.h>
47 #include <linux/dma-contiguous.h>
48 #include <linux/dma-direct.h>
49 #include <linux/crash_dump.h>
50 #include <asm/irq_remapping.h>
51 #include <asm/cacheflush.h>
52 #include <asm/iommu.h>
53
54 #include "irq_remapping.h"
55 #include "intel-pasid.h"
56
57 #define ROOT_SIZE               VTD_PAGE_SIZE
58 #define CONTEXT_SIZE            VTD_PAGE_SIZE
59
60 #define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
61 #define IS_USB_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_SERIAL_USB)
62 #define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
63 #define IS_AZALIA(pdev) ((pdev)->vendor == 0x8086 && (pdev)->device == 0x3a3e)
64
65 #define IOAPIC_RANGE_START      (0xfee00000)
66 #define IOAPIC_RANGE_END        (0xfeefffff)
67 #define IOVA_START_ADDR         (0x1000)
68
69 #define DEFAULT_DOMAIN_ADDRESS_WIDTH 57
70
71 #define MAX_AGAW_WIDTH 64
72 #define MAX_AGAW_PFN_WIDTH      (MAX_AGAW_WIDTH - VTD_PAGE_SHIFT)
73
74 #define __DOMAIN_MAX_PFN(gaw)  ((((uint64_t)1) << (gaw-VTD_PAGE_SHIFT)) - 1)
75 #define __DOMAIN_MAX_ADDR(gaw) ((((uint64_t)1) << gaw) - 1)
76
77 /* We limit DOMAIN_MAX_PFN to fit in an unsigned long, and DOMAIN_MAX_ADDR
78    to match. That way, we can use 'unsigned long' for PFNs with impunity. */
79 #define DOMAIN_MAX_PFN(gaw)     ((unsigned long) min_t(uint64_t, \
80                                 __DOMAIN_MAX_PFN(gaw), (unsigned long)-1))
81 #define DOMAIN_MAX_ADDR(gaw)    (((uint64_t)__DOMAIN_MAX_PFN(gaw)) << VTD_PAGE_SHIFT)
82
83 /* IO virtual address start page frame number */
84 #define IOVA_START_PFN          (1)
85
86 #define IOVA_PFN(addr)          ((addr) >> PAGE_SHIFT)
87
88 /* page table handling */
89 #define LEVEL_STRIDE            (9)
90 #define LEVEL_MASK              (((u64)1 << LEVEL_STRIDE) - 1)
91
92 /*
93  * This bitmap is used to advertise the page sizes our hardware supports
94  * to the IOMMU core, which will then use this information to split
95  * physically contiguous memory regions it is mapping into page sizes
96  * that we support.
97  *
98  * Traditionally the IOMMU core just handed us the mappings directly,
99  * after making sure the size is an order of a 4KiB page and that the
100  * mapping has natural alignment.
101  *
102  * To retain this behavior, we currently advertise that we support
103  * all page sizes that are an order of 4KiB.
104  *
105  * If at some point we'd like to utilize the IOMMU core's new behavior,
106  * we could change this to advertise the real page sizes we support.
107  */
108 #define INTEL_IOMMU_PGSIZES     (~0xFFFUL)
109
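/*
 * Helpers for converting between the adjusted guest address width (AGAW)
 * encoding used by the hardware, the address width in bits, and the number
 * of page-table levels. A worked example: width_to_agaw(48) =
 * DIV_ROUND_UP(48 - 30, 9) = 2, agaw_to_level(2) = 4 (a 4-level table),
 * and agaw_to_width(2) = 30 + 2 * 9 = 48 bits.
 */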
110 static inline int agaw_to_level(int agaw)
111 {
112         return agaw + 2;
113 }
114
115 static inline int agaw_to_width(int agaw)
116 {
117         return min_t(int, 30 + agaw * LEVEL_STRIDE, MAX_AGAW_WIDTH);
118 }
119
120 static inline int width_to_agaw(int width)
121 {
122         return DIV_ROUND_UP(width - 30, LEVEL_STRIDE);
123 }
124
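/*
 * Each page-table level decodes LEVEL_STRIDE (9) bits of the DMA pfn:
 * a level 1 entry maps a 4KiB page, a level 2 entry covers 2^9 pages
 * (2MiB), a level 3 entry covers 2^18 pages (1GiB), and so on.
 * pfn_level_offset() extracts the 9-bit table index for a given level.
 */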
125 static inline unsigned int level_to_offset_bits(int level)
126 {
127         return (level - 1) * LEVEL_STRIDE;
128 }
129
130 static inline int pfn_level_offset(unsigned long pfn, int level)
131 {
132         return (pfn >> level_to_offset_bits(level)) & LEVEL_MASK;
133 }
134
135 static inline unsigned long level_mask(int level)
136 {
137         return -1UL << level_to_offset_bits(level);
138 }
139
140 static inline unsigned long level_size(int level)
141 {
142         return 1UL << level_to_offset_bits(level);
143 }
144
145 static inline unsigned long align_to_level(unsigned long pfn, int level)
146 {
147         return (pfn + level_size(level) - 1) & level_mask(level);
148 }
149
150 static inline unsigned long lvl_to_nr_pages(unsigned int lvl)
151 {
152         return  1 << min_t(int, (lvl - 1) * LEVEL_STRIDE, MAX_AGAW_PFN_WIDTH);
153 }
154
155 /* VT-d pages must never be larger than MM pages. Otherwise things
156    are never going to work. */
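/* On x86 with 4KiB pages, PAGE_SHIFT == VTD_PAGE_SHIFT == 12, so the
   conversions below are no-ops; the shifts only matter when MM pages are
   larger than VT-d pages. */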
157 static inline unsigned long dma_to_mm_pfn(unsigned long dma_pfn)
158 {
159         return dma_pfn >> (PAGE_SHIFT - VTD_PAGE_SHIFT);
160 }
161
162 static inline unsigned long mm_to_dma_pfn(unsigned long mm_pfn)
163 {
164         return mm_pfn << (PAGE_SHIFT - VTD_PAGE_SHIFT);
165 }
166 static inline unsigned long page_to_dma_pfn(struct page *pg)
167 {
168         return mm_to_dma_pfn(page_to_pfn(pg));
169 }
170 static inline unsigned long virt_to_dma_pfn(void *p)
171 {
172         return page_to_dma_pfn(virt_to_page(p));
173 }
174
175 /* global iommu list, set NULL for ignored DMAR units */
176 static struct intel_iommu **g_iommus;
177
178 static void __init check_tylersburg_isoch(void);
179 static int rwbf_quirk;
180
181 /*
182  * Set to 1 to panic the kernel if VT-d can't be successfully enabled
183  * (used when the kernel is launched with TXT).
184  */
185 static int force_on = 0;
186 int intel_iommu_tboot_noforce;
187 static int no_platform_optin;
188
189 #define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))
190
191 /*
192  * Take a root_entry and return the Lower Context Table Pointer (LCTP)
193  * if marked present.
194  */
195 static phys_addr_t root_entry_lctp(struct root_entry *re)
196 {
197         if (!(re->lo & 1))
198                 return 0;
199
200         return re->lo & VTD_PAGE_MASK;
201 }
202
203 /*
204  * Take a root_entry and return the Upper Context Table Pointer (UCTP)
205  * if marked present.
206  */
207 static phys_addr_t root_entry_uctp(struct root_entry *re)
208 {
209         if (!(re->hi & 1))
210                 return 0;
211
212         return re->hi & VTD_PAGE_MASK;
213 }
214
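/*
 * Bit layout of a context entry, as manipulated by the helpers below:
 * lo bit 0: present; lo bit 1: fault processing disable (cleared by
 * context_set_fault_enable()); lo bits 2-3: translation type; lo bit 11:
 * PASID enable (extended entries); lo bits 12-63: address root.
 * hi bits 0-2: address width; hi bit 3: "copied" software flag;
 * hi bits 8-23: domain id.
 */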
215 static inline void context_clear_pasid_enable(struct context_entry *context)
216 {
217         context->lo &= ~(1ULL << 11);
218 }
219
220 static inline bool context_pasid_enabled(struct context_entry *context)
221 {
222         return !!(context->lo & (1ULL << 11));
223 }
224
225 static inline void context_set_copied(struct context_entry *context)
226 {
227         context->hi |= (1ull << 3);
228 }
229
230 static inline bool context_copied(struct context_entry *context)
231 {
232         return !!(context->hi & (1ULL << 3));
233 }
234
235 static inline bool __context_present(struct context_entry *context)
236 {
237         return (context->lo & 1);
238 }
239
240 bool context_present(struct context_entry *context)
241 {
242         return context_pasid_enabled(context) ?
243              __context_present(context) :
244              __context_present(context) && !context_copied(context);
245 }
246
247 static inline void context_set_present(struct context_entry *context)
248 {
249         context->lo |= 1;
250 }
251
252 static inline void context_set_fault_enable(struct context_entry *context)
253 {
254         context->lo &= (((u64)-1) << 2) | 1;
255 }
256
257 static inline void context_set_translation_type(struct context_entry *context,
258                                                 unsigned long value)
259 {
260         context->lo &= (((u64)-1) << 4) | 3;
261         context->lo |= (value & 3) << 2;
262 }
263
264 static inline void context_set_address_root(struct context_entry *context,
265                                             unsigned long value)
266 {
267         context->lo &= ~VTD_PAGE_MASK;
268         context->lo |= value & VTD_PAGE_MASK;
269 }
270
271 static inline void context_set_address_width(struct context_entry *context,
272                                              unsigned long value)
273 {
274         context->hi |= value & 7;
275 }
276
277 static inline void context_set_domain_id(struct context_entry *context,
278                                          unsigned long value)
279 {
280         context->hi |= (value & ((1 << 16) - 1)) << 8;
281 }
282
283 static inline int context_domain_id(struct context_entry *c)
284 {
285         return((c->hi >> 8) & 0xffff);
286 }
287
288 static inline void context_clear_entry(struct context_entry *context)
289 {
290         context->lo = 0;
291         context->hi = 0;
292 }
293
294 /*
295  * 0: readable
296  * 1: writable
297  * 2-6: reserved
298  * 7: super page
299  * 8-10: available
300  * 11: snoop behavior
301  * 12-63: Host physical address
302  */
303 struct dma_pte {
304         u64 val;
305 };
306
307 static inline void dma_clear_pte(struct dma_pte *pte)
308 {
309         pte->val = 0;
310 }
311
312 static inline u64 dma_pte_addr(struct dma_pte *pte)
313 {
314 #ifdef CONFIG_64BIT
315         return pte->val & VTD_PAGE_MASK;
316 #else
317         /* Must have a full atomic 64-bit read */
318         return  __cmpxchg64(&pte->val, 0ULL, 0ULL) & VTD_PAGE_MASK;
319 #endif
320 }
321
322 static inline bool dma_pte_present(struct dma_pte *pte)
323 {
324         return (pte->val & 3) != 0;
325 }
326
327 static inline bool dma_pte_superpage(struct dma_pte *pte)
328 {
329         return (pte->val & DMA_PTE_LARGE_PAGE);
330 }
331
332 static inline int first_pte_in_page(struct dma_pte *pte)
333 {
334         return !((unsigned long)pte & ~VTD_PAGE_MASK);
335 }
336
337 /*
338  * This domain is a static identity mapping domain.
339  *      1. This domain creates a static 1:1 mapping to all usable memory.
340  *      2. It maps to each iommu if successful.
341  *      3. Each iommu maps to this domain if successful.
342  */
343 static struct dmar_domain *si_domain;
344 static int hw_pass_through = 1;
345
346 /*
347  * Domain represents a virtual machine; more than one device
348  * across iommus may be owned by one domain, e.g. a KVM guest.
349  */
350 #define DOMAIN_FLAG_VIRTUAL_MACHINE     (1 << 0)
351
352 /* si_domain contains multiple devices */
353 #define DOMAIN_FLAG_STATIC_IDENTITY     (1 << 1)
354
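/* Iterate over the indexes into g_iommus[] of the iommus that hold a
   reference on @domain. */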
355 #define for_each_domain_iommu(idx, domain)                      \
356         for (idx = 0; idx < g_num_of_iommus; idx++)             \
357                 if (domain->iommu_refcnt[idx])
358
359 struct dmar_rmrr_unit {
360         struct list_head list;          /* list of rmrr units   */
361         struct acpi_dmar_header *hdr;   /* ACPI header          */
362         u64     base_address;           /* reserved base address*/
363         u64     end_address;            /* reserved end address */
364         struct dmar_dev_scope *devices; /* target devices */
365         int     devices_cnt;            /* target device count */
366         struct iommu_resv_region *resv; /* reserved region handle */
367 };
368
369 struct dmar_atsr_unit {
370         struct list_head list;          /* list of ATSR units */
371         struct acpi_dmar_header *hdr;   /* ACPI header */
372         struct dmar_dev_scope *devices; /* target devices */
373         int devices_cnt;                /* target device count */
374         u8 include_all:1;               /* include all ports */
375 };
376
377 static LIST_HEAD(dmar_atsr_units);
378 static LIST_HEAD(dmar_rmrr_units);
379
380 #define for_each_rmrr_units(rmrr) \
381         list_for_each_entry(rmrr, &dmar_rmrr_units, list)
382
383 /* number of registered intel_iommus; bounds the g_iommus[] array */
384 static int g_num_of_iommus;
385
386 static void domain_exit(struct dmar_domain *domain);
387 static void domain_remove_dev_info(struct dmar_domain *domain);
388 static void dmar_remove_one_dev_info(struct dmar_domain *domain,
389                                      struct device *dev);
390 static void __dmar_remove_one_dev_info(struct device_domain_info *info);
391 static void domain_context_clear(struct intel_iommu *iommu,
392                                  struct device *dev);
393 static int domain_detach_iommu(struct dmar_domain *domain,
394                                struct intel_iommu *iommu);
395
396 #ifdef CONFIG_INTEL_IOMMU_DEFAULT_ON
397 int dmar_disabled = 0;
398 #else
399 int dmar_disabled = 1;
400 #endif /*CONFIG_INTEL_IOMMU_DEFAULT_ON*/
401
402 int intel_iommu_enabled = 0;
403 EXPORT_SYMBOL_GPL(intel_iommu_enabled);
404
405 static int dmar_map_gfx = 1;
406 static int dmar_forcedac;
407 static int intel_iommu_strict;
408 static int intel_iommu_superpage = 1;
409 static int intel_iommu_ecs = 1;
410 static int intel_iommu_pasid28;
411 static int iommu_identity_mapping;
412
413 #define IDENTMAP_ALL            1
414 #define IDENTMAP_GFX            2
415 #define IDENTMAP_AZALIA         4
416
417 /* Broadwell and Skylake have broken ECS support — normal so-called "second
418  * level" translation of DMA requests-without-PASID doesn't actually happen
419  * unless you also set the NESTE bit in an extended context-entry. Which of
420  * course means that SVM doesn't work because it's trying to do nested
421  * translation of the physical addresses it finds in the process page tables,
422  * through the IOVA->phys mapping found in the "second level" page tables.
423  *
424  * The VT-d specification was retroactively changed to redefine the
425  * capability bits and pretend that Broadwell/Skylake never happened...
426  * but unfortunately the wrong bit was changed. It's ECS which is broken, but
427  * for some reason it was the PASID capability bit which was redefined (from
428  * bit 28 on BDW/SKL to bit 40 in future).
429  *
430  * So our test for ECS needs to eschew those implementations which set the old
431  * PASID capability bit 28, since those are the ones on which ECS is broken.
432  * Unless we are working around the 'pasid28' limitations, that is, by putting
433  * the device into passthrough mode for normal DMA and thus masking the bug.
434  */
435 #define ecs_enabled(iommu) (intel_iommu_ecs && ecap_ecs(iommu->ecap) && \
436                             (intel_iommu_pasid28 || !ecap_broken_pasid(iommu->ecap)))
437 /* PASID support is thus enabled if ECS is enabled and *either* of the old
438  * or new capability bits are set. */
439 #define pasid_enabled(iommu) (ecs_enabled(iommu) &&                     \
440                               (ecap_pasid(iommu->ecap) || ecap_broken_pasid(iommu->ecap)))
441
442 int intel_iommu_gfx_mapped;
443 EXPORT_SYMBOL_GPL(intel_iommu_gfx_mapped);
444
445 #define DUMMY_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-1))
446 static DEFINE_SPINLOCK(device_domain_lock);
447 static LIST_HEAD(device_domain_list);
448
449 /*
450  * Iterate over elements in device_domain_list and call the specified
451  * callback @fn against each element. This helper should only be used
452  * in a context where the device_domain_lock is already held.
453  */
454 int for_each_device_domain(int (*fn)(struct device_domain_info *info,
455                                      void *data), void *data)
456 {
457         int ret = 0;
458         struct device_domain_info *info;
459
460         assert_spin_locked(&device_domain_lock);
461         list_for_each_entry(info, &device_domain_list, global) {
462                 ret = fn(info, data);
463                 if (ret)
464                         return ret;
465         }
466
467         return 0;
468 }
469
470 const struct iommu_ops intel_iommu_ops;
471
472 static bool translation_pre_enabled(struct intel_iommu *iommu)
473 {
474         return (iommu->flags & VTD_FLAG_TRANS_PRE_ENABLED);
475 }
476
477 static void clear_translation_pre_enabled(struct intel_iommu *iommu)
478 {
479         iommu->flags &= ~VTD_FLAG_TRANS_PRE_ENABLED;
480 }
481
482 static void init_translation_status(struct intel_iommu *iommu)
483 {
484         u32 gsts;
485
486         gsts = readl(iommu->reg + DMAR_GSTS_REG);
487         if (gsts & DMA_GSTS_TES)
488                 iommu->flags |= VTD_FLAG_TRANS_PRE_ENABLED;
489 }
490
491 /* Convert a generic 'struct iommu_domain' to the private 'struct dmar_domain' */
492 static struct dmar_domain *to_dmar_domain(struct iommu_domain *dom)
493 {
494         return container_of(dom, struct dmar_domain, domain);
495 }
496
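/*
 * Parse the "intel_iommu=" kernel command line parameter. Options are
 * comma separated, e.g. "intel_iommu=on,strict,sp_off".
 */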
497 static int __init intel_iommu_setup(char *str)
498 {
499         if (!str)
500                 return -EINVAL;
501         while (*str) {
502                 if (!strncmp(str, "on", 2)) {
503                         dmar_disabled = 0;
504                         pr_info("IOMMU enabled\n");
505                 } else if (!strncmp(str, "off", 3)) {
506                         dmar_disabled = 1;
507                         no_platform_optin = 1;
508                         pr_info("IOMMU disabled\n");
509                 } else if (!strncmp(str, "igfx_off", 8)) {
510                         dmar_map_gfx = 0;
511                         pr_info("Disable GFX device mapping\n");
512                 } else if (!strncmp(str, "forcedac", 8)) {
513                         pr_info("Forcing DAC for PCI devices\n");
514                         dmar_forcedac = 1;
515                 } else if (!strncmp(str, "strict", 6)) {
516                         pr_info("Disable batched IOTLB flush\n");
517                         intel_iommu_strict = 1;
518                 } else if (!strncmp(str, "sp_off", 6)) {
519                         pr_info("Disable supported super page\n");
520                         intel_iommu_superpage = 0;
521                 } else if (!strncmp(str, "ecs_off", 7)) {
522                         printk(KERN_INFO
523                                 "Intel-IOMMU: disable extended context table support\n");
524                         intel_iommu_ecs = 0;
525                 } else if (!strncmp(str, "pasid28", 7)) {
526                         printk(KERN_INFO
527                                 "Intel-IOMMU: enable pre-production PASID support\n");
528                         intel_iommu_pasid28 = 1;
529                         iommu_identity_mapping |= IDENTMAP_GFX;
530                 } else if (!strncmp(str, "tboot_noforce", 13)) {
531                         printk(KERN_INFO
532                                 "Intel-IOMMU: not forcing on after tboot. This could expose security risk for tboot\n");
533                         intel_iommu_tboot_noforce = 1;
534                 }
535
536                 str += strcspn(str, ",");
537                 while (*str == ',')
538                         str++;
539         }
540         return 0;
541 }
542 __setup("intel_iommu=", intel_iommu_setup);
543
544 static struct kmem_cache *iommu_domain_cache;
545 static struct kmem_cache *iommu_devinfo_cache;
546
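/*
 * Per-iommu domain-id to domain lookup. iommu->domains is a two-level
 * table: the top level is indexed by (did >> 8) and each second-level
 * page holds 256 dmar_domain pointers indexed by (did & 0xff); the
 * second-level pages are allocated lazily in set_iommu_domain().
 */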
547 static struct dmar_domain* get_iommu_domain(struct intel_iommu *iommu, u16 did)
548 {
549         struct dmar_domain **domains;
550         int idx = did >> 8;
551
552         domains = iommu->domains[idx];
553         if (!domains)
554                 return NULL;
555
556         return domains[did & 0xff];
557 }
558
559 static void set_iommu_domain(struct intel_iommu *iommu, u16 did,
560                              struct dmar_domain *domain)
561 {
562         struct dmar_domain **domains;
563         int idx = did >> 8;
564
565         if (!iommu->domains[idx]) {
566                 size_t size = 256 * sizeof(struct dmar_domain *);
567                 iommu->domains[idx] = kzalloc(size, GFP_ATOMIC);
568         }
569
570         domains = iommu->domains[idx];
571         if (WARN_ON(!domains))
572                 return;
573         else
574                 domains[did & 0xff] = domain;
575 }
576
577 void *alloc_pgtable_page(int node)
578 {
579         struct page *page;
580         void *vaddr = NULL;
581
582         page = alloc_pages_node(node, GFP_ATOMIC | __GFP_ZERO, 0);
583         if (page)
584                 vaddr = page_address(page);
585         return vaddr;
586 }
587
588 void free_pgtable_page(void *vaddr)
589 {
590         free_page((unsigned long)vaddr);
591 }
592
593 static inline void *alloc_domain_mem(void)
594 {
595         return kmem_cache_alloc(iommu_domain_cache, GFP_ATOMIC);
596 }
597
598 static void free_domain_mem(void *vaddr)
599 {
600         kmem_cache_free(iommu_domain_cache, vaddr);
601 }
602
603 static inline void * alloc_devinfo_mem(void)
604 {
605         return kmem_cache_alloc(iommu_devinfo_cache, GFP_ATOMIC);
606 }
607
608 static inline void free_devinfo_mem(void *vaddr)
609 {
610         kmem_cache_free(iommu_devinfo_cache, vaddr);
611 }
612
613 static inline int domain_type_is_vm(struct dmar_domain *domain)
614 {
615         return domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE;
616 }
617
618 static inline int domain_type_is_si(struct dmar_domain *domain)
619 {
620         return domain->flags & DOMAIN_FLAG_STATIC_IDENTITY;
621 }
622
623 static inline int domain_type_is_vm_or_si(struct dmar_domain *domain)
624 {
625         return domain->flags & (DOMAIN_FLAG_VIRTUAL_MACHINE |
626                                 DOMAIN_FLAG_STATIC_IDENTITY);
627 }
628
629 static inline int domain_pfn_supported(struct dmar_domain *domain,
630                                        unsigned long pfn)
631 {
632         int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
633
634         return !(addr_width < BITS_PER_LONG && pfn >> addr_width);
635 }
636
637 static int __iommu_calculate_agaw(struct intel_iommu *iommu, int max_gaw)
638 {
639         unsigned long sagaw;
640         int agaw = -1;
641
642         sagaw = cap_sagaw(iommu->cap);
643         for (agaw = width_to_agaw(max_gaw);
644              agaw >= 0; agaw--) {
645                 if (test_bit(agaw, &sagaw))
646                         break;
647         }
648
649         return agaw;
650 }
651
652 /*
653  * Calculate max SAGAW for each iommu.
654  */
655 int iommu_calculate_max_sagaw(struct intel_iommu *iommu)
656 {
657         return __iommu_calculate_agaw(iommu, MAX_AGAW_WIDTH);
658 }
659
660 /*
661  * Calculate the agaw for each iommu.
662  * "SAGAW" may differ across iommus, so use a default agaw and fall back
663  * to a smaller supported agaw for iommus that don't support the default.
664  */
665 int iommu_calculate_agaw(struct intel_iommu *iommu)
666 {
667         return __iommu_calculate_agaw(iommu, DEFAULT_DOMAIN_ADDRESS_WIDTH);
668 }
669
670 /* This function only returns a single iommu in a domain */
671 struct intel_iommu *domain_get_iommu(struct dmar_domain *domain)
672 {
673         int iommu_id;
674
675         /* si_domain and vm domain should not get here. */
676         BUG_ON(domain_type_is_vm_or_si(domain));
677         for_each_domain_iommu(iommu_id, domain)
678                 break;
679
680         if (iommu_id < 0 || iommu_id >= g_num_of_iommus)
681                 return NULL;
682
683         return g_iommus[iommu_id];
684 }
685
686 static void domain_update_iommu_coherency(struct dmar_domain *domain)
687 {
688         struct dmar_drhd_unit *drhd;
689         struct intel_iommu *iommu;
690         bool found = false;
691         int i;
692
693         domain->iommu_coherency = 1;
694
695         for_each_domain_iommu(i, domain) {
696                 found = true;
697                 if (!ecap_coherent(g_iommus[i]->ecap)) {
698                         domain->iommu_coherency = 0;
699                         break;
700                 }
701         }
702         if (found)
703                 return;
704
705         /* No hardware attached; use lowest common denominator */
706         rcu_read_lock();
707         for_each_active_iommu(iommu, drhd) {
708                 if (!ecap_coherent(iommu->ecap)) {
709                         domain->iommu_coherency = 0;
710                         break;
711                 }
712         }
713         rcu_read_unlock();
714 }
715
716 static int domain_update_iommu_snooping(struct intel_iommu *skip)
717 {
718         struct dmar_drhd_unit *drhd;
719         struct intel_iommu *iommu;
720         int ret = 1;
721
722         rcu_read_lock();
723         for_each_active_iommu(iommu, drhd) {
724                 if (iommu != skip) {
725                         if (!ecap_sc_support(iommu->ecap)) {
726                                 ret = 0;
727                                 break;
728                         }
729                 }
730         }
731         rcu_read_unlock();
732
733         return ret;
734 }
735
736 static int domain_update_iommu_superpage(struct intel_iommu *skip)
737 {
738         struct dmar_drhd_unit *drhd;
739         struct intel_iommu *iommu;
740         int mask = 0xf;
741
742         if (!intel_iommu_superpage) {
743                 return 0;
744         }
745
746         /* set iommu_superpage to the smallest common denominator */
747         rcu_read_lock();
748         for_each_active_iommu(iommu, drhd) {
749                 if (iommu != skip) {
750                         mask &= cap_super_page_val(iommu->cap);
751                         if (!mask)
752                                 break;
753                 }
754         }
755         rcu_read_unlock();
756
757         return fls(mask);
758 }
759
760 /* Some capabilities may be different across iommus */
761 static void domain_update_iommu_cap(struct dmar_domain *domain)
762 {
763         domain_update_iommu_coherency(domain);
764         domain->iommu_snooping = domain_update_iommu_snooping(NULL);
765         domain->iommu_superpage = domain_update_iommu_superpage(NULL);
766 }
767
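/*
 * Return the context entry for (bus, devfn), optionally allocating the
 * context table. In extended (ECS) mode the root entry's lo and hi halves
 * point to separate tables for devfn 0x00-0x7f and 0x80-0xff, and each
 * extended entry occupies two struct context_entry slots, hence the devfn
 * adjustment below.
 */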
768 struct context_entry *iommu_context_addr(struct intel_iommu *iommu, u8 bus,
769                                          u8 devfn, int alloc)
770 {
771         struct root_entry *root = &iommu->root_entry[bus];
772         struct context_entry *context;
773         u64 *entry;
774
775         entry = &root->lo;
776         if (ecs_enabled(iommu)) {
777                 if (devfn >= 0x80) {
778                         devfn -= 0x80;
779                         entry = &root->hi;
780                 }
781                 devfn *= 2;
782         }
783         if (*entry & 1)
784                 context = phys_to_virt(*entry & VTD_PAGE_MASK);
785         else {
786                 unsigned long phy_addr;
787                 if (!alloc)
788                         return NULL;
789
790                 context = alloc_pgtable_page(iommu->node);
791                 if (!context)
792                         return NULL;
793
794                 __iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE);
795                 phy_addr = virt_to_phys((void *)context);
796                 *entry = phy_addr | 1;
797                 __iommu_flush_cache(iommu, entry, sizeof(*entry));
798         }
799         return &context[devfn];
800 }
801
802 static int iommu_dummy(struct device *dev)
803 {
804         return dev->archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO;
805 }
806
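/*
 * Find the IOMMU (DRHD unit) whose device scope covers @dev. VFs are
 * looked up via their PF and ACPI devices via their ACPI companion; a
 * unit with include_all set matches any remaining PCI device. On success
 * the matching bus/devfn is returned through @bus and @devfn.
 */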
807 static struct intel_iommu *device_to_iommu(struct device *dev, u8 *bus, u8 *devfn)
808 {
809         struct dmar_drhd_unit *drhd = NULL;
810         struct intel_iommu *iommu;
811         struct device *tmp;
812         struct pci_dev *ptmp, *pdev = NULL;
813         u16 segment = 0;
814         int i;
815
816         if (iommu_dummy(dev))
817                 return NULL;
818
819         if (dev_is_pci(dev)) {
820                 struct pci_dev *pf_pdev;
821
822                 pdev = to_pci_dev(dev);
823
824 #ifdef CONFIG_X86
825                 /* VMD child devices currently cannot be handled individually */
826                 if (is_vmd(pdev->bus))
827                         return NULL;
828 #endif
829
830                 /* VFs aren't listed in scope tables; we need to look up
831                  * the PF instead to find the IOMMU. */
832                 pf_pdev = pci_physfn(pdev);
833                 dev = &pf_pdev->dev;
834                 segment = pci_domain_nr(pdev->bus);
835         } else if (has_acpi_companion(dev))
836                 dev = &ACPI_COMPANION(dev)->dev;
837
838         rcu_read_lock();
839         for_each_active_iommu(iommu, drhd) {
840                 if (pdev && segment != drhd->segment)
841                         continue;
842
843                 for_each_active_dev_scope(drhd->devices,
844                                           drhd->devices_cnt, i, tmp) {
845                         if (tmp == dev) {
846                                 /* For a VF use its original BDF# not that of the PF
847                                  * which we used for the IOMMU lookup. Strictly speaking
848                                  * we could do this for all PCI devices; we only need to
849                                  * get the BDF# from the scope table for ACPI matches. */
850                                 if (pdev && pdev->is_virtfn)
851                                         goto got_pdev;
852
853                                 *bus = drhd->devices[i].bus;
854                                 *devfn = drhd->devices[i].devfn;
855                                 goto out;
856                         }
857
858                         if (!pdev || !dev_is_pci(tmp))
859                                 continue;
860
861                         ptmp = to_pci_dev(tmp);
862                         if (ptmp->subordinate &&
863                             ptmp->subordinate->number <= pdev->bus->number &&
864                             ptmp->subordinate->busn_res.end >= pdev->bus->number)
865                                 goto got_pdev;
866                 }
867
868                 if (pdev && drhd->include_all) {
869                 got_pdev:
870                         *bus = pdev->bus->number;
871                         *devfn = pdev->devfn;
872                         goto out;
873                 }
874         }
875         iommu = NULL;
876  out:
877         rcu_read_unlock();
878
879         return iommu;
880 }
881
882 static void domain_flush_cache(struct dmar_domain *domain,
883                                void *addr, int size)
884 {
885         if (!domain->iommu_coherency)
886                 clflush_cache_range(addr, size);
887 }
888
889 static int device_context_mapped(struct intel_iommu *iommu, u8 bus, u8 devfn)
890 {
891         struct context_entry *context;
892         int ret = 0;
893         unsigned long flags;
894
895         spin_lock_irqsave(&iommu->lock, flags);
896         context = iommu_context_addr(iommu, bus, devfn, 0);
897         if (context)
898                 ret = context_present(context);
899         spin_unlock_irqrestore(&iommu->lock, flags);
900         return ret;
901 }
902
903 static void free_context_table(struct intel_iommu *iommu)
904 {
905         int i;
906         unsigned long flags;
907         struct context_entry *context;
908
909         spin_lock_irqsave(&iommu->lock, flags);
910         if (!iommu->root_entry) {
911                 goto out;
912         }
913         for (i = 0; i < ROOT_ENTRY_NR; i++) {
914                 context = iommu_context_addr(iommu, i, 0, 0);
915                 if (context)
916                         free_pgtable_page(context);
917
918                 if (!ecs_enabled(iommu))
919                         continue;
920
921                 context = iommu_context_addr(iommu, i, 0x80, 0);
922                 if (context)
923                         free_pgtable_page(context);
924
925         }
926         free_pgtable_page(iommu->root_entry);
927         iommu->root_entry = NULL;
928 out:
929         spin_unlock_irqrestore(&iommu->lock, flags);
930 }
931
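/*
 * Walk the page table for @pfn down to *target_level, allocating missing
 * intermediate tables on the way (concurrent allocations are resolved
 * with cmpxchg64). A *target_level of 0 means look up only: descend
 * without allocating and stop at the first superpage or non-present
 * entry (or the level-1 pte); on return *target_level is then set to the
 * level actually reached.
 */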
932 static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain,
933                                       unsigned long pfn, int *target_level)
934 {
935         struct dma_pte *parent, *pte = NULL;
936         int level = agaw_to_level(domain->agaw);
937         int offset;
938
939         BUG_ON(!domain->pgd);
940
941         if (!domain_pfn_supported(domain, pfn))
942                 /* Address beyond IOMMU's addressing capabilities. */
943                 return NULL;
944
945         parent = domain->pgd;
946
947         while (1) {
948                 void *tmp_page;
949
950                 offset = pfn_level_offset(pfn, level);
951                 pte = &parent[offset];
952                 if (!*target_level && (dma_pte_superpage(pte) || !dma_pte_present(pte)))
953                         break;
954                 if (level == *target_level)
955                         break;
956
957                 if (!dma_pte_present(pte)) {
958                         uint64_t pteval;
959
960                         tmp_page = alloc_pgtable_page(domain->nid);
961
962                         if (!tmp_page)
963                                 return NULL;
964
965                         domain_flush_cache(domain, tmp_page, VTD_PAGE_SIZE);
966                         pteval = ((uint64_t)virt_to_dma_pfn(tmp_page) << VTD_PAGE_SHIFT) | DMA_PTE_READ | DMA_PTE_WRITE;
967                         if (cmpxchg64(&pte->val, 0ULL, pteval))
968                                 /* Someone else set it while we were thinking; use theirs. */
969                                 free_pgtable_page(tmp_page);
970                         else
971                                 domain_flush_cache(domain, pte, sizeof(*pte));
972                 }
973                 if (level == 1)
974                         break;
975
976                 parent = phys_to_virt(dma_pte_addr(pte));
977                 level--;
978         }
979
980         if (!*target_level)
981                 *target_level = level;
982
983         return pte;
984 }
985
986
987 /* return the address's pte at a specific level */
988 static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain,
989                                          unsigned long pfn,
990                                          int level, int *large_page)
991 {
992         struct dma_pte *parent, *pte = NULL;
993         int total = agaw_to_level(domain->agaw);
994         int offset;
995
996         parent = domain->pgd;
997         while (level <= total) {
998                 offset = pfn_level_offset(pfn, total);
999                 pte = &parent[offset];
1000                 if (level == total)
1001                         return pte;
1002
1003                 if (!dma_pte_present(pte)) {
1004                         *large_page = total;
1005                         break;
1006                 }
1007
1008                 if (dma_pte_superpage(pte)) {
1009                         *large_page = total;
1010                         return pte;
1011                 }
1012
1013                 parent = phys_to_virt(dma_pte_addr(pte));
1014                 total--;
1015         }
1016         return NULL;
1017 }
1018
1019 /* clear last-level ptes; a tlb flush should follow */
1020 static void dma_pte_clear_range(struct dmar_domain *domain,
1021                                 unsigned long start_pfn,
1022                                 unsigned long last_pfn)
1023 {
1024         unsigned int large_page = 1;
1025         struct dma_pte *first_pte, *pte;
1026
1027         BUG_ON(!domain_pfn_supported(domain, start_pfn));
1028         BUG_ON(!domain_pfn_supported(domain, last_pfn));
1029         BUG_ON(start_pfn > last_pfn);
1030
1031         /* we don't need a lock here; nobody else touches the iova range */
1032         do {
1033                 large_page = 1;
1034                 first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1, &large_page);
1035                 if (!pte) {
1036                         start_pfn = align_to_level(start_pfn + 1, large_page + 1);
1037                         continue;
1038                 }
1039                 do {
1040                         dma_clear_pte(pte);
1041                         start_pfn += lvl_to_nr_pages(large_page);
1042                         pte++;
1043                 } while (start_pfn <= last_pfn && !first_pte_in_page(pte));
1044
1045                 domain_flush_cache(domain, first_pte,
1046                                    (void *)pte - (void *)first_pte);
1047
1048         } while (start_pfn && start_pfn <= last_pfn);
1049 }
1050
1051 static void dma_pte_free_level(struct dmar_domain *domain, int level,
1052                                int retain_level, struct dma_pte *pte,
1053                                unsigned long pfn, unsigned long start_pfn,
1054                                unsigned long last_pfn)
1055 {
1056         pfn = max(start_pfn, pfn);
1057         pte = &pte[pfn_level_offset(pfn, level)];
1058
1059         do {
1060                 unsigned long level_pfn;
1061                 struct dma_pte *level_pte;
1062
1063                 if (!dma_pte_present(pte) || dma_pte_superpage(pte))
1064                         goto next;
1065
1066                 level_pfn = pfn & level_mask(level);
1067                 level_pte = phys_to_virt(dma_pte_addr(pte));
1068
1069                 if (level > 2) {
1070                         dma_pte_free_level(domain, level - 1, retain_level,
1071                                            level_pte, level_pfn, start_pfn,
1072                                            last_pfn);
1073                 }
1074
1075                 /*
1076                  * Free the page table if we're below the level we want to
1077                  * retain and the range covers the entire table.
1078                  */
1079                 if (level < retain_level && !(start_pfn > level_pfn ||
1080                       last_pfn < level_pfn + level_size(level) - 1)) {
1081                         dma_clear_pte(pte);
1082                         domain_flush_cache(domain, pte, sizeof(*pte));
1083                         free_pgtable_page(level_pte);
1084                 }
1085 next:
1086                 pfn += level_size(level);
1087         } while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1088 }
1089
1090 /*
1091  * clear last level (leaf) ptes and free page table pages below the
1092  * level we wish to keep intact.
1093  */
1094 static void dma_pte_free_pagetable(struct dmar_domain *domain,
1095                                    unsigned long start_pfn,
1096                                    unsigned long last_pfn,
1097                                    int retain_level)
1098 {
1099         BUG_ON(!domain_pfn_supported(domain, start_pfn));
1100         BUG_ON(!domain_pfn_supported(domain, last_pfn));
1101         BUG_ON(start_pfn > last_pfn);
1102
1103         dma_pte_clear_range(domain, start_pfn, last_pfn);
1104
1105         /* We don't need a lock here; nobody else touches the iova range */
1106         dma_pte_free_level(domain, agaw_to_level(domain->agaw), retain_level,
1107                            domain->pgd, 0, start_pfn, last_pfn);
1108
1109         /* free pgd */
1110         if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1111                 free_pgtable_page(domain->pgd);
1112                 domain->pgd = NULL;
1113         }
1114 }
1115
1116 /* When a page at a given level is being unlinked from its parent, we don't
1117    need to *modify* it at all. All we need to do is make a list of all the
1118    pages which can be freed just as soon as we've flushed the IOTLB and we
1119    know the hardware page-walk will no longer touch them.
1120    The 'pte' argument is the *parent* PTE, pointing to the page that is to
1121    be freed. */
1122 static struct page *dma_pte_list_pagetables(struct dmar_domain *domain,
1123                                             int level, struct dma_pte *pte,
1124                                             struct page *freelist)
1125 {
1126         struct page *pg;
1127
1128         pg = pfn_to_page(dma_pte_addr(pte) >> PAGE_SHIFT);
1129         pg->freelist = freelist;
1130         freelist = pg;
1131
1132         if (level == 1)
1133                 return freelist;
1134
1135         pte = page_address(pg);
1136         do {
1137                 if (dma_pte_present(pte) && !dma_pte_superpage(pte))
1138                         freelist = dma_pte_list_pagetables(domain, level - 1,
1139                                                            pte, freelist);
1140                 pte++;
1141         } while (!first_pte_in_page(pte));
1142
1143         return freelist;
1144 }
1145
1146 static struct page *dma_pte_clear_level(struct dmar_domain *domain, int level,
1147                                         struct dma_pte *pte, unsigned long pfn,
1148                                         unsigned long start_pfn,
1149                                         unsigned long last_pfn,
1150                                         struct page *freelist)
1151 {
1152         struct dma_pte *first_pte = NULL, *last_pte = NULL;
1153
1154         pfn = max(start_pfn, pfn);
1155         pte = &pte[pfn_level_offset(pfn, level)];
1156
1157         do {
1158                 unsigned long level_pfn;
1159
1160                 if (!dma_pte_present(pte))
1161                         goto next;
1162
1163                 level_pfn = pfn & level_mask(level);
1164
1165                 /* If range covers entire pagetable, free it */
1166                 if (start_pfn <= level_pfn &&
1167                     last_pfn >= level_pfn + level_size(level) - 1) {
1168                         /* These subordinate page tables are going away entirely. Don't
1169                            bother to clear them; we're just going to *free* them. */
1170                         if (level > 1 && !dma_pte_superpage(pte))
1171                                 freelist = dma_pte_list_pagetables(domain, level - 1, pte, freelist);
1172
1173                         dma_clear_pte(pte);
1174                         if (!first_pte)
1175                                 first_pte = pte;
1176                         last_pte = pte;
1177                 } else if (level > 1) {
1178                         /* Recurse down into a level that isn't *entirely* obsolete */
1179                         freelist = dma_pte_clear_level(domain, level - 1,
1180                                                        phys_to_virt(dma_pte_addr(pte)),
1181                                                        level_pfn, start_pfn, last_pfn,
1182                                                        freelist);
1183                 }
1184 next:
1185                 pfn += level_size(level);
1186         } while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1187
1188         if (first_pte)
1189                 domain_flush_cache(domain, first_pte,
1190                                    (void *)++last_pte - (void *)first_pte);
1191
1192         return freelist;
1193 }
1194
1195 /* We can't just free the pages because the IOMMU may still be walking
1196    the page tables, and may have cached the intermediate levels. The
1197    pages can only be freed after the IOTLB flush has been done. */
1198 static struct page *domain_unmap(struct dmar_domain *domain,
1199                                  unsigned long start_pfn,
1200                                  unsigned long last_pfn)
1201 {
1202         struct page *freelist = NULL;
1203
1204         BUG_ON(!domain_pfn_supported(domain, start_pfn));
1205         BUG_ON(!domain_pfn_supported(domain, last_pfn));
1206         BUG_ON(start_pfn > last_pfn);
1207
1208         /* we don't need a lock here; nobody else touches the iova range */
1209         freelist = dma_pte_clear_level(domain, agaw_to_level(domain->agaw),
1210                                        domain->pgd, 0, start_pfn, last_pfn, NULL);
1211
1212         /* free pgd */
1213         if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1214                 struct page *pgd_page = virt_to_page(domain->pgd);
1215                 pgd_page->freelist = freelist;
1216                 freelist = pgd_page;
1217
1218                 domain->pgd = NULL;
1219         }
1220
1221         return freelist;
1222 }
1223
1224 static void dma_free_pagelist(struct page *freelist)
1225 {
1226         struct page *pg;
1227
1228         while ((pg = freelist)) {
1229                 freelist = pg->freelist;
1230                 free_pgtable_page(page_address(pg));
1231         }
1232 }
1233
1234 static void iova_entry_free(unsigned long data)
1235 {
1236         struct page *freelist = (struct page *)data;
1237
1238         dma_free_pagelist(freelist);
1239 }
1240
1241 /* iommu handling */
1242 static int iommu_alloc_root_entry(struct intel_iommu *iommu)
1243 {
1244         struct root_entry *root;
1245         unsigned long flags;
1246
1247         root = (struct root_entry *)alloc_pgtable_page(iommu->node);
1248         if (!root) {
1249                 pr_err("Allocating root entry for %s failed\n",
1250                         iommu->name);
1251                 return -ENOMEM;
1252         }
1253
1254         __iommu_flush_cache(iommu, root, ROOT_SIZE);
1255
1256         spin_lock_irqsave(&iommu->lock, flags);
1257         iommu->root_entry = root;
1258         spin_unlock_irqrestore(&iommu->lock, flags);
1259
1260         return 0;
1261 }
1262
1263 static void iommu_set_root_entry(struct intel_iommu *iommu)
1264 {
1265         u64 addr;
1266         u32 sts;
1267         unsigned long flag;
1268
1269         addr = virt_to_phys(iommu->root_entry);
1270         if (ecs_enabled(iommu))
1271                 addr |= DMA_RTADDR_RTT;
1272
1273         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1274         dmar_writeq(iommu->reg + DMAR_RTADDR_REG, addr);
1275
1276         writel(iommu->gcmd | DMA_GCMD_SRTP, iommu->reg + DMAR_GCMD_REG);
1277
1278         /* Make sure the hardware has completed it */
1279         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1280                       readl, (sts & DMA_GSTS_RTPS), sts);
1281
1282         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1283 }
1284
1285 static void iommu_flush_write_buffer(struct intel_iommu *iommu)
1286 {
1287         u32 val;
1288         unsigned long flag;
1289
1290         if (!rwbf_quirk && !cap_rwbf(iommu->cap))
1291                 return;
1292
1293         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1294         writel(iommu->gcmd | DMA_GCMD_WBF, iommu->reg + DMAR_GCMD_REG);
1295
1296         /* Make sure the hardware has completed it */
1297         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1298                       readl, (!(val & DMA_GSTS_WBFS)), val);
1299
1300         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1301 }
1302
1303 /* return value determines if we need a write buffer flush */
1304 static void __iommu_flush_context(struct intel_iommu *iommu,
1305                                   u16 did, u16 source_id, u8 function_mask,
1306                                   u64 type)
1307 {
1308         u64 val = 0;
1309         unsigned long flag;
1310
1311         switch (type) {
1312         case DMA_CCMD_GLOBAL_INVL:
1313                 val = DMA_CCMD_GLOBAL_INVL;
1314                 break;
1315         case DMA_CCMD_DOMAIN_INVL:
1316                 val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
1317                 break;
1318         case DMA_CCMD_DEVICE_INVL:
1319                 val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
1320                         | DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
1321                 break;
1322         default:
1323                 BUG();
1324         }
1325         val |= DMA_CCMD_ICC;
1326
1327         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1328         dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);
1329
1330         /* Make sure the hardware has completed it */
1331         IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
1332                 dmar_readq, (!(val & DMA_CCMD_ICC)), val);
1333
1334         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1335 }
1336
1337 /* return value determines if we need a write buffer flush */
1338 static void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
1339                                 u64 addr, unsigned int size_order, u64 type)
1340 {
1341         int tlb_offset = ecap_iotlb_offset(iommu->ecap);
1342         u64 val = 0, val_iva = 0;
1343         unsigned long flag;
1344
1345         switch (type) {
1346         case DMA_TLB_GLOBAL_FLUSH:
1347                 /* global flush doesn't need to set IVA_REG */
1348                 val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
1349                 break;
1350         case DMA_TLB_DSI_FLUSH:
1351                 val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1352                 break;
1353         case DMA_TLB_PSI_FLUSH:
1354                 val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1355                 /* IH bit is passed in as part of address */
1356                 val_iva = size_order | addr;
1357                 break;
1358         default:
1359                 BUG();
1360         }
1361         /* Note: set drain read/write */
1362 #if 0
1363         /*
1364          * This is probably only needed to be extra safe; it looks like we
1365          * can ignore it without any impact.
1366          */
1367         if (cap_read_drain(iommu->cap))
1368                 val |= DMA_TLB_READ_DRAIN;
1369 #endif
1370         if (cap_write_drain(iommu->cap))
1371                 val |= DMA_TLB_WRITE_DRAIN;
1372
1373         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1374         /* Note: Only uses first TLB reg currently */
1375         if (val_iva)
1376                 dmar_writeq(iommu->reg + tlb_offset, val_iva);
1377         dmar_writeq(iommu->reg + tlb_offset + 8, val);
1378
1379         /* Make sure the hardware has completed it */
1380         IOMMU_WAIT_OP(iommu, tlb_offset + 8,
1381                 dmar_readq, (!(val & DMA_TLB_IVT)), val);
1382
1383         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1384
1385         /* check IOTLB invalidation granularity */
1386         if (DMA_TLB_IAIG(val) == 0)
1387                 pr_err("Flush IOTLB failed\n");
1388         if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
1389                 pr_debug("TLB flush request %Lx, actual %Lx\n",
1390                         (unsigned long long)DMA_TLB_IIRG(type),
1391                         (unsigned long long)DMA_TLB_IAIG(val));
1392 }
1393
1394 static struct device_domain_info *
1395 iommu_support_dev_iotlb (struct dmar_domain *domain, struct intel_iommu *iommu,
1396                          u8 bus, u8 devfn)
1397 {
1398         struct device_domain_info *info;
1399
1400         assert_spin_locked(&device_domain_lock);
1401
1402         if (!iommu->qi)
1403                 return NULL;
1404
1405         list_for_each_entry(info, &domain->devices, link)
1406                 if (info->iommu == iommu && info->bus == bus &&
1407                     info->devfn == devfn) {
1408                         if (info->ats_supported && info->dev)
1409                                 return info;
1410                         break;
1411                 }
1412
1413         return NULL;
1414 }
1415
1416 static void domain_update_iotlb(struct dmar_domain *domain)
1417 {
1418         struct device_domain_info *info;
1419         bool has_iotlb_device = false;
1420
1421         assert_spin_locked(&device_domain_lock);
1422
1423         list_for_each_entry(info, &domain->devices, link) {
1424                 struct pci_dev *pdev;
1425
1426                 if (!info->dev || !dev_is_pci(info->dev))
1427                         continue;
1428
1429                 pdev = to_pci_dev(info->dev);
1430                 if (pdev->ats_enabled) {
1431                         has_iotlb_device = true;
1432                         break;
1433                 }
1434         }
1435
1436         domain->has_iotlb_device = has_iotlb_device;
1437 }
1438
1439 static void iommu_enable_dev_iotlb(struct device_domain_info *info)
1440 {
1441         struct pci_dev *pdev;
1442
1443         assert_spin_locked(&device_domain_lock);
1444
1445         if (!info || !dev_is_pci(info->dev))
1446                 return;
1447
1448         pdev = to_pci_dev(info->dev);
1449         /* For an IOMMU that supports device IOTLB throttling (DIT), we assign
1450          * the PFSID to a VF's invalidation descriptors so that the IOMMU HW can
1451          * gauge queue depth at the PF level. If DIT is not set, PFSID is treated
1452          * as reserved and should be set to 0.
1453          */
1454         if (!ecap_dit(info->iommu->ecap))
1455                 info->pfsid = 0;
1456         else {
1457                 struct pci_dev *pf_pdev;
1458
1459                 /* pdev will be returned if device is not a vf */
1460                 pf_pdev = pci_physfn(pdev);
1461                 info->pfsid = PCI_DEVID(pf_pdev->bus->number, pf_pdev->devfn);
1462         }
1463
1464 #ifdef CONFIG_INTEL_IOMMU_SVM
1465         /* The PCIe spec, in its wisdom, declares that the behaviour of
1466            the device if you enable PASID support after ATS support is
1467            undefined. So always enable PASID support on devices which
1468            have it, even if we can't yet know if we're ever going to
1469            use it. */
1470         if (info->pasid_supported && !pci_enable_pasid(pdev, info->pasid_supported & ~1))
1471                 info->pasid_enabled = 1;
1472
1473         if (info->pri_supported && !pci_reset_pri(pdev) && !pci_enable_pri(pdev, 32))
1474                 info->pri_enabled = 1;
1475 #endif
1476         if (!pdev->untrusted && info->ats_supported &&
1477             !pci_enable_ats(pdev, VTD_PAGE_SHIFT)) {
1478                 info->ats_enabled = 1;
1479                 domain_update_iotlb(info->domain);
1480                 info->ats_qdep = pci_ats_queue_depth(pdev);
1481         }
1482 }
1483
1484 static void iommu_disable_dev_iotlb(struct device_domain_info *info)
1485 {
1486         struct pci_dev *pdev;
1487
1488         assert_spin_locked(&device_domain_lock);
1489
1490         if (!dev_is_pci(info->dev))
1491                 return;
1492
1493         pdev = to_pci_dev(info->dev);
1494
1495         if (info->ats_enabled) {
1496                 pci_disable_ats(pdev);
1497                 info->ats_enabled = 0;
1498                 domain_update_iotlb(info->domain);
1499         }
1500 #ifdef CONFIG_INTEL_IOMMU_SVM
1501         if (info->pri_enabled) {
1502                 pci_disable_pri(pdev);
1503                 info->pri_enabled = 0;
1504         }
1505         if (info->pasid_enabled) {
1506                 pci_disable_pasid(pdev);
1507                 info->pasid_enabled = 0;
1508         }
1509 #endif
1510 }
1511
1512 static void iommu_flush_dev_iotlb(struct dmar_domain *domain,
1513                                   u64 addr, unsigned mask)
1514 {
1515         u16 sid, qdep;
1516         unsigned long flags;
1517         struct device_domain_info *info;
1518
1519         if (!domain->has_iotlb_device)
1520                 return;
1521
1522         spin_lock_irqsave(&device_domain_lock, flags);
1523         list_for_each_entry(info, &domain->devices, link) {
1524                 if (!info->ats_enabled)
1525                         continue;
1526
1527                 sid = info->bus << 8 | info->devfn;
1528                 qdep = info->ats_qdep;
1529                 qi_flush_dev_iotlb(info->iommu, sid, info->pfsid,
1530                                 qdep, addr, mask);
1531         }
1532         spin_unlock_irqrestore(&device_domain_lock, flags);
1533 }
1534
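/*
 * Page-selective IOTLB invalidation (PSI). The invalidation mask is the
 * log2 of the page count rounded up to a power of two, e.g. flushing 300
 * pages yields mask = ilog2(512) = 9, i.e. a 512-page invalidation.
 */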
1535 static void iommu_flush_iotlb_psi(struct intel_iommu *iommu,
1536                                   struct dmar_domain *domain,
1537                                   unsigned long pfn, unsigned int pages,
1538                                   int ih, int map)
1539 {
1540         unsigned int mask = ilog2(__roundup_pow_of_two(pages));
1541         uint64_t addr = (uint64_t)pfn << VTD_PAGE_SHIFT;
1542         u16 did = domain->iommu_did[iommu->seq_id];
1543
1544         BUG_ON(pages == 0);
1545
1546         if (ih)
1547                 ih = 1 << 6;
1548         /*
1549          * Fall back to domain-selective flush if there is no PSI support or
1550          * the size is too big.
1551          * PSI requires the page count to be a power of two, and the base
1552          * address is naturally aligned to that size.
1553          */
1554         if (!cap_pgsel_inv(iommu->cap) || mask > cap_max_amask_val(iommu->cap))
1555                 iommu->flush.flush_iotlb(iommu, did, 0, 0,
1556                                                 DMA_TLB_DSI_FLUSH);
1557         else
1558                 iommu->flush.flush_iotlb(iommu, did, addr | ih, mask,
1559                                                 DMA_TLB_PSI_FLUSH);
1560
1561         /*
1562          * In caching mode, changes of pages from non-present to present require
1563          * flush. However, device IOTLB doesn't need to be flushed in this case.
1564          */
1565         if (!cap_caching_mode(iommu->cap) || !map)
1566                 iommu_flush_dev_iotlb(domain, addr, mask);
1567 }
1568
1569 /* Notification for newly created mappings */
1570 static inline void __mapping_notify_one(struct intel_iommu *iommu,
1571                                         struct dmar_domain *domain,
1572                                         unsigned long pfn, unsigned int pages)
1573 {
1574         /* It's a non-present to present mapping. Only flush if caching mode */
1575         if (cap_caching_mode(iommu->cap))
1576                 iommu_flush_iotlb_psi(iommu, domain, pfn, pages, 0, 1);
1577         else
1578                 iommu_flush_write_buffer(iommu);
1579 }
1580
1581 static void iommu_flush_iova(struct iova_domain *iovad)
1582 {
1583         struct dmar_domain *domain;
1584         int idx;
1585
1586         domain = container_of(iovad, struct dmar_domain, iovad);
1587
1588         for_each_domain_iommu(idx, domain) {
1589                 struct intel_iommu *iommu = g_iommus[idx];
1590                 u16 did = domain->iommu_did[iommu->seq_id];
1591
1592                 iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH);
1593
1594                 if (!cap_caching_mode(iommu->cap))
1595                         iommu_flush_dev_iotlb(get_iommu_domain(iommu, did),
1596                                               0, MAX_AGAW_PFN_WIDTH);
1597         }
1598 }
1599
1600 static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
1601 {
1602         u32 pmen;
1603         unsigned long flags;
1604
1605         raw_spin_lock_irqsave(&iommu->register_lock, flags);
1606         pmen = readl(iommu->reg + DMAR_PMEN_REG);
1607         pmen &= ~DMA_PMEN_EPM;
1608         writel(pmen, iommu->reg + DMAR_PMEN_REG);
1609
1610         /* wait for the protected region status bit to clear */
1611         IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
1612                 readl, !(pmen & DMA_PMEN_PRS), pmen);
1613
1614         raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1615 }
1616
1617 static void iommu_enable_translation(struct intel_iommu *iommu)
1618 {
1619         u32 sts;
1620         unsigned long flags;
1621
1622         raw_spin_lock_irqsave(&iommu->register_lock, flags);
1623         iommu->gcmd |= DMA_GCMD_TE;
1624         writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1625
1626         /* Make sure hardware completes it */
1627         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1628                       readl, (sts & DMA_GSTS_TES), sts);
1629
1630         raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1631 }
1632
1633 static void iommu_disable_translation(struct intel_iommu *iommu)
1634 {
1635         u32 sts;
1636         unsigned long flag;
1637
1638         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1639         iommu->gcmd &= ~DMA_GCMD_TE;
1640         writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1641
1642         /* Make sure hardware completes it */
1643         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1644                       readl, (!(sts & DMA_GSTS_TES)), sts);
1645
1646         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1647 }
1648
1649
1650 static int iommu_init_domains(struct intel_iommu *iommu)
1651 {
1652         u32 ndomains, nlongs;
1653         size_t size;
1654
1655         ndomains = cap_ndoms(iommu->cap);
1656         pr_debug("%s: Number of Domains supported <%d>\n",
1657                  iommu->name, ndomains);
1658         nlongs = BITS_TO_LONGS(ndomains);
1659
1660         spin_lock_init(&iommu->lock);
1661
1662         iommu->domain_ids = kcalloc(nlongs, sizeof(unsigned long), GFP_KERNEL);
1663         if (!iommu->domain_ids) {
1664                 pr_err("%s: Allocating domain id array failed\n",
1665                        iommu->name);
1666                 return -ENOMEM;
1667         }
1668
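        /*
         * iommu->domains is a two-level array: one top-level slot per group
         * of 256 domain-ids, each pointing to an on-demand array of 256
         * domain pointers. E.g. with ndomains = 65536 the top level has 256
         * slots; only slot 0 (ids 0-255) is populated here.
         */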
1669         size = (ALIGN(ndomains, 256) >> 8) * sizeof(struct dmar_domain **);
1670         iommu->domains = kzalloc(size, GFP_KERNEL);
1671
1672         if (iommu->domains) {
1673                 size = 256 * sizeof(struct dmar_domain *);
1674                 iommu->domains[0] = kzalloc(size, GFP_KERNEL);
1675         }
1676
1677         if (!iommu->domains || !iommu->domains[0]) {
1678                 pr_err("%s: Allocating domain array failed\n",
1679                        iommu->name);
1680                 kfree(iommu->domain_ids);
1681                 kfree(iommu->domains);
1682                 iommu->domain_ids = NULL;
1683                 iommu->domains    = NULL;
1684                 return -ENOMEM;
1685         }
1686
1689         /*
1690          * If Caching mode is set, then invalid translations are tagged
1691          * with domain-id 0, hence we need to pre-allocate it. We also
1692          * use domain-id 0 as a marker for non-allocated domain-id, so
1693          * make sure it is not used for a real domain.
1694          */
1695         set_bit(0, iommu->domain_ids);
1696
1697         return 0;
1698 }
1699
1700 static void disable_dmar_iommu(struct intel_iommu *iommu)
1701 {
1702         struct device_domain_info *info, *tmp;
1703         unsigned long flags;
1704
1705         if (!iommu->domains || !iommu->domain_ids)
1706                 return;
1707
1708 again:
1709         spin_lock_irqsave(&device_domain_lock, flags);
1710         list_for_each_entry_safe(info, tmp, &device_domain_list, global) {
1711                 struct dmar_domain *domain;
1712
1713                 if (info->iommu != iommu)
1714                         continue;
1715
1716                 if (!info->dev || !info->domain)
1717                         continue;
1718
1719                 domain = info->domain;
1720
1721                 __dmar_remove_one_dev_info(info);
1722
1723                 if (!domain_type_is_vm_or_si(domain)) {
1724                         /*
1725                          * The domain_exit() function  can't be called under
1726                          * device_domain_lock, as it takes this lock itself.
1727                          * So release the lock here and re-run the loop
1728                          * afterwards.
1729                          */
1730                         spin_unlock_irqrestore(&device_domain_lock, flags);
1731                         domain_exit(domain);
1732                         goto again;
1733                 }
1734         }
1735         spin_unlock_irqrestore(&device_domain_lock, flags);
1736
1737         if (iommu->gcmd & DMA_GCMD_TE)
1738                 iommu_disable_translation(iommu);
1739 }
1740
1741 static void free_dmar_iommu(struct intel_iommu *iommu)
1742 {
1743         if ((iommu->domains) && (iommu->domain_ids)) {
1744                 int elems = ALIGN(cap_ndoms(iommu->cap), 256) >> 8;
1745                 int i;
1746
1747                 for (i = 0; i < elems; i++)
1748                         kfree(iommu->domains[i]);
1749                 kfree(iommu->domains);
1750                 kfree(iommu->domain_ids);
1751                 iommu->domains = NULL;
1752                 iommu->domain_ids = NULL;
1753         }
1754
1755         g_iommus[iommu->seq_id] = NULL;
1756
1757         /* free context mapping */
1758         free_context_table(iommu);
1759
1760 #ifdef CONFIG_INTEL_IOMMU_SVM
1761         if (pasid_enabled(iommu)) {
1762                 if (ecap_prs(iommu->ecap))
1763                         intel_svm_finish_prq(iommu);
1764                 intel_svm_exit(iommu);
1765         }
1766 #endif
1767 }
1768
1769 static struct dmar_domain *alloc_domain(int flags)
1770 {
1771         struct dmar_domain *domain;
1772
1773         domain = alloc_domain_mem();
1774         if (!domain)
1775                 return NULL;
1776
1777         memset(domain, 0, sizeof(*domain));
1778         domain->nid = -1;
1779         domain->flags = flags;
1780         domain->has_iotlb_device = false;
1781         INIT_LIST_HEAD(&domain->devices);
1782
1783         return domain;
1784 }
1785
1786 /* Must be called with iommu->lock */
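/*
 * The first attachment of a domain to a given IOMMU allocates a hardware
 * domain-id from that IOMMU's bitmap (id 0 stays reserved); subsequent
 * attachments of the same domain only bump the reference counts.
 */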
1787 static int domain_attach_iommu(struct dmar_domain *domain,
1788                                struct intel_iommu *iommu)
1789 {
1790         unsigned long ndomains;
1791         int num;
1792
1793         assert_spin_locked(&device_domain_lock);
1794         assert_spin_locked(&iommu->lock);
1795
1796         domain->iommu_refcnt[iommu->seq_id] += 1;
1797         domain->iommu_count += 1;
1798         if (domain->iommu_refcnt[iommu->seq_id] == 1) {
1799                 ndomains = cap_ndoms(iommu->cap);
1800                 num      = find_first_zero_bit(iommu->domain_ids, ndomains);
1801
1802                 if (num >= ndomains) {
1803                         pr_err("%s: No free domain ids\n", iommu->name);
1804                         domain->iommu_refcnt[iommu->seq_id] -= 1;
1805                         domain->iommu_count -= 1;
1806                         return -ENOSPC;
1807                 }
1808
1809                 set_bit(num, iommu->domain_ids);
1810                 set_iommu_domain(iommu, num, domain);
1811
1812                 domain->iommu_did[iommu->seq_id] = num;
1813                 domain->nid                      = iommu->node;
1814
1815                 domain_update_iommu_cap(domain);
1816         }
1817
1818         return 0;
1819 }
1820
1821 static int domain_detach_iommu(struct dmar_domain *domain,
1822                                struct intel_iommu *iommu)
1823 {
1824         int num, count = INT_MAX;
1825
1826         assert_spin_locked(&device_domain_lock);
1827         assert_spin_locked(&iommu->lock);
1828
1829         domain->iommu_refcnt[iommu->seq_id] -= 1;
1830         count = --domain->iommu_count;
1831         if (domain->iommu_refcnt[iommu->seq_id] == 0) {
1832                 num = domain->iommu_did[iommu->seq_id];
1833                 clear_bit(num, iommu->domain_ids);
1834                 set_iommu_domain(iommu, num, NULL);
1835
1836                 domain_update_iommu_cap(domain);
1837                 domain->iommu_did[iommu->seq_id] = 0;
1838         }
1839
1840         return count;
1841 }
1842
1843 static struct iova_domain reserved_iova_list;
1844 static struct lock_class_key reserved_rbtree_key;
1845
1846 static int dmar_init_reserved_ranges(void)
1847 {
1848         struct pci_dev *pdev = NULL;
1849         struct iova *iova;
1850         int i;
1851
1852         init_iova_domain(&reserved_iova_list, VTD_PAGE_SIZE, IOVA_START_PFN);
1853
1854         lockdep_set_class(&reserved_iova_list.iova_rbtree_lock,
1855                 &reserved_rbtree_key);
1856
1857         /* IOAPIC ranges shouldn't be accessed by DMA */
1858         iova = reserve_iova(&reserved_iova_list, IOVA_PFN(IOAPIC_RANGE_START),
1859                 IOVA_PFN(IOAPIC_RANGE_END));
1860         if (!iova) {
1861                 pr_err("Reserve IOAPIC range failed\n");
1862                 return -ENODEV;
1863         }
1864
1865         /* Reserve all PCI MMIO to avoid peer-to-peer access */
1866         for_each_pci_dev(pdev) {
1867                 struct resource *r;
1868
1869                 for (i = 0; i < PCI_NUM_RESOURCES; i++) {
1870                         r = &pdev->resource[i];
1871                         if (!r->flags || !(r->flags & IORESOURCE_MEM))
1872                                 continue;
1873                         iova = reserve_iova(&reserved_iova_list,
1874                                             IOVA_PFN(r->start),
1875                                             IOVA_PFN(r->end));
1876                         if (!iova) {
1877                                 pr_err("Reserve iova failed\n");
1878                                 return -ENODEV;
1879                         }
1880                 }
1881         }
1882         return 0;
1883 }
1884
1885 static void domain_reserve_special_ranges(struct dmar_domain *domain)
1886 {
1887         copy_reserved_iova(&reserved_iova_list, &domain->iovad);
1888 }
1889
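/*
 * Round a guest address width up to something the page-table walk can
 * express: 12 bits of page offset plus a whole number of 9-bit levels,
 * capped at 64. E.g. gaw = 40 gives (40 - 12) % 9 = 1, so agaw becomes
 * 40 + 9 - 1 = 48.
 */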
1890 static inline int guestwidth_to_adjustwidth(int gaw)
1891 {
1892         int agaw;
1893         int r = (gaw - 12) % 9;
1894
1895         if (r == 0)
1896                 agaw = gaw;
1897         else
1898                 agaw = gaw + 9 - r;
1899         if (agaw > 64)
1900                 agaw = 64;
1901         return agaw;
1902 }
1903
1904 static int domain_init(struct dmar_domain *domain, struct intel_iommu *iommu,
1905                        int guest_width)
1906 {
1907         int adjust_width, agaw;
1908         unsigned long sagaw;
1909         int err;
1910
1911         init_iova_domain(&domain->iovad, VTD_PAGE_SIZE, IOVA_START_PFN);
1912
1913         err = init_iova_flush_queue(&domain->iovad,
1914                                     iommu_flush_iova, iova_entry_free);
1915         if (err)
1916                 return err;
1917
1918         domain_reserve_special_ranges(domain);
1919
1920         /* calculate AGAW */
1921         if (guest_width > cap_mgaw(iommu->cap))
1922                 guest_width = cap_mgaw(iommu->cap);
1923         domain->gaw = guest_width;
1924         adjust_width = guestwidth_to_adjustwidth(guest_width);
1925         agaw = width_to_agaw(adjust_width);
1926         sagaw = cap_sagaw(iommu->cap);
1927         if (!test_bit(agaw, &sagaw)) {
1928                 /* hardware doesn't support it, choose a bigger one */
1929                 pr_debug("Hardware doesn't support agaw %d\n", agaw);
1930                 agaw = find_next_bit(&sagaw, 5, agaw);
1931                 if (agaw >= 5)
1932                         return -ENODEV;
1933         }
1934         domain->agaw = agaw;
1935
1936         if (ecap_coherent(iommu->ecap))
1937                 domain->iommu_coherency = 1;
1938         else
1939                 domain->iommu_coherency = 0;
1940
1941         if (ecap_sc_support(iommu->ecap))
1942                 domain->iommu_snooping = 1;
1943         else
1944                 domain->iommu_snooping = 0;
1945
1946         if (intel_iommu_superpage)
1947                 domain->iommu_superpage = fls(cap_super_page_val(iommu->cap));
1948         else
1949                 domain->iommu_superpage = 0;
1950
1951         domain->nid = iommu->node;
1952
1953         /* always allocate the top pgd */
1954         domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
1955         if (!domain->pgd)
1956                 return -ENOMEM;
1957         __iommu_flush_cache(iommu, domain->pgd, PAGE_SIZE);
1958         return 0;
1959 }
1960
1961 static void domain_exit(struct dmar_domain *domain)
1962 {
1963         struct page *freelist = NULL;
1964
1965         /* Domain 0 is reserved, so don't process it */
1966         if (!domain)
1967                 return;
1968
1969         /* Remove associated devices and clear attached or cached domains */
1970         rcu_read_lock();
1971         domain_remove_dev_info(domain);
1972         rcu_read_unlock();
1973
1974         /* destroy iovas */
1975         put_iova_domain(&domain->iovad);
1976
1977         freelist = domain_unmap(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
1978
1979         dma_free_pagelist(freelist);
1980
1981         free_domain_mem(domain);
1982 }
1983
1984 static int domain_context_mapping_one(struct dmar_domain *domain,
1985                                       struct intel_iommu *iommu,
1986                                       u8 bus, u8 devfn)
1987 {
1988         u16 did = domain->iommu_did[iommu->seq_id];
1989         int translation = CONTEXT_TT_MULTI_LEVEL;
1990         struct device_domain_info *info = NULL;
1991         struct context_entry *context;
1992         unsigned long flags;
1993         struct dma_pte *pgd;
1994         int ret, agaw;
1995
1996         WARN_ON(did == 0);
1997
1998         if (hw_pass_through && domain_type_is_si(domain))
1999                 translation = CONTEXT_TT_PASS_THROUGH;
2000
2001         pr_debug("Set context mapping for %02x:%02x.%d\n",
2002                 bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
2003
2004         BUG_ON(!domain->pgd);
2005
2006         spin_lock_irqsave(&device_domain_lock, flags);
2007         spin_lock(&iommu->lock);
2008
2009         ret = -ENOMEM;
2010         context = iommu_context_addr(iommu, bus, devfn, 1);
2011         if (!context)
2012                 goto out_unlock;
2013
2014         ret = 0;
2015         if (context_present(context))
2016                 goto out_unlock;
2017
2018         /*
2019          * For kdump cases, old valid entries may be cached due to the
2020          * in-flight DMA and copied pgtable, but there is no unmapping
2021          * behaviour for them, thus we need an explicit cache flush for
2022          * the newly-mapped device. For kdump, at this point, the device
2023          * is supposed to have finished reset at its driver probe stage, so
2024          * no in-flight DMA will exist, and we don't need to worry about it
2025          * hereafter.
2026          */
2027         if (context_copied(context)) {
2028                 u16 did_old = context_domain_id(context);
2029
2030                 if (did_old < cap_ndoms(iommu->cap)) {
2031                         iommu->flush.flush_context(iommu, did_old,
2032                                                    (((u16)bus) << 8) | devfn,
2033                                                    DMA_CCMD_MASK_NOBIT,
2034                                                    DMA_CCMD_DEVICE_INVL);
2035                         iommu->flush.flush_iotlb(iommu, did_old, 0, 0,
2036                                                  DMA_TLB_DSI_FLUSH);
2037                 }
2038         }
2039
2040         pgd = domain->pgd;
2041
2042         context_clear_entry(context);
2043         context_set_domain_id(context, did);
2044
2045         /*
2046          * Skip top levels of page tables for IOMMUs which have a smaller
2047          * agaw than the default.  Unnecessary for PT mode.
2048          */
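        /*
         * E.g. if the domain was built for a 48-bit guest address width but
         * this IOMMU only handles 39 bits, the loop below steps down through
         * one populated top-level entry so the context entry points at a
         * page table of the depth the hardware expects.
         */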
2049         if (translation != CONTEXT_TT_PASS_THROUGH) {
2050                 for (agaw = domain->agaw; agaw != iommu->agaw; agaw--) {
2051                         ret = -ENOMEM;
2052                         pgd = phys_to_virt(dma_pte_addr(pgd));
2053                         if (!dma_pte_present(pgd))
2054                                 goto out_unlock;
2055                 }
2056
2057                 info = iommu_support_dev_iotlb(domain, iommu, bus, devfn);
2058                 if (info && info->ats_supported)
2059                         translation = CONTEXT_TT_DEV_IOTLB;
2060                 else
2061                         translation = CONTEXT_TT_MULTI_LEVEL;
2062
2063                 context_set_address_root(context, virt_to_phys(pgd));
2064                 context_set_address_width(context, iommu->agaw);
2065         } else {
2066                 /*
2067                  * In pass through mode, AW must be programmed to
2068                  * indicate the largest AGAW value supported by
2069                  * hardware. And ASR is ignored by hardware.
2070                  */
2071                 context_set_address_width(context, iommu->msagaw);
2072         }
2073
2074         context_set_translation_type(context, translation);
2075         context_set_fault_enable(context);
2076         context_set_present(context);
2077         domain_flush_cache(domain, context, sizeof(*context));
2078
2079         /*
2080          * It's a non-present to present mapping. If hardware doesn't cache
2081          * non-present entries we only need to flush the write-buffer. If it
2082          * _does_ cache non-present entries, then it does so in the special
2083          * domain #0, which we have to flush:
2084          */
2085         if (cap_caching_mode(iommu->cap)) {
2086                 iommu->flush.flush_context(iommu, 0,
2087                                            (((u16)bus) << 8) | devfn,
2088                                            DMA_CCMD_MASK_NOBIT,
2089                                            DMA_CCMD_DEVICE_INVL);
2090                 iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH);
2091         } else {
2092                 iommu_flush_write_buffer(iommu);
2093         }
2094         iommu_enable_dev_iotlb(info);
2095
2096         ret = 0;
2097
2098 out_unlock:
2099         spin_unlock(&iommu->lock);
2100         spin_unlock_irqrestore(&device_domain_lock, flags);
2101
2102         return ret;
2103 }
2104
2105 struct domain_context_mapping_data {
2106         struct dmar_domain *domain;
2107         struct intel_iommu *iommu;
2108 };
2109
2110 static int domain_context_mapping_cb(struct pci_dev *pdev,
2111                                      u16 alias, void *opaque)
2112 {
2113         struct domain_context_mapping_data *data = opaque;
2114
2115         return domain_context_mapping_one(data->domain, data->iommu,
2116                                           PCI_BUS_NUM(alias), alias & 0xff);
2117 }
2118
2119 static int
2120 domain_context_mapping(struct dmar_domain *domain, struct device *dev)
2121 {
2122         struct intel_iommu *iommu;
2123         u8 bus, devfn;
2124         struct domain_context_mapping_data data;
2125
2126         iommu = device_to_iommu(dev, &bus, &devfn);
2127         if (!iommu)
2128                 return -ENODEV;
2129
2130         if (!dev_is_pci(dev))
2131                 return domain_context_mapping_one(domain, iommu, bus, devfn);
2132
2133         data.domain = domain;
2134         data.iommu = iommu;
2135
2136         return pci_for_each_dma_alias(to_pci_dev(dev),
2137                                       &domain_context_mapping_cb, &data);
2138 }
2139
2140 static int domain_context_mapped_cb(struct pci_dev *pdev,
2141                                     u16 alias, void *opaque)
2142 {
2143         struct intel_iommu *iommu = opaque;
2144
2145         return !device_context_mapped(iommu, PCI_BUS_NUM(alias), alias & 0xff);
2146 }
2147
2148 static int domain_context_mapped(struct device *dev)
2149 {
2150         struct intel_iommu *iommu;
2151         u8 bus, devfn;
2152
2153         iommu = device_to_iommu(dev, &bus, &devfn);
2154         if (!iommu)
2155                 return -ENODEV;
2156
2157         if (!dev_is_pci(dev))
2158                 return device_context_mapped(iommu, bus, devfn);
2159
2160         return !pci_for_each_dma_alias(to_pci_dev(dev),
2161                                        domain_context_mapped_cb, iommu);
2162 }
2163
2164 /* Returns a number of VTD pages, but aligned to MM page size */
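/* E.g. a host offset of 0x100 with size 0x1000 on 4KiB pages spans
 * PAGE_ALIGN(0x1100) = 0x2000 bytes, i.e. two VT-d pages. */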
2165 static inline unsigned long aligned_nrpages(unsigned long host_addr,
2166                                             size_t size)
2167 {
2168         host_addr &= ~PAGE_MASK;
2169         return PAGE_ALIGN(host_addr + size) >> VTD_PAGE_SHIFT;
2170 }
2171
2172 /* Return largest possible superpage level for a given mapping */
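/* E.g. with both PFNs 2MiB-aligned (low 9 bits clear), at least 512 pages
 * to map and hardware superpage support, this returns level 2 (a 2MiB
 * superpage); otherwise it falls back to level 1 (4KiB pages). */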
2173 static inline int hardware_largepage_caps(struct dmar_domain *domain,
2174                                           unsigned long iov_pfn,
2175                                           unsigned long phy_pfn,
2176                                           unsigned long pages)
2177 {
2178         int support, level = 1;
2179         unsigned long pfnmerge;
2180
2181         support = domain->iommu_superpage;
2182
2183         /* To use a large page, the virtual *and* physical addresses
2184            must be aligned to 2MiB/1GiB/etc. Lower bits set in either
2185            of them will mean we have to use smaller pages. So just
2186            merge them and check both at once. */
2187         pfnmerge = iov_pfn | phy_pfn;
2188
2189         while (support && !(pfnmerge & ~VTD_STRIDE_MASK)) {
2190                 pages >>= VTD_STRIDE_SHIFT;
2191                 if (!pages)
2192                         break;
2193                 pfnmerge >>= VTD_STRIDE_SHIFT;
2194                 level++;
2195                 support--;
2196         }
2197         return level;
2198 }
2199
2200 static int __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2201                             struct scatterlist *sg, unsigned long phys_pfn,
2202                             unsigned long nr_pages, int prot)
2203 {
2204         struct dma_pte *first_pte = NULL, *pte = NULL;
2205         phys_addr_t uninitialized_var(pteval);
2206         unsigned long sg_res = 0;
2207         unsigned int largepage_lvl = 0;
2208         unsigned long lvl_pages = 0;
2209
2210         BUG_ON(!domain_pfn_supported(domain, iov_pfn + nr_pages - 1));
2211
2212         if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
2213                 return -EINVAL;
2214
2215         prot &= DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP;
2216
2217         if (!sg) {
2218                 sg_res = nr_pages;
2219                 pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | prot;
2220         }
2221
2222         while (nr_pages > 0) {
2223                 uint64_t tmp;
2224
2225                 if (!sg_res) {
2226                         unsigned int pgoff = sg->offset & ~PAGE_MASK;
2227
2228                         sg_res = aligned_nrpages(sg->offset, sg->length);
2229                         sg->dma_address = ((dma_addr_t)iov_pfn << VTD_PAGE_SHIFT) + pgoff;
2230                         sg->dma_length = sg->length;
2231                         pteval = (sg_phys(sg) - pgoff) | prot;
2232                         phys_pfn = pteval >> VTD_PAGE_SHIFT;
2233                 }
2234
2235                 if (!pte) {
2236                         largepage_lvl = hardware_largepage_caps(domain, iov_pfn, phys_pfn, sg_res);
2237
2238                         first_pte = pte = pfn_to_dma_pte(domain, iov_pfn, &largepage_lvl);
2239                         if (!pte)
2240                                 return -ENOMEM;
2241                         /* It is a large page */
2242                         if (largepage_lvl > 1) {
2243                                 unsigned long nr_superpages, end_pfn;
2244
2245                                 pteval |= DMA_PTE_LARGE_PAGE;
2246                                 lvl_pages = lvl_to_nr_pages(largepage_lvl);
2247
2248                                 nr_superpages = sg_res / lvl_pages;
2249                                 end_pfn = iov_pfn + nr_superpages * lvl_pages - 1;
2250
2251                                 /*
2252                                  * Ensure that old small page tables are
2253                                  * removed to make room for superpage(s).
2254                                  * We're adding new large pages, so make sure
2255                                  * we don't remove their parent tables.
2256                                  */
2257                                 dma_pte_free_pagetable(domain, iov_pfn, end_pfn,
2258                                                        largepage_lvl + 1);
2259                         } else {
2260                                 pteval &= ~(uint64_t)DMA_PTE_LARGE_PAGE;
2261                         }
2262
2263                 }
2264                 /* We don't need a lock here; nobody else
2265                  * touches the iova range
2266                  */
2267                 tmp = cmpxchg64_local(&pte->val, 0ULL, pteval);
2268                 if (tmp) {
2269                         static int dumps = 5;
2270                         pr_crit("ERROR: DMA PTE for vPFN 0x%lx already set (to %llx not %llx)\n",
2271                                 iov_pfn, tmp, (unsigned long long)pteval);
2272                         if (dumps) {
2273                                 dumps--;
2274                                 debug_dma_dump_mappings(NULL);
2275                         }
2276                         WARN_ON(1);
2277                 }
2278
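                /*
                 * lvl_to_nr_pages(level) is how many 4KiB VT-d pages one PTE
                 * covers at that level: 1 at level 1, 512 at level 2 (2MiB),
                 * 512 * 512 at level 3 (1GiB).
                 */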
2279                 lvl_pages = lvl_to_nr_pages(largepage_lvl);
2280
2281                 BUG_ON(nr_pages < lvl_pages);
2282                 BUG_ON(sg_res < lvl_pages);
2283
2284                 nr_pages -= lvl_pages;
2285                 iov_pfn += lvl_pages;
2286                 phys_pfn += lvl_pages;
2287                 pteval += lvl_pages * VTD_PAGE_SIZE;
2288                 sg_res -= lvl_pages;
2289
2290                 /* If the next PTE would be the first in a new page, then we
2291                    need to flush the cache on the entries we've just written.
2292                    And then we'll need to recalculate 'pte', so clear it and
2293                    let it get set again in the if (!pte) block above.
2294
2295                    If we're done (!nr_pages) we need to flush the cache too.
2296
2297                    Also if we've been setting superpages, we may need to
2298                    recalculate 'pte' and switch back to smaller pages for the
2299                    end of the mapping, if the trailing size is not enough to
2300                    use another superpage (i.e. sg_res < lvl_pages). */
2301                 pte++;
2302                 if (!nr_pages || first_pte_in_page(pte) ||
2303                     (largepage_lvl > 1 && sg_res < lvl_pages)) {
2304                         domain_flush_cache(domain, first_pte,
2305                                            (void *)pte - (void *)first_pte);
2306                         pte = NULL;
2307                 }
2308
2309                 if (!sg_res && nr_pages)
2310                         sg = sg_next(sg);
2311         }
2312         return 0;
2313 }
2314
2315 static int domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2316                          struct scatterlist *sg, unsigned long phys_pfn,
2317                          unsigned long nr_pages, int prot)
2318 {
2319        int ret;
2320        struct intel_iommu *iommu;
2321
2322        /* Do the real mapping first */
2323        ret = __domain_mapping(domain, iov_pfn, sg, phys_pfn, nr_pages, prot);
2324        if (ret)
2325                return ret;
2326
2327        /* Notify about the new mapping */
2328        if (domain_type_is_vm(domain)) {
2329                /* VM-typed domains can have more than one IOMMU */
2330                int iommu_id;
2331                for_each_domain_iommu(iommu_id, domain) {
2332                        iommu = g_iommus[iommu_id];
2333                        __mapping_notify_one(iommu, domain, iov_pfn, nr_pages);
2334                }
2335        } else {
2336                /* General domains only have one IOMMU */
2337                iommu = domain_get_iommu(domain);
2338                __mapping_notify_one(iommu, domain, iov_pfn, nr_pages);
2339        }
2340
2341        return 0;
2342 }
2343
2344 static inline int domain_sg_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2345                                     struct scatterlist *sg, unsigned long nr_pages,
2346                                     int prot)
2347 {
2348         return domain_mapping(domain, iov_pfn, sg, 0, nr_pages, prot);
2349 }
2350
2351 static inline int domain_pfn_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2352                                      unsigned long phys_pfn, unsigned long nr_pages,
2353                                      int prot)
2354 {
2355         return domain_mapping(domain, iov_pfn, NULL, phys_pfn, nr_pages, prot);
2356 }
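/*
 * A minimal usage sketch (illustrative only, not part of the driver): to
 * map one 4KiB page read/write at IOVA iova for physical address phys,
 * assuming both are page aligned and the locking rules above are respected:
 *
 *      domain_pfn_mapping(domain, iova >> VTD_PAGE_SHIFT,
 *                         phys >> VTD_PAGE_SHIFT, 1,
 *                         DMA_PTE_READ | DMA_PTE_WRITE);
 */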
2357
2358 static void domain_context_clear_one(struct intel_iommu *iommu, u8 bus, u8 devfn)
2359 {
2360         unsigned long flags;
2361         struct context_entry *context;
2362         u16 did_old;
2363
2364         if (!iommu)
2365                 return;
2366
2367         spin_lock_irqsave(&iommu->lock, flags);
2368         context = iommu_context_addr(iommu, bus, devfn, 0);
2369         if (!context) {
2370                 spin_unlock_irqrestore(&iommu->lock, flags);
2371                 return;
2372         }
2373         did_old = context_domain_id(context);
2374         context_clear_entry(context);
2375         __iommu_flush_cache(iommu, context, sizeof(*context));
2376         spin_unlock_irqrestore(&iommu->lock, flags);
2377         iommu->flush.flush_context(iommu,
2378                                    did_old,
2379                                    (((u16)bus) << 8) | devfn,
2380                                    DMA_CCMD_MASK_NOBIT,
2381                                    DMA_CCMD_DEVICE_INVL);
2382         iommu->flush.flush_iotlb(iommu,
2383                                  did_old,
2384                                  0,
2385                                  0,
2386                                  DMA_TLB_DSI_FLUSH);
2387 }
2388
2389 static inline void unlink_domain_info(struct device_domain_info *info)
2390 {
2391         assert_spin_locked(&device_domain_lock);
2392         list_del(&info->link);
2393         list_del(&info->global);
2394         if (info->dev)
2395                 info->dev->archdata.iommu = NULL;
2396 }
2397
2398 static void domain_remove_dev_info(struct dmar_domain *domain)
2399 {
2400         struct device_domain_info *info, *tmp;
2401         unsigned long flags;
2402
2403         spin_lock_irqsave(&device_domain_lock, flags);
2404         list_for_each_entry_safe(info, tmp, &domain->devices, link)
2405                 __dmar_remove_one_dev_info(info);
2406         spin_unlock_irqrestore(&device_domain_lock, flags);
2407 }
2408
2409 /*
2410  * find_domain
2411  * Note: we use struct device->archdata.iommu to store the info
2412  */
2413 static struct dmar_domain *find_domain(struct device *dev)
2414 {
2415         struct device_domain_info *info;
2416
2417         /* No lock here, assumes no domain exit in normal case */
2418         info = dev->archdata.iommu;
2419         if (likely(info))
2420                 return info->domain;
2421         return NULL;
2422 }
2423
2424 static inline struct device_domain_info *
2425 dmar_search_domain_by_dev_info(int segment, int bus, int devfn)
2426 {
2427         struct device_domain_info *info;
2428
2429         list_for_each_entry(info, &device_domain_list, global)
2430                 if (info->iommu->segment == segment && info->bus == bus &&
2431                     info->devfn == devfn)
2432                         return info;
2433
2434         return NULL;
2435 }
2436
2437 static struct dmar_domain *dmar_insert_one_dev_info(struct intel_iommu *iommu,
2438                                                     int bus, int devfn,
2439                                                     struct device *dev,
2440                                                     struct dmar_domain *domain)
2441 {
2442         struct dmar_domain *found = NULL;
2443         struct device_domain_info *info;
2444         unsigned long flags;
2445         int ret;
2446
2447         info = alloc_devinfo_mem();
2448         if (!info)
2449                 return NULL;
2450
2451         info->bus = bus;
2452         info->devfn = devfn;
2453         info->ats_supported = info->pasid_supported = info->pri_supported = 0;
2454         info->ats_enabled = info->pasid_enabled = info->pri_enabled = 0;
2455         info->ats_qdep = 0;
2456         info->dev = dev;
2457         info->domain = domain;
2458         info->iommu = iommu;
2459         info->pasid_table = NULL;
2460
2461         if (dev && dev_is_pci(dev)) {
2462                 struct pci_dev *pdev = to_pci_dev(info->dev);
2463
2464                 if (!pci_ats_disabled() &&
2465                     ecap_dev_iotlb_support(iommu->ecap) &&
2466                     pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_ATS) &&
2467                     dmar_find_matched_atsr_unit(pdev))
2468                         info->ats_supported = 1;
2469
2470                 if (ecs_enabled(iommu)) {
2471                         if (pasid_enabled(iommu)) {
2472                                 int features = pci_pasid_features(pdev);
2473                                 if (features >= 0)
2474                                         info->pasid_supported = features | 1;
2475                         }
2476
2477                         if (info->ats_supported && ecap_prs(iommu->ecap) &&
2478                             pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_PRI))
2479                                 info->pri_supported = 1;
2480                 }
2481         }
2482
2483         spin_lock_irqsave(&device_domain_lock, flags);
2484         if (dev)
2485                 found = find_domain(dev);
2486
2487         if (!found) {
2488                 struct device_domain_info *info2;
2489                 info2 = dmar_search_domain_by_dev_info(iommu->segment, bus, devfn);
2490                 if (info2) {
2491                         found      = info2->domain;
2492                         info2->dev = dev;
2493                 }
2494         }
2495
2496         if (found) {
2497                 spin_unlock_irqrestore(&device_domain_lock, flags);
2498                 free_devinfo_mem(info);
2499                 /* Caller must free the original domain */
2500                 return found;
2501         }
2502
2503         spin_lock(&iommu->lock);
2504         ret = domain_attach_iommu(domain, iommu);
2505         spin_unlock(&iommu->lock);
2506
2507         if (ret) {
2508                 spin_unlock_irqrestore(&device_domain_lock, flags);
2509                 free_devinfo_mem(info);
2510                 return NULL;
2511         }
2512
2513         list_add(&info->link, &domain->devices);
2514         list_add(&info->global, &device_domain_list);
2515         if (dev)
2516                 dev->archdata.iommu = info;
2517
2518         if (dev && dev_is_pci(dev) && info->pasid_supported) {
2519                 ret = intel_pasid_alloc_table(dev);
2520                 if (ret) {
2521                         pr_warn("No pasid table for %s, pasid disabled\n",
2522                                 dev_name(dev));
2523                         info->pasid_supported = 0;
2524                 }
2525         }
2526         spin_unlock_irqrestore(&device_domain_lock, flags);
2527
2528         if (dev && domain_context_mapping(domain, dev)) {
2529                 pr_err("Domain context map for %s failed\n", dev_name(dev));
2530                 dmar_remove_one_dev_info(domain, dev);
2531                 return NULL;
2532         }
2533
2534         return domain;
2535 }
2536
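/*
 * pci_for_each_dma_alias() invokes its callback for the device itself and
 * for every requester ID a bridge may use on its behalf; since the callback
 * below simply overwrites *opaque each time, the caller ends up with the
 * last-reported alias, i.e. the ID the IOMMU actually sees for DMA issued
 * from behind legacy bridges.
 */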
2537 static int get_last_alias(struct pci_dev *pdev, u16 alias, void *opaque)
2538 {
2539         *(u16 *)opaque = alias;
2540         return 0;
2541 }
2542
2543 static struct dmar_domain *find_or_alloc_domain(struct device *dev, int gaw)
2544 {
2545         struct device_domain_info *info = NULL;
2546         struct dmar_domain *domain = NULL;
2547         struct intel_iommu *iommu;
2548         u16 dma_alias;
2549         unsigned long flags;
2550         u8 bus, devfn;
2551
2552         iommu = device_to_iommu(dev, &bus, &devfn);
2553         if (!iommu)
2554                 return NULL;
2555
2556         if (dev_is_pci(dev)) {
2557                 struct pci_dev *pdev = to_pci_dev(dev);
2558
2559                 pci_for_each_dma_alias(pdev, get_last_alias, &dma_alias);
2560
2561                 spin_lock_irqsave(&device_domain_lock, flags);
2562                 info = dmar_search_domain_by_dev_info(pci_domain_nr(pdev->bus),
2563                                                       PCI_BUS_NUM(dma_alias),
2564                                                       dma_alias & 0xff);
2565                 if (info) {
2566                         iommu = info->iommu;
2567                         domain = info->domain;
2568                 }
2569                 spin_unlock_irqrestore(&device_domain_lock, flags);
2570
2571                 /* DMA alias already has a domain, use it */
2572                 if (info)
2573                         goto out;
2574         }
2575
2576         /* Allocate and initialize new domain for the device */
2577         domain = alloc_domain(0);
2578         if (!domain)
2579                 return NULL;
2580         if (domain_init(domain, iommu, gaw)) {
2581                 domain_exit(domain);
2582                 return NULL;
2583         }
2584
2585 out:
2586
2587         return domain;
2588 }
2589
2590 static struct dmar_domain *set_domain_for_dev(struct device *dev,
2591                                               struct dmar_domain *domain)
2592 {
2593         struct intel_iommu *iommu;
2594         struct dmar_domain *tmp;
2595         u16 req_id, dma_alias;
2596         u8 bus, devfn;
2597
2598         iommu = device_to_iommu(dev, &bus, &devfn);
2599         if (!iommu)
2600                 return NULL;
2601
2602         req_id = ((u16)bus << 8) | devfn;
2603
2604         if (dev_is_pci(dev)) {
2605                 struct pci_dev *pdev = to_pci_dev(dev);
2606
2607                 pci_for_each_dma_alias(pdev, get_last_alias, &dma_alias);
2608
2609                 /* register PCI DMA alias device */
2610                 if (req_id != dma_alias) {
2611                         tmp = dmar_insert_one_dev_info(iommu, PCI_BUS_NUM(dma_alias),
2612                                         dma_alias & 0xff, NULL, domain);
2613
2614                         if (!tmp || tmp != domain)
2615                                 return tmp;
2616                 }
2617         }
2618
2619         tmp = dmar_insert_one_dev_info(iommu, bus, devfn, dev, domain);
2620         if (!tmp || tmp != domain)
2621                 return tmp;
2622
2623         return domain;
2624 }
2625
2626 static struct dmar_domain *get_domain_for_dev(struct device *dev, int gaw)
2627 {
2628         struct dmar_domain *domain, *tmp;
2629
2630         domain = find_domain(dev);
2631         if (domain)
2632                 goto out;
2633
2634         domain = find_or_alloc_domain(dev, gaw);
2635         if (!domain)
2636                 goto out;
2637
2638         tmp = set_domain_for_dev(dev, domain);
2639         if (!tmp || domain != tmp) {
2640                 domain_exit(domain);
2641                 domain = tmp;
2642         }
2643
2644 out:
2645
2646         return domain;
2647 }
2648
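/*
 * Identity-map [start, end] so that IOVA == physical address: e.g.
 * start = 0x100000 and end = 0x1fffff reserves and maps VT-d PFNs
 * 0x100-0x1ff onto the same physical PFNs, read/write.
 */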
2649 static int iommu_domain_identity_map(struct dmar_domain *domain,
2650                                      unsigned long long start,
2651                                      unsigned long long end)
2652 {
2653         unsigned long first_vpfn = start >> VTD_PAGE_SHIFT;
2654         unsigned long last_vpfn = end >> VTD_PAGE_SHIFT;
2655
2656         if (!reserve_iova(&domain->iovad, dma_to_mm_pfn(first_vpfn),
2657                           dma_to_mm_pfn(last_vpfn))) {
2658                 pr_err("Reserving iova failed\n");
2659                 return -ENOMEM;
2660         }
2661
2662         pr_debug("Mapping reserved region %llx-%llx\n", start, end);
2663         /*
2664          * RMRR range might have overlap with physical memory range,
2665          * clear it first
2666          */
2667         dma_pte_clear_range(domain, first_vpfn, last_vpfn);
2668
2669         return __domain_mapping(domain, first_vpfn, NULL,
2670                                 first_vpfn, last_vpfn - first_vpfn + 1,
2671                                 DMA_PTE_READ|DMA_PTE_WRITE);
2672 }
2673
2674 static int domain_prepare_identity_map(struct device *dev,
2675                                        struct dmar_domain *domain,
2676                                        unsigned long long start,
2677                                        unsigned long long end)
2678 {
2679         /* For _hardware_ passthrough, don't bother. But for software
2680            passthrough, we do it anyway -- it may indicate a memory
2681            range which is reserved in E820 and so didn't get set
2682            up to start with in si_domain */
2683         if (domain == si_domain && hw_pass_through) {
2684                 pr_warn("Ignoring identity map for HW passthrough device %s [0x%Lx - 0x%Lx]\n",
2685                         dev_name(dev), start, end);
2686                 return 0;
2687         }
2688
2689         pr_info("Setting identity map for device %s [0x%Lx - 0x%Lx]\n",
2690                 dev_name(dev), start, end);
2691
2692         if (end < start) {
2693                 WARN(1, "Your BIOS is broken; RMRR ends before it starts!\n"
2694                         "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
2695                         dmi_get_system_info(DMI_BIOS_VENDOR),
2696                         dmi_get_system_info(DMI_BIOS_VERSION),
2697                         dmi_get_system_info(DMI_PRODUCT_VERSION));
2698                 return -EIO;
2699         }
2700
2701         if (end >> agaw_to_width(domain->agaw)) {
2702                 WARN(1, "Your BIOS is broken; RMRR exceeds permitted address width (%d bits)\n"
2703                      "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
2704                      agaw_to_width(domain->agaw),
2705                      dmi_get_system_info(DMI_BIOS_VENDOR),
2706                      dmi_get_system_info(DMI_BIOS_VERSION),
2707                      dmi_get_system_info(DMI_PRODUCT_VERSION));
2708                 return -EIO;
2709         }
2710
2711         return iommu_domain_identity_map(domain, start, end);
2712 }
2713
2714 static int iommu_prepare_identity_map(struct device *dev,
2715                                       unsigned long long start,
2716                                       unsigned long long end)
2717 {
2718         struct dmar_domain *domain;
2719         int ret;
2720
2721         domain = get_domain_for_dev(dev, DEFAULT_DOMAIN_ADDRESS_WIDTH);
2722         if (!domain)
2723                 return -ENOMEM;
2724
2725         ret = domain_prepare_identity_map(dev, domain, start, end);
2726         if (ret)
2727                 domain_exit(domain);
2728
2729         return ret;
2730 }
2731
2732 static inline int iommu_prepare_rmrr_dev(struct dmar_rmrr_unit *rmrr,
2733                                          struct device *dev)
2734 {
2735         if (dev->archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
2736                 return 0;
2737         return iommu_prepare_identity_map(dev, rmrr->base_address,
2738                                           rmrr->end_address);
2739 }
2740
2741 #ifdef CONFIG_INTEL_IOMMU_FLOPPY_WA
2742 static inline void iommu_prepare_isa(void)
2743 {
2744         struct pci_dev *pdev;
2745         int ret;
2746
2747         pdev = pci_get_class(PCI_CLASS_BRIDGE_ISA << 8, NULL);
2748         if (!pdev)
2749                 return;
2750
2751         pr_info("Prepare 0-16MiB unity mapping for LPC\n");
2752         ret = iommu_prepare_identity_map(&pdev->dev, 0, 16*1024*1024 - 1);
2753
2754         if (ret)
2755                 pr_err("Failed to create 0-16MiB identity map - floppy might not work\n");
2756
2757         pci_dev_put(pdev);
2758 }
2759 #else
2760 static inline void iommu_prepare_isa(void)
2761 {
2762         return;
2763 }
2764 #endif /* !CONFIG_INTEL_IOMMU_FLOPPY_WA */
2765
2766 static int md_domain_init(struct dmar_domain *domain, int guest_width);
2767
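/*
 * Build the static identity (si) domain. With hardware passthrough the
 * page tables are never walked, so only the domain itself is set up;
 * otherwise every usable memory range of every online node is mapped 1:1
 * into it.
 */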
2768 static int __init si_domain_init(int hw)
2769 {
2770         int nid, ret = 0;
2771
2772         si_domain = alloc_domain(DOMAIN_FLAG_STATIC_IDENTITY);
2773         if (!si_domain)
2774                 return -EFAULT;
2775
2776         if (md_domain_init(si_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
2777                 domain_exit(si_domain);
2778                 return -EFAULT;
2779         }
2780
2781         pr_debug("Identity mapping domain allocated\n");
2782
2783         if (hw)
2784                 return 0;
2785
2786         for_each_online_node(nid) {
2787                 unsigned long start_pfn, end_pfn;
2788                 int i;
2789
2790                 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
2791                         ret = iommu_domain_identity_map(si_domain,
2792                                         PFN_PHYS(start_pfn), PFN_PHYS(end_pfn));
2793                         if (ret)
2794                                 return ret;
2795                 }
2796         }
2797
2798         return 0;
2799 }
2800
2801 static int identity_mapping(struct device *dev)
2802 {
2803         struct device_domain_info *info;
2804
2805         if (likely(!iommu_identity_mapping))
2806                 return 0;
2807
2808         info = dev->archdata.iommu;
2809         if (info && info != DUMMY_DEVICE_DOMAIN_INFO)
2810                 return (info->domain == si_domain);
2811
2812         return 0;
2813 }
2814
2815 static int domain_add_dev_info(struct dmar_domain *domain, struct device *dev)
2816 {
2817         struct dmar_domain *ndomain;
2818         struct intel_iommu *iommu;
2819         u8 bus, devfn;
2820
2821         iommu = device_to_iommu(dev, &bus, &devfn);
2822         if (!iommu)
2823                 return -ENODEV;
2824
2825         ndomain = dmar_insert_one_dev_info(iommu, bus, devfn, dev, domain);
2826         if (ndomain != domain)
2827                 return -EBUSY;
2828
2829         return 0;
2830 }
2831
2832 static bool device_has_rmrr(struct device *dev)
2833 {
2834         struct dmar_rmrr_unit *rmrr;
2835         struct device *tmp;
2836         int i;
2837
2838         rcu_read_lock();
2839         for_each_rmrr_units(rmrr) {
2840                 /*
2841                  * Return TRUE if this RMRR contains the device that
2842                  * is passed in.
2843                  */
2844                 for_each_active_dev_scope(rmrr->devices,
2845                                           rmrr->devices_cnt, i, tmp)
2846                         if (tmp == dev) {
2847                                 rcu_read_unlock();
2848                                 return true;
2849                         }
2850         }
2851         rcu_read_unlock();
2852         return false;
2853 }
2854
2855 /*
2856  * There are a couple cases where we need to restrict the functionality of
2857  * devices associated with RMRRs.  The first is when evaluating a device for
2858  * identity mapping because problems exist when devices are moved in and out
2859  * of domains and their respective RMRR information is lost.  This means that
2860  * a device with associated RMRRs will never be in a "passthrough" domain.
2861  * The second is use of the device through the IOMMU API.  This interface
2862  * expects to have full control of the IOVA space for the device.  We cannot
2863  * satisfy both the requirement that RMRR access is maintained and have an
2864  * unencumbered IOVA space.  We also have no ability to quiesce the device's
2865  * use of the RMRR space or even inform the IOMMU API user of the restriction.
2866  * We therefore prevent devices associated with an RMRR from participating in
2867  * the IOMMU API, which eliminates them from device assignment.
2868  *
2869  * In both cases we assume that PCI USB devices with RMRRs have them largely
2870  * for historical reasons and that the RMRR space is not actively used post
2871  * boot.  This exclusion may change if vendors begin to abuse it.
2872  *
2873  * The same exception is made for graphics devices, with the requirement that
2874  * any use of the RMRR regions will be torn down before assigning the device
2875  * to a guest.
2876  */
2877 static bool device_is_rmrr_locked(struct device *dev)
2878 {
2879         if (!device_has_rmrr(dev))
2880                 return false;
2881
2882         if (dev_is_pci(dev)) {
2883                 struct pci_dev *pdev = to_pci_dev(dev);
2884
2885                 if (IS_USB_DEVICE(pdev) || IS_GFX_DEVICE(pdev))
2886                         return false;
2887         }
2888
2889         return true;
2890 }
2891
2892 static int iommu_should_identity_map(struct device *dev, int startup)
2893 {
2894
2895         if (dev_is_pci(dev)) {
2896                 struct pci_dev *pdev = to_pci_dev(dev);
2897
2898                 if (device_is_rmrr_locked(dev))
2899                         return 0;
2900
2901                 /*
2902                  * Prevent any device marked as untrusted from getting
2903                  * placed into the statically identity mapping domain.
2904                  */
2905                 if (pdev->untrusted)
2906                         return 0;
2907
2908                 if ((iommu_identity_mapping & IDENTMAP_AZALIA) && IS_AZALIA(pdev))
2909                         return 1;
2910
2911                 if ((iommu_identity_mapping & IDENTMAP_GFX) && IS_GFX_DEVICE(pdev))
2912                         return 1;
2913
2914                 if (!(iommu_identity_mapping & IDENTMAP_ALL))
2915                         return 0;
2916
2917                 /*
2918                  * We want to start off with all devices in the 1:1 domain, and
2919                  * take them out later if we find they can't access all of memory.
2920                  *
2921                  * However, we can't do this for PCI devices behind bridges,
2922                  * because all PCI devices behind the same bridge will end up
2923                  * with the same source-id on their transactions.
2924                  *
2925                  * Practically speaking, we can't change things around for these
2926                  * devices at run-time, because we can't be sure there'll be no
2927                  * DMA transactions in flight for any of their siblings.
2928                  *
2929                  * So PCI devices (unless they're on the root bus) as well as
2930                  * their parent PCI-PCI or PCIe-PCI bridges must be left _out_ of
2931                  * the 1:1 domain, just in _case_ one of their siblings turns out
2932                  * not to be able to map all of memory.
2933                  */
2934                 if (!pci_is_pcie(pdev)) {
2935                         if (!pci_is_root_bus(pdev->bus))
2936                                 return 0;
2937                         if (pdev->class >> 8 == PCI_CLASS_BRIDGE_PCI)
2938                                 return 0;
2939                 } else if (pci_pcie_type(pdev) == PCI_EXP_TYPE_PCI_BRIDGE)
2940                         return 0;
2941         } else {
2942                 if (device_has_rmrr(dev))
2943                         return 0;
2944         }
2945
2946         /*
2947          * At boot time, we don't yet know if devices will be 64-bit capable.
2948          * Assume that they will — if they turn out not to be, then we can
2949          * take them out of the 1:1 domain later.
2950          */
2951         if (!startup) {
2952                 /*
2953                  * If the device's dma_mask is less than the system's memory
2954                  * size then this is not a candidate for identity mapping.
2955                  */
2956                 u64 dma_mask = *dev->dma_mask;
2957
2958                 if (dev->coherent_dma_mask &&
2959                     dev->coherent_dma_mask < dma_mask)
2960                         dma_mask = dev->coherent_dma_mask;
2961
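                /*
                 * E.g. a 32-bit-only device (dma_mask = DMA_BIT_MASK(32)) on
                 * a machine with more than 4GiB of RAM fails this check and
                 * stays out of the identity domain, since a 1:1 mapping could
                 * not reach all of memory.
                 */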
2962                 return dma_mask >= dma_get_required_mask(dev);
2963         }
2964
2965         return 1;
2966 }
2967
2968 static int __init dev_prepare_static_identity_mapping(struct device *dev, int hw)
2969 {
2970         int ret;
2971
2972         if (!iommu_should_identity_map(dev, 1))
2973                 return 0;
2974
2975         ret = domain_add_dev_info(si_domain, dev);
2976         if (!ret)
2977                 pr_info("%s identity mapping for device %s\n",
2978                         hw ? "Hardware" : "Software", dev_name(dev));
2979         else if (ret == -ENODEV)
2980                 /* device not associated with an iommu */
2981                 ret = 0;
2982
2983         return ret;
2984 }
2985
2986
2987 static int __init iommu_prepare_static_identity_mapping(int hw)
2988 {
2989         struct pci_dev *pdev = NULL;
2990         struct dmar_drhd_unit *drhd;
2991         struct intel_iommu *iommu;
2992         struct device *dev;
2993         int i;
2994         int ret = 0;
2995
2996         for_each_pci_dev(pdev) {
2997                 ret = dev_prepare_static_identity_mapping(&pdev->dev, hw);
2998                 if (ret)
2999                         return ret;
3000         }
3001
3002         for_each_active_iommu(iommu, drhd)
3003                 for_each_active_dev_scope(drhd->devices, drhd->devices_cnt, i, dev) {
3004                         struct acpi_device_physical_node *pn;
3005                         struct acpi_device *adev;
3006
3007                         if (dev->bus != &acpi_bus_type)
3008                                 continue;
3009
3010                         adev = to_acpi_device(dev);
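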
3011                         mutex_lock(&adev->physical_node_lock);
3012                         list_for_each_entry(pn, &adev->physical_node_list, node) {
3013                                 ret = dev_prepare_static_identity_mapping(pn->dev, hw);
3014                                 if (ret)
3015                                         break;
3016                         }
3017                         mutex_unlock(&adev->physical_node_lock);
3018                         if (ret)
3019                                 return ret;
3020                 }
3021
3022         return 0;
3023 }
3024
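/*
 * intel_iommu_init_qi - bring the invalidation interface up in a known
 * state: prefer Queued Invalidation (QI), and if it cannot be enabled,
 * fall back to register-based invalidation by wiring up the
 * __iommu_flush_* callbacks.
 */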
3025 static void intel_iommu_init_qi(struct intel_iommu *iommu)
3026 {
3027         /*
3028          * Start from a sane IOMMU hardware state.
3029          * If queued invalidation was already initialized by us
3030          * (for example, while enabling interrupt remapping), then
3031          * things are already rolling from a sane state.
3032          */
3033         if (!iommu->qi) {
3034                 /*
3035                  * Clear any previous faults.
3036                  */
3037                 dmar_fault(-1, iommu);
3038                 /*
3039                  * Disable queued invalidation if supported and already enabled
3040                  * before OS handover.
3041                  */
3042                 dmar_disable_qi(iommu);
3043         }
3044
3045         if (dmar_enable_qi(iommu)) {
3046                 /*
3047                  * Queued Invalidate not enabled, use Register Based Invalidate
3048                  */
3049                 iommu->flush.flush_context = __iommu_flush_context;
3050                 iommu->flush.flush_iotlb = __iommu_flush_iotlb;
3051                 pr_info("%s: Using Register based invalidation\n",
3052                         iommu->name);
3053         } else {
3054                 iommu->flush.flush_context = qi_flush_context;
3055                 iommu->flush.flush_iotlb = qi_flush_iotlb;
3056                 pr_info("%s: Using Queued invalidation\n", iommu->name);
3057         }
3058 }
3059
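/*
 * copy_context_table - copy one bus worth of context entries from the
 * previous kernel (kdump case) into freshly allocated context tables.
 * In the extended root-entry format a bus owns two context tables
 * (devfn < 0x80 via the lower pointer, devfn >= 0x80 via the upper one),
 * so @tbl is indexed with bus * 2 in that case.
 */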
3060 static int copy_context_table(struct intel_iommu *iommu,
3061                               struct root_entry *old_re,
3062                               struct context_entry **tbl,
3063                               int bus, bool ext)
3064 {
3065         int tbl_idx, pos = 0, idx, devfn, ret = 0, did;
3066         struct context_entry *new_ce = NULL, ce;
3067         struct context_entry *old_ce = NULL;
3068         struct root_entry re;
3069         phys_addr_t old_ce_phys;
3070
3071         tbl_idx = ext ? bus * 2 : bus;
3072         memcpy(&re, old_re, sizeof(re));
3073
3074         for (devfn = 0; devfn < 256; devfn++) {
3075                 /* First calculate the correct index */
3076                 idx = (ext ? devfn * 2 : devfn) % 256;
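                /*
                 * Extended context entries are twice as large, so a single
                 * 4K table only holds 128 of them; the index above wraps at
                 * devfn 0x80, which is handled by the idx == 0 case below.
                 */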
3077
3078                 if (idx == 0) {
3079                         /* First save what we may have and clean up */
3080                         if (new_ce) {
3081                                 tbl[tbl_idx] = new_ce;
3082                                 __iommu_flush_cache(iommu, new_ce,
3083                                                     VTD_PAGE_SIZE);
3084                                 pos = 1;
3085                         }
3086
3087                         if (old_ce)
3088                                 memunmap(old_ce);
3089
3090                         ret = 0;
3091                         if (devfn < 0x80)
3092                                 old_ce_phys = root_entry_lctp(&re);
3093                         else
3094                                 old_ce_phys = root_entry_uctp(&re);
3095
3096                         if (!old_ce_phys) {
3097                                 if (ext && devfn == 0) {
3098                                         /* No LCTP, try UCTP */
3099                                         devfn = 0x7f;
3100                                         continue;
3101                                 } else {
3102                                         goto out;
3103                                 }
3104                         }
3105
3106                         ret = -ENOMEM;
3107                         old_ce = memremap(old_ce_phys, PAGE_SIZE,
3108                                         MEMREMAP_WB);
3109                         if (!old_ce)
3110                                 goto out;
3111
3112                         new_ce = alloc_pgtable_page(iommu->node);
3113                         if (!new_ce)
3114                                 goto out_unmap;
3115
3116                         ret = 0;
3117                 }
3118
3119                 /* Now copy the context entry */
3120                 memcpy(&ce, old_ce + idx, sizeof(ce));
3121
3122                 if (!__context_present(&ce))
3123                         continue;
3124
3125                 did = context_domain_id(&ce);
3126                 if (did >= 0 && did < cap_ndoms(iommu->cap))
3127                         set_bit(did, iommu->domain_ids);
3128
3129                 /*
3130                  * We need a marker for copied context entries. This
3131                  * marker needs to work for the old format as well as
3132                  * for extended context entries.
3133                  *
3134                  * Bit 67 of the context entry is used. In the old
3135                  * format this bit is available to software, in the
3136                  * extended format it is the PGE bit, but PGE is ignored
3137                  * by HW if PASIDs are disabled (and thus still
3138                  * available).
3139                  *
3140                  * So disable PASIDs first and then mark the entry
3141                  * copied. This means that we don't copy PASID
3142                  * translations from the old kernel, but this is fine as
3143                  * faults there are not fatal.
3144                  */
3145                 context_clear_pasid_enable(&ce);
3146                 context_set_copied(&ce);
3147
3148                 new_ce[idx] = ce;
3149         }
3150
3151         tbl[tbl_idx + pos] = new_ce;
3152
3153         __iommu_flush_cache(iommu, new_ce, VTD_PAGE_SIZE);
3154
3155 out_unmap:
3156         memunmap(old_ce);
3157
3158 out:
3159         return ret;
3160 }
3161
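/*
 * copy_translation_tables - take over the root/context tables that the
 * previous (crashed) kernel left behind, so DMA still in flight keeps
 * hitting valid translations while the kdump kernel boots. We refuse to
 * copy if the old and new root-table formats (extended vs. legacy)
 * differ, since changing the RTT bit requires disabling translation.
 */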
3162 static int copy_translation_tables(struct intel_iommu *iommu)
3163 {
3164         struct context_entry **ctxt_tbls;
3165         struct root_entry *old_rt;
3166         phys_addr_t old_rt_phys;
3167         int ctxt_table_entries;
3168         unsigned long flags;
3169         u64 rtaddr_reg;
3170         int bus, ret;
3171         bool new_ext, ext;
3172
3173         rtaddr_reg = dmar_readq(iommu->reg + DMAR_RTADDR_REG);
3174         ext        = !!(rtaddr_reg & DMA_RTADDR_RTT);
3175         new_ext    = !!ecap_ecs(iommu->ecap);
3176
3177         /*
3178          * The RTT bit can only be changed when translation is disabled,
3179          * but disabling translation would open a window for data
3180          * corruption. So bail out and don't copy anything if we would
3181          * have to change the bit.
3182          */
3183         if (new_ext != ext)
3184                 return -EINVAL;
3185
3186         old_rt_phys = rtaddr_reg & VTD_PAGE_MASK;
3187         if (!old_rt_phys)
3188                 return -EINVAL;
3189
3190         old_rt = memremap(old_rt_phys, PAGE_SIZE, MEMREMAP_WB);
3191         if (!old_rt)
3192                 return -ENOMEM;
3193
3194         /* This is too big for the stack - allocate it from slab */
3195         ctxt_table_entries = ext ? 512 : 256;
3196         ret = -ENOMEM;
3197         ctxt_tbls = kcalloc(ctxt_table_entries, sizeof(void *), GFP_KERNEL);
3198         if (!ctxt_tbls)
3199                 goto out_unmap;
3200
3201         for (bus = 0; bus < 256; bus++) {
3202                 ret = copy_context_table(iommu, &old_rt[bus],
3203                                          ctxt_tbls, bus, ext);
3204                 if (ret) {
3205                         pr_err("%s: Failed to copy context table for bus %d\n",
3206                                 iommu->name, bus);
3207                         continue;
3208                 }
3209         }
3210
3211         spin_lock_irqsave(&iommu->lock, flags);
3212
3213         /* Context tables are copied, now write them to the root_entry table */
3214         for (bus = 0; bus < 256; bus++) {
3215                 int idx = ext ? bus * 2 : bus;
3216                 u64 val;
3217
3218                 if (ctxt_tbls[idx]) {
3219                         val = virt_to_phys(ctxt_tbls[idx]) | 1;
3220                         iommu->root_entry[bus].lo = val;
3221                 }
3222
3223                 if (!ext || !ctxt_tbls[idx + 1])
3224                         continue;
3225
3226                 val = virt_to_phys(ctxt_tbls[idx + 1]) | 1;
3227                 iommu->root_entry[bus].hi = val;
3228         }
3229
3230         spin_unlock_irqrestore(&iommu->lock, flags);
3231
3232         kfree(ctxt_tbls);
3233
3234         __iommu_flush_cache(iommu, iommu->root_entry, PAGE_SIZE);
3235
3236         ret = 0;
3237
3238 out_unmap:
3239         memunmap(old_rt);
3240
3241         return ret;
3242 }
3243
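/*
 * init_dmars - one-time boot setup of every DMAR unit: allocate the global
 * iommu array, per-IOMMU domain bookkeeping and root entries, optionally
 * copy translation tables from a previous kernel, set up identity, RMRR
 * and ISA mappings, and finally enable fault reporting and translation.
 */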
3244 static int __init init_dmars(void)
3245 {
3246         struct dmar_drhd_unit *drhd;
3247         struct dmar_rmrr_unit *rmrr;
3248         bool copied_tables = false;
3249         struct device *dev;
3250         struct intel_iommu *iommu;
3251         int i, ret;
3252
3253         /*
3254          * for each drhd
3255          *    allocate root
3256          *    initialize and program root entry to not present
3257          * endfor
3258          */
3259         for_each_drhd_unit(drhd) {
3260                 /*
3261                  * No lock needed: this is only incremented in the single-
3262                  * threaded kernel __init code path; all other accesses are
3263                  * read-only.
3264                  */
3265                 if (g_num_of_iommus < DMAR_UNITS_SUPPORTED) {
3266                         g_num_of_iommus++;
3267                         continue;
3268                 }
3269                 pr_err_once("Exceeded %d IOMMUs\n", DMAR_UNITS_SUPPORTED);
3270         }
3271
3272         /* Preallocate enough resources for IOMMU hot-addition */
3273         if (g_num_of_iommus < DMAR_UNITS_SUPPORTED)
3274                 g_num_of_iommus = DMAR_UNITS_SUPPORTED;
3275
3276         g_iommus = kcalloc(g_num_of_iommus, sizeof(struct intel_iommu *),
3277                         GFP_KERNEL);
3278         if (!g_iommus) {
3279                 pr_err("Allocating global iommu array failed\n");
3280                 ret = -ENOMEM;
3281                 goto error;
3282         }
3283
3284         for_each_active_iommu(iommu, drhd) {
3285                 /*
3286                  * Find the max PASID size of all IOMMUs in the system.
3287                  * We need to ensure the system PASID table is no bigger
3288                  * than the smallest supported size.
3289                  */
3290                 if (pasid_enabled(iommu)) {
3291                         u32 temp = 2 << ecap_pss(iommu->ecap);
3292
3293                         intel_pasid_max_id = min_t(u32, temp,
3294                                                    intel_pasid_max_id);
3295                 }
3296
3297                 g_iommus[iommu->seq_id] = iommu;
3298
3299                 intel_iommu_init_qi(iommu);
3300
3301                 ret = iommu_init_domains(iommu);
3302                 if (ret)
3303                         goto free_iommu;
3304
3305                 init_translation_status(iommu);
3306
3307                 if (translation_pre_enabled(iommu) && !is_kdump_kernel()) {
3308                         iommu_disable_translation(iommu);
3309                         clear_translation_pre_enabled(iommu);
3310                         pr_warn("Translation was enabled for %s but we are not in kdump mode\n",
3311                                 iommu->name);
3312                 }
3313
3314                 /*
3315                  * TBD:
3316                  * we could share the same root & context tables
3317                  * among all IOMMUs. Needs to be split out later.
3318                  */
3319                 ret = iommu_alloc_root_entry(iommu);
3320                 if (ret)
3321                         goto free_iommu;
3322
3323                 if (translation_pre_enabled(iommu)) {
3324                         pr_info("Translation already enabled - trying to copy translation structures\n");
3325
3326                         ret = copy_translation_tables(iommu);
3327                         if (ret) {
3328                                 /*
3329                                  * We found the IOMMU with translation
3330                                  * enabled - but failed to copy over the
3331                                  * old root-entry table. Try to proceed
3332                                  * by disabling translation now and
3333                                  * allocating a clean root-entry table.
3334                                  * This might cause DMAR faults, but
3335                                  * probably the dump will still succeed.
3336                                  */
3337                                 pr_err("Failed to copy translation tables from previous kernel for %s\n",
3338                                        iommu->name);
3339                                 iommu_disable_translation(iommu);
3340                                 clear_translation_pre_enabled(iommu);
3341                         } else {
3342                                 pr_info("Copied translation tables from previous kernel for %s\n",
3343                                         iommu->name);
3344                                 copied_tables = true;
3345                         }
3346                 }
3347
3348                 if (!ecap_pass_through(iommu->ecap))
3349                         hw_pass_through = 0;
3350 #ifdef CONFIG_INTEL_IOMMU_SVM
3351                 if (pasid_enabled(iommu))
3352                         intel_svm_init(iommu);
3353 #endif
3354         }
3355
3356         /*
3357          * Now that qi is enabled on all iommus, set the root entry and flush
3358          * caches. This is required on some Intel X58 chipsets, otherwise the
3359          * flush_context function will loop forever and the boot hangs.
3360          */
3361         for_each_active_iommu(iommu, drhd) {
3362                 iommu_flush_write_buffer(iommu);
3363                 iommu_set_root_entry(iommu);
3364                 iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
3365                 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
3366         }
3367
3368         if (iommu_pass_through)
3369                 iommu_identity_mapping |= IDENTMAP_ALL;
3370
3371 #ifdef CONFIG_INTEL_IOMMU_BROKEN_GFX_WA
3372         iommu_identity_mapping |= IDENTMAP_GFX;
3373 #endif
3374
3375         check_tylersburg_isoch();
3376
3377         if (iommu_identity_mapping) {
3378                 ret = si_domain_init(hw_pass_through);
3379                 if (ret)
3380                         goto free_iommu;
3381         }
3382
3383
3384         /*
3385          * If we copied translations from a previous kernel in the kdump
3386          * case, we cannot assign the devices to domains now, as that
3387          * would eliminate the old mappings. So skip this part and defer
3388          * the assignment to device driver initialization time.
3389          */
3390         if (copied_tables)
3391                 goto domains_done;
3392
3393         /*
3394          * If pass-through is not set or not enabled, set up context entries
3395          * for identity mappings for RMRR, GFX and ISA, and fall back to
3396          * static identity mapping if iommu_identity_mapping is set.
3397          */
3398         if (iommu_identity_mapping) {
3399                 ret = iommu_prepare_static_identity_mapping(hw_pass_through);
3400                 if (ret) {
3401                         pr_crit("Failed to setup IOMMU pass-through\n");
3402                         goto free_iommu;
3403                 }
3404         }
3405         /*
3406          * For each rmrr
3407          *   for each dev attached to rmrr
3408          *   do
3409          *     locate drhd for dev, alloc domain for dev
3410          *     allocate free domain
3411          *     allocate page table entries for rmrr
3412          *     if context not allocated for bus
3413          *           allocate and init context
3414          *           set present in root table for this bus
3415          *     init context with domain, translation etc
3416          *    endfor
3417          * endfor
3418          */
3419         pr_info("Setting RMRR:\n");
3420         for_each_rmrr_units(rmrr) {
3421                 /* some BIOSes list non-existent devices in the DMAR table. */
3422                 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
3423                                           i, dev) {
3424                         ret = iommu_prepare_rmrr_dev(rmrr, dev);
3425                         if (ret)
3426                                 pr_err("Mapping reserved region failed\n");
3427                 }
3428         }
3429
3430         iommu_prepare_isa();
3431
3432 domains_done:
3433
3434         /*
3435          * for each drhd
3436          *   enable fault log
3437          *   global invalidate context cache
3438          *   global invalidate iotlb
3439          *   enable translation
3440          */
3441         for_each_iommu(iommu, drhd) {
3442                 if (drhd->ignored) {
3443                         /*
3444                          * we always have to disable PMRs or DMA may fail on
3445                          * this device
3446                          */
3447                         if (force_on)
3448                                 iommu_disable_protect_mem_regions(iommu);
3449                         continue;
3450                 }
3451
3452                 iommu_flush_write_buffer(iommu);
3453
3454 #ifdef CONFIG_INTEL_IOMMU_SVM
3455                 if (pasid_enabled(iommu) && ecap_prs(iommu->ecap)) {
3456                         ret = intel_svm_enable_prq(iommu);
3457                         if (ret)
3458                                 goto free_iommu;
3459                 }
3460 #endif
3461                 ret = dmar_set_interrupt(iommu);
3462                 if (ret)
3463                         goto free_iommu;
3464
3465                 if (!translation_pre_enabled(iommu))
3466                         iommu_enable_translation(iommu);
3467
3468                 iommu_disable_protect_mem_regions(iommu);
3469         }
3470
3471         return 0;
3472
3473 free_iommu:
3474         for_each_active_iommu(iommu, drhd) {
3475                 disable_dmar_iommu(iommu);
3476                 free_dmar_iommu(iommu);
3477         }
3478
3479         kfree(g_iommus);
3480
3481 error:
3482         return ret;
3483 }
3484
3485 /* This takes a number of _MM_ pages, not VTD pages */
3486 static unsigned long intel_alloc_iova(struct device *dev,
3487                                      struct dmar_domain *domain,
3488                                      unsigned long nrpages, uint64_t dma_mask)
3489 {
3490         unsigned long iova_pfn = 0;
3491
3492         /* Restrict dma_mask to the width that the iommu can handle */
3493         dma_mask = min_t(uint64_t, DOMAIN_MAX_ADDR(domain->gaw), dma_mask);
3494         /* Ensure we reserve the whole size-aligned region */
3495         nrpages = __roundup_pow_of_two(nrpages);
3496
3497         if (!dmar_forcedac && dma_mask > DMA_BIT_MASK(32)) {
3498                 /*
3499                  * First try to allocate an io virtual address in
3500                  * DMA_BIT_MASK(32) and if that fails then try allocating
3501                  * from higher range
3502                  */
3503                 iova_pfn = alloc_iova_fast(&domain->iovad, nrpages,
3504                                            IOVA_PFN(DMA_BIT_MASK(32)), false);
3505                 if (iova_pfn)
3506                         return iova_pfn;
3507         }
3508         iova_pfn = alloc_iova_fast(&domain->iovad, nrpages,
3509                                    IOVA_PFN(dma_mask), true);
3510         if (unlikely(!iova_pfn)) {
3511                 pr_err("Allocating %ld-page iova for %s failed\n",
3512                        nrpages, dev_name(dev));
3513                 return 0;
3514         }
3515
3516         return iova_pfn;
3517 }
3518
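/*
 * get_valid_domain_for_dev - return the DMAR domain already associated
 * with @dev, or allocate a new one and pre-map any RMRR regions that
 * reference the device before attaching it.
 */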
3519 struct dmar_domain *get_valid_domain_for_dev(struct device *dev)
3520 {
3521         struct dmar_domain *domain, *tmp;
3522         struct dmar_rmrr_unit *rmrr;
3523         struct device *i_dev;
3524         int i, ret;
3525
3526         domain = find_domain(dev);
3527         if (domain)
3528                 goto out;
3529
3530         domain = find_or_alloc_domain(dev, DEFAULT_DOMAIN_ADDRESS_WIDTH);
3531         if (!domain)
3532                 goto out;
3533
3534         /* We have a new domain - setup possible RMRRs for the device */
3535         rcu_read_lock();
3536         for_each_rmrr_units(rmrr) {
3537                 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
3538                                           i, i_dev) {
3539                         if (i_dev != dev)
3540                                 continue;
3541
3542                         ret = domain_prepare_identity_map(dev, domain,
3543                                                           rmrr->base_address,
3544                                                           rmrr->end_address);
3545                         if (ret)
3546                                 dev_err(dev, "Mapping reserved region failed\n");
3547                 }
3548         }
3549         rcu_read_unlock();
3550
3551         tmp = set_domain_for_dev(dev, domain);
3552         if (!tmp || domain != tmp) {
3553                 domain_exit(domain);
3554                 domain = tmp;
3555         }
3556
3557 out:
3558
3559         if (!domain)
3560                 pr_err("Allocating domain for %s failed\n", dev_name(dev));
3561
3562
3563         return domain;
3564 }
3565
3566 /* Check if the dev needs to go through the non-identity map and unmap process. */
3567 static int iommu_no_mapping(struct device *dev)
3568 {
3569         int found;
3570
3571         if (iommu_dummy(dev))
3572                 return 1;
3573
3574         if (!iommu_identity_mapping)
3575                 return 0;
3576
3577         found = identity_mapping(dev);
3578         if (found) {
3579                 if (iommu_should_identity_map(dev, 0))
3580                         return 1;
3581                 else {
3582                         /*
3583                          * The 32-bit DMA device is removed from si_domain
3584                          * and falls back to non-identity mapping.
3585                          */
3586                         dmar_remove_one_dev_info(si_domain, dev);
3587                         pr_info("32bit %s uses non-identity mapping\n",
3588                                 dev_name(dev));
3589                         return 0;
3590                 }
3591         } else {
3592                 /*
3593                  * In case a 64-bit DMA device was detached from a VM, the
3594                  * device is put back into si_domain for identity mapping.
3595                  */
3596                 if (iommu_should_identity_map(dev, 0)) {
3597                         int ret;
3598                         ret = domain_add_dev_info(si_domain, dev);
3599                         if (!ret) {
3600                                 pr_info("64bit %s uses identity mapping\n",
3601                                         dev_name(dev));
3602                                 return 1;
3603                         }
3604                 }
3605         }
3606
3607         return 0;
3608 }
3609
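/*
 * __intel_map_page - DMA-map one physical range for @dev: allocate an IOVA
 * below @dma_mask, derive read/write permissions from the DMA direction
 * (granting read access even for DMA_FROM_DEVICE when the IOMMU lacks
 * zero-length-read support), and install the IOVA->phys mapping in the
 * domain's page tables.
 */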
3610 static dma_addr_t __intel_map_page(struct device *dev, struct page *page,
3611                                    unsigned long offset, size_t size, int dir,
3612                                    u64 dma_mask)
3613 {
3614         phys_addr_t paddr = page_to_phys(page) + offset;
3615         struct dmar_domain *domain;
3616         phys_addr_t start_paddr;
3617         unsigned long iova_pfn;
3618         int prot = 0;
3619         int ret;
3620         struct intel_iommu *iommu;
3621         unsigned long paddr_pfn = paddr >> PAGE_SHIFT;
3622
3623         BUG_ON(dir == DMA_NONE);
3624
3625         if (iommu_no_mapping(dev))
3626                 return paddr;
3627
3628         domain = get_valid_domain_for_dev(dev);
3629         if (!domain)
3630                 return DMA_MAPPING_ERROR;
3631
3632         iommu = domain_get_iommu(domain);
3633         size = aligned_nrpages(paddr, size);
3634
3635         iova_pfn = intel_alloc_iova(dev, domain, dma_to_mm_pfn(size), dma_mask);
3636         if (!iova_pfn)
3637                 goto error;
3638
3639         /*
3640          * Check if DMAR supports zero-length reads on write-only
3641          * mappings.
3642          */
3643         if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
3644                         !cap_zlr(iommu->cap))
3645                 prot |= DMA_PTE_READ;
3646         if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
3647                 prot |= DMA_PTE_WRITE;
3648         /*
3649          * paddr to (paddr + size) might span a partial page; we should map
3650          * the whole page.  Note: if two parts of one page are mapped
3651          * separately, we might have two guest addresses mapping to the same
3652          * host paddr, but this is not a big problem.
3653          */
3654         ret = domain_pfn_mapping(domain, mm_to_dma_pfn(iova_pfn),
3655                                  mm_to_dma_pfn(paddr_pfn), size, prot);
3656         if (ret)
3657                 goto error;
3658
3659         start_paddr = (phys_addr_t)iova_pfn << PAGE_SHIFT;
3660         start_paddr += paddr & ~PAGE_MASK;
3661         return start_paddr;
3662
3663 error:
3664         if (iova_pfn)
3665                 free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(size));
3666         pr_err("Device %s request: %zx@%llx dir %d --- failed\n",
3667                 dev_name(dev), size, (unsigned long long)paddr, dir);
3668         return DMA_MAPPING_ERROR;
3669 }
3670
3671 static dma_addr_t intel_map_page(struct device *dev, struct page *page,
3672                                  unsigned long offset, size_t size,
3673                                  enum dma_data_direction dir,
3674                                  unsigned long attrs)
3675 {
3676         return __intel_map_page(dev, page, offset, size, dir, *dev->dma_mask);
3677 }
3678
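/*
 * intel_unmap - tear down a DMA mapping. In strict mode the IOTLB is
 * flushed and the IOVA freed immediately; otherwise the freed range is
 * queued and released lazily by the IOVA flush-queue machinery.
 */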
3679 static void intel_unmap(struct device *dev, dma_addr_t dev_addr, size_t size)
3680 {
3681         struct dmar_domain *domain;
3682         unsigned long start_pfn, last_pfn;
3683         unsigned long nrpages;
3684         unsigned long iova_pfn;
3685         struct intel_iommu *iommu;
3686         struct page *freelist;
3687
3688         if (iommu_no_mapping(dev))
3689                 return;
3690
3691         domain = find_domain(dev);
3692         BUG_ON(!domain);
3693
3694         iommu = domain_get_iommu(domain);
3695
3696         iova_pfn = IOVA_PFN(dev_addr);
3697
3698         nrpages = aligned_nrpages(dev_addr, size);
3699         start_pfn = mm_to_dma_pfn(iova_pfn);
3700         last_pfn = start_pfn + nrpages - 1;
3701
3702         pr_debug("Device %s unmapping: pfn %lx-%lx\n",
3703                  dev_name(dev), start_pfn, last_pfn);
3704
3705         freelist = domain_unmap(domain, start_pfn, last_pfn);
3706
3707         if (intel_iommu_strict) {
3708                 iommu_flush_iotlb_psi(iommu, domain, start_pfn,
3709                                       nrpages, !freelist, 0);
3710                 /* free iova */
3711                 free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(nrpages));
3712                 dma_free_pagelist(freelist);
3713         } else {
3714                 queue_iova(&domain->iovad, iova_pfn, nrpages,
3715                            (unsigned long)freelist);
3716                 /*
3717                  * Queue up the release of the unmap to save the roughly
3718                  * 1/6th of the CPU time consumed by the IOTLB flush operation.
3719                  */
3720         }
3721 }
3722
3723 static void intel_unmap_page(struct device *dev, dma_addr_t dev_addr,
3724                              size_t size, enum dma_data_direction dir,
3725                              unsigned long attrs)
3726 {
3727         intel_unmap(dev, dev_addr, size);
3728 }
3729
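/*
 * intel_alloc_coherent - allocate a zeroed coherent buffer, preferring CMA
 * when the caller may block, and map it DMA_BIDIRECTIONAL through the
 * IOMMU. For identity-mapped devices the physical address is returned
 * directly, so the GFP zone is chosen to honour the device's coherent DMA
 * mask instead.
 */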
3730 static void *intel_alloc_coherent(struct device *dev, size_t size,
3731                                   dma_addr_t *dma_handle, gfp_t flags,
3732                                   unsigned long attrs)
3733 {
3734         struct page *page = NULL;
3735         int order;
3736
3737         size = PAGE_ALIGN(size);
3738         order = get_order(size);
3739
3740         if (!iommu_no_mapping(dev))
3741                 flags &= ~(GFP_DMA | GFP_DMA32);
3742         else if (dev->coherent_dma_mask < dma_get_required_mask(dev)) {
3743                 if (dev->coherent_dma_mask < DMA_BIT_MASK(32))
3744                         flags |= GFP_DMA;
3745                 else
3746                         flags |= GFP_DMA32;
3747         }
3748
3749         if (gfpflags_allow_blocking(flags)) {
3750                 unsigned int count = size >> PAGE_SHIFT;
3751
3752                 page = dma_alloc_from_contiguous(dev, count, order,
3753                                                  flags & __GFP_NOWARN);
3754                 if (page && iommu_no_mapping(dev) &&
3755                     page_to_phys(page) + size > dev->coherent_dma_mask) {
3756                         dma_release_from_contiguous(dev, page, count);
3757                         page = NULL;
3758                 }
3759         }
3760
3761         if (!page)
3762                 page = alloc_pages(flags, order);
3763         if (!page)
3764                 return NULL;
3765         memset(page_address(page), 0, size);
3766
3767         *dma_handle = __intel_map_page(dev, page, 0, size, DMA_BIDIRECTIONAL,
3768                                        dev->coherent_dma_mask);
3769         if (*dma_handle != DMA_MAPPING_ERROR)
3770                 return page_address(page);
3771         if (!dma_release_from_contiguous(dev, page, size >> PAGE_SHIFT))
3772                 __free_pages(page, order);
3773
3774         return NULL;
3775 }
3776
3777 static void intel_free_coherent(struct device *dev, size_t size, void *vaddr,
3778                                 dma_addr_t dma_handle, unsigned long attrs)
3779 {
3780         int order;
3781         struct page *page = virt_to_page(vaddr);
3782
3783         size = PAGE_ALIGN(size);
3784         order = get_order(size);
3785
3786         intel_unmap(dev, dma_handle, size);
3787         if (!dma_release_from_contiguous(dev, page, size >> PAGE_SHIFT))
3788                 __free_pages(page, order);
3789 }
3790
3791 static void intel_unmap_sg(struct device *dev, struct scatterlist *sglist,
3792                            int nelems, enum dma_data_direction dir,
3793                            unsigned long attrs)
3794 {
3795         dma_addr_t startaddr = sg_dma_address(sglist) & PAGE_MASK;
3796         unsigned long nrpages = 0;
3797         struct scatterlist *sg;
3798         int i;
3799
3800         for_each_sg(sglist, sg, nelems, i) {
3801                 nrpages += aligned_nrpages(sg_dma_address(sg), sg_dma_len(sg));
3802         }
3803
3804         intel_unmap(dev, startaddr, nrpages << VTD_PAGE_SHIFT);
3805 }
3806
3807 static int intel_nontranslate_map_sg(struct device *hddev,
3808         struct scatterlist *sglist, int nelems, int dir)
3809 {
3810         int i;
3811         struct scatterlist *sg;
3812
3813         for_each_sg(sglist, sg, nelems, i) {
3814                 BUG_ON(!sg_page(sg));
3815                 sg->dma_address = sg_phys(sg);
3816                 sg->dma_length = sg->length;
3817         }
3818         return nelems;
3819 }
3820
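/*
 * intel_map_sg - map a scatterlist for DMA: identity-mapped devices simply
 * get the physical addresses, otherwise a single IOVA range large enough
 * for all segments is allocated and domain_sg_mapping() fills in the
 * page-table entries.
 */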
3821 static int intel_map_sg(struct device *dev, struct scatterlist *sglist, int nelems,
3822                         enum dma_data_direction dir, unsigned long attrs)
3823 {
3824         int i;
3825         struct dmar_domain *domain;
3826         size_t size = 0;
3827         int prot = 0;
3828         unsigned long iova_pfn;
3829         int ret;
3830         struct scatterlist *sg;
3831         unsigned long start_vpfn;
3832         struct intel_iommu *iommu;
3833
3834         BUG_ON(dir == DMA_NONE);
3835         if (iommu_no_mapping(dev))
3836                 return intel_nontranslate_map_sg(dev, sglist, nelems, dir);
3837
3838         domain = get_valid_domain_for_dev(dev);
3839         if (!domain)
3840                 return 0;
3841
3842         iommu = domain_get_iommu(domain);
3843
3844         for_each_sg(sglist, sg, nelems, i)
3845                 size += aligned_nrpages(sg->offset, sg->length);
3846
3847         iova_pfn = intel_alloc_iova(dev, domain, dma_to_mm_pfn(size),
3848                                 *dev->dma_mask);
3849         if (!iova_pfn) {
3850                 sglist->dma_length = 0;
3851                 return 0;
3852         }
3853
3854         /*
3855          * Check if DMAR supports zero-length reads on write-only
3856          * mappings.
3857          */
3858         if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
3859                         !cap_zlr(iommu->cap))
3860                 prot |= DMA_PTE_READ;
3861         if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
3862                 prot |= DMA_PTE_WRITE;
3863
3864         start_vpfn = mm_to_dma_pfn(iova_pfn);
3865
3866         ret = domain_sg_mapping(domain, start_vpfn, sglist, size, prot);
3867         if (unlikely(ret)) {
3868                 dma_pte_free_pagetable(domain, start_vpfn,
3869                                        start_vpfn + size - 1,
3870                                        agaw_to_level(domain->agaw) + 1);
3871                 free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(size));
3872                 return 0;
3873         }
3874
3875         return nelems;
3876 }
3877
3878 static const struct dma_map_ops intel_dma_ops = {
3879         .alloc = intel_alloc_coherent,
3880         .free = intel_free_coherent,
3881         .map_sg = intel_map_sg,
3882         .unmap_sg = intel_unmap_sg,
3883         .map_page = intel_map_page,
3884         .unmap_page = intel_unmap_page,
3885         .dma_supported = dma_direct_supported,
3886 };
3887
3888 static inline int iommu_domain_cache_init(void)
3889 {
3890         int ret = 0;
3891
3892         iommu_domain_cache = kmem_cache_create("iommu_domain",
3893                                          sizeof(struct dmar_domain),
3894                                          0,
3895                                          SLAB_HWCACHE_ALIGN,
3897                                          NULL);
3898         if (!iommu_domain_cache) {
3899                 pr_err("Couldn't create iommu_domain cache\n");
3900                 ret = -ENOMEM;
3901         }
3902
3903         return ret;
3904 }
3905
3906 static inline int iommu_devinfo_cache_init(void)
3907 {
3908         int ret = 0;
3909
3910         iommu_devinfo_cache = kmem_cache_create("iommu_devinfo",
3911                                          sizeof(struct device_domain_info),
3912                                          0,
3913                                          SLAB_HWCACHE_ALIGN,
3914                                          NULL);
3915         if (!iommu_devinfo_cache) {
3916                 pr_err("Couldn't create devinfo cache\n");
3917                 ret = -ENOMEM;
3918         }
3919
3920         return ret;
3921 }
3922
3923 static int __init iommu_init_mempool(void)
3924 {
3925         int ret;
3926         ret = iova_cache_get();
3927         if (ret)
3928                 return ret;
3929
3930         ret = iommu_domain_cache_init();
3931         if (ret)
3932                 goto domain_error;
3933
3934         ret = iommu_devinfo_cache_init();
3935         if (!ret)
3936                 return ret;
3937
3938         kmem_cache_destroy(iommu_domain_cache);
3939 domain_error:
3940         iova_cache_put();
3941
3942         return -ENOMEM;
3943 }
3944
3945 static void __init iommu_exit_mempool(void)
3946 {
3947         kmem_cache_destroy(iommu_devinfo_cache);
3948         kmem_cache_destroy(iommu_domain_cache);
3949         iova_cache_put();
3950 }
3951
3952 static void quirk_ioat_snb_local_iommu(struct pci_dev *pdev)
3953 {
3954         struct dmar_drhd_unit *drhd;
3955         u32 vtbar;
3956         int rc;
3957
3958         /* We know that this device on this chipset has its own IOMMU.
3959          * If we find it under a different IOMMU, then the BIOS is lying
3960          * to us. Hope that the IOMMU for this device is actually
3961          * disabled, and it needs no translation...
3962          */
3963         rc = pci_bus_read_config_dword(pdev->bus, PCI_DEVFN(0, 0), 0xb0, &vtbar);
3964         if (rc) {
3965                 /* "can't" happen */
3966                 dev_info(&pdev->dev, "failed to run vt-d quirk\n");
3967                 return;
3968         }
3969         vtbar &= 0xffff0000;
3970
3971         /* we know that this IOMMU should be at offset 0xa000 from vtbar */
3972         drhd = dmar_find_matched_drhd_unit(pdev);
3973         if (WARN_TAINT_ONCE(!drhd || drhd->reg_base_addr - vtbar != 0xa000,
3974                             TAINT_FIRMWARE_WORKAROUND,
3975                             "BIOS assigned incorrect VT-d unit for Intel(R) QuickData Technology device\n"))
3976                 pdev->dev.archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
3977 }
3978 DECLARE_PCI_FIXUP_ENABLE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_IOAT_SNB, quirk_ioat_snb_local_iommu);
3979
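/*
 * init_no_remapping_devices - mark DRHD units that cover no devices as
 * ignored, and for units that cover only graphics devices either record
 * that graphics is behind an IOMMU or, when dmar_map_gfx is disabled,
 * bypass the unit entirely by giving its devices the dummy domain info.
 */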
3980 static void __init init_no_remapping_devices(void)
3981 {
3982         struct dmar_drhd_unit *drhd;
3983         struct device *dev;
3984         int i;
3985
3986         for_each_drhd_unit(drhd) {
3987                 if (!drhd->include_all) {
3988                         for_each_active_dev_scope(drhd->devices,
3989                                                   drhd->devices_cnt, i, dev)
3990                                 break;
3991                         /* ignore DMAR unit if no devices exist */
3992                         if (i == drhd->devices_cnt)
3993                                 drhd->ignored = 1;
3994                 }
3995         }
3996
3997         for_each_active_drhd_unit(drhd) {
3998                 if (drhd->include_all)
3999                         continue;
4000
4001                 for_each_active_dev_scope(drhd->devices,
4002                                           drhd->devices_cnt, i, dev)
4003                         if (!dev_is_pci(dev) || !IS_GFX_DEVICE(to_pci_dev(dev)))
4004                                 break;
4005                 if (i < drhd->devices_cnt)
4006                         continue;
4007
4008                 /* This IOMMU has *only* gfx devices. Either bypass it or
4009                    set the gfx_mapped flag, as appropriate */
4010                 if (dmar_map_gfx) {
4011                         intel_iommu_gfx_mapped = 1;
4012                 } else {
4013                         drhd->ignored = 1;
4014                         for_each_active_dev_scope(drhd->devices,
4015                                                   drhd->devices_cnt, i, dev)
4016                                 dev->archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
4017                 }
4018         }
4019 }
4020
4021 #ifdef CONFIG_SUSPEND
4022 static int init_iommu_hw(void)
4023 {
4024         struct dmar_drhd_unit *drhd;
4025         struct intel_iommu *iommu = NULL;
4026
4027         for_each_active_iommu(iommu, drhd)
4028                 if (iommu->qi)
4029                         dmar_reenable_qi(iommu);
4030
4031         for_each_iommu(iommu, drhd) {
4032                 if (drhd->ignored) {
4033                         /*
4034                          * we always have to disable PMRs or DMA may fail on
4035                          * this device
4036                          */
4037                         if (force_on)
4038                                 iommu_disable_protect_mem_regions(iommu);
4039                         continue;
4040                 }
4041
4042                 iommu_flush_write_buffer(iommu);
4043
4044                 iommu_set_root_entry(iommu);
4045
4046                 iommu->flush.flush_context(iommu, 0, 0, 0,
4047                                            DMA_CCMD_GLOBAL_INVL);
4048                 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
4049                 iommu_enable_translation(iommu);
4050                 iommu_disable_protect_mem_regions(iommu);
4051         }
4052
4053         return 0;
4054 }
4055
4056 static void iommu_flush_all(void)
4057 {
4058         struct dmar_drhd_unit *drhd;
4059         struct intel_iommu *iommu;
4060
4061         for_each_active_iommu(iommu, drhd) {
4062                 iommu->flush.flush_context(iommu, 0, 0, 0,
4063                                            DMA_CCMD_GLOBAL_INVL);
4064                 iommu->flush.flush_iotlb(iommu, 0, 0, 0,
4065                                          DMA_TLB_GLOBAL_FLUSH);
4066         }
4067 }
4068
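/*
 * iommu_suspend - save the fault-event registers (FECTL/FEDATA/FEADDR/
 * FEUADDR) of every active IOMMU and disable translation, so that
 * iommu_resume() can restore them after a system sleep transition.
 */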
4069 static int iommu_suspend(void)
4070 {
4071         struct dmar_drhd_unit *drhd;
4072         struct intel_iommu *iommu = NULL;
4073         unsigned long flag;
4074
4075         for_each_active_iommu(iommu, drhd) {
4076                 iommu->iommu_state = kcalloc(MAX_SR_DMAR_REGS, sizeof(u32),
4077                                                  GFP_ATOMIC);
4078                 if (!iommu->iommu_state)
4079                         goto nomem;
4080         }
4081
4082         iommu_flush_all();
4083
4084         for_each_active_iommu(iommu, drhd) {
4085                 iommu_disable_translation(iommu);
4086
4087                 raw_spin_lock_irqsave(&iommu->register_lock, flag);
4088
4089                 iommu->iommu_state[SR_DMAR_FECTL_REG] =
4090                         readl(iommu->reg + DMAR_FECTL_REG);
4091                 iommu->iommu_state[SR_DMAR_FEDATA_REG] =
4092                         readl(iommu->reg + DMAR_FEDATA_REG);
4093                 iommu->iommu_state[SR_DMAR_FEADDR_REG] =
4094                         readl(iommu->reg + DMAR_FEADDR_REG);
4095                 iommu->iommu_state[SR_DMAR_FEUADDR_REG] =
4096                         readl(iommu->reg + DMAR_FEUADDR_REG);
4097
4098                 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
4099         }
4100         return 0;
4101
4102 nomem:
4103         for_each_active_iommu(iommu, drhd)
4104                 kfree(iommu->iommu_state);
4105
4106         return -ENOMEM;
4107 }
4108
4109 static void iommu_resume(void)
4110 {
4111         struct dmar_drhd_unit *drhd;
4112         struct intel_iommu *iommu = NULL;
4113         unsigned long flag;
4114
4115         if (init_iommu_hw()) {
4116                 if (force_on)
4117                         panic("tboot: IOMMU setup failed, DMAR can not resume!\n");
4118                 else
4119                         WARN(1, "IOMMU setup failed, DMAR can not resume!\n");
4120                 return;
4121         }
4122
4123         for_each_active_iommu(iommu, drhd) {
4124
4125                 raw_spin_lock_irqsave(&iommu->register_lock, flag);
4126
4127                 writel(iommu->iommu_state[SR_DMAR_FECTL_REG],
4128                         iommu->reg + DMAR_FECTL_REG);
4129                 writel(iommu->iommu_state[SR_DMAR_FEDATA_REG],
4130                         iommu->reg + DMAR_FEDATA_REG);
4131                 writel(iommu->iommu_state[SR_DMAR_FEADDR_REG],
4132                         iommu->reg + DMAR_FEADDR_REG);
4133                 writel(iommu->iommu_state[SR_DMAR_FEUADDR_REG],
4134                         iommu->reg + DMAR_FEUADDR_REG);
4135
4136                 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
4137         }
4138
4139         for_each_active_iommu(iommu, drhd)
4140                 kfree(iommu->iommu_state);
4141 }
4142
4143 static struct syscore_ops iommu_syscore_ops = {
4144         .resume         = iommu_resume,
4145         .suspend        = iommu_suspend,
4146 };
4147
4148 static void __init init_iommu_pm_ops(void)
4149 {
4150         register_syscore_ops(&iommu_syscore_ops);
4151 }
4152
4153 #else
4154 static inline void init_iommu_pm_ops(void) {}
4155 #endif  /* CONFIG_SUSPEND */
4156
4157
4158 int __init dmar_parse_one_rmrr(struct acpi_dmar_header *header, void *arg)
4159 {
4160         struct acpi_dmar_reserved_memory *rmrr;
4161         int prot = DMA_PTE_READ|DMA_PTE_WRITE;
4162         struct dmar_rmrr_unit *rmrru;
4163         size_t length;
4164
4165         rmrru = kzalloc(sizeof(*rmrru), GFP_KERNEL);
4166         if (!rmrru)
4167                 goto out;
4168
4169         rmrru->hdr = header;
4170         rmrr = (struct acpi_dmar_reserved_memory *)header;
4171         rmrru->base_address = rmrr->base_address;
4172         rmrru->end_address = rmrr->end_address;
4173
4174         length = rmrr->end_address - rmrr->base_address + 1;
4175         rmrru->resv = iommu_alloc_resv_region(rmrr->base_address, length, prot,
4176                                               IOMMU_RESV_DIRECT);
4177         if (!rmrru->resv)
4178                 goto free_rmrru;
4179
4180         rmrru->devices = dmar_alloc_dev_scope((void *)(rmrr + 1),
4181                                 ((void *)rmrr) + rmrr->header.length,
4182                                 &rmrru->devices_cnt);
4183         if (rmrru->devices_cnt && rmrru->devices == NULL)
4184                 goto free_all;
4185
4186         list_add(&rmrru->list, &dmar_rmrr_units);
4187
4188         return 0;
4189 free_all:
4190         kfree(rmrru->resv);
4191 free_rmrru:
4192         kfree(rmrru);
4193 out:
4194         return -ENOMEM;
4195 }
4196
4197 static struct dmar_atsr_unit *dmar_find_atsr(struct acpi_dmar_atsr *atsr)
4198 {
4199         struct dmar_atsr_unit *atsru;
4200         struct acpi_dmar_atsr *tmp;
4201
4202         list_for_each_entry_rcu(atsru, &dmar_atsr_units, list) {
4203                 tmp = (struct acpi_dmar_atsr *)atsru->hdr;
4204                 if (atsr->segment != tmp->segment)
4205                         continue;
4206                 if (atsr->header.length != tmp->header.length)
4207                         continue;
4208                 if (memcmp(atsr, tmp, atsr->header.length) == 0)
4209                         return atsru;
4210         }
4211
4212         return NULL;
4213 }
4214
4215 int dmar_parse_one_atsr(struct acpi_dmar_header *hdr, void *arg)
4216 {
4217         struct acpi_dmar_atsr *atsr;
4218         struct dmar_atsr_unit *atsru;
4219
4220         if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled)
4221                 return 0;
4222
4223         atsr = container_of(hdr, struct acpi_dmar_atsr, header);
4224         atsru = dmar_find_atsr(atsr);
4225         if (atsru)
4226                 return 0;
4227
4228         atsru = kzalloc(sizeof(*atsru) + hdr->length, GFP_KERNEL);
4229         if (!atsru)
4230                 return -ENOMEM;
4231
4232         /*
4233          * If memory is allocated from slab by ACPI _DSM method, we need to
4234          * copy the memory content because the memory buffer will be freed
4235          * on return.
4236          */
4237         atsru->hdr = (void *)(atsru + 1);
4238         memcpy(atsru->hdr, hdr, hdr->length);
4239         atsru->include_all = atsr->flags & 0x1;
4240         if (!atsru->include_all) {
4241                 atsru->devices = dmar_alloc_dev_scope((void *)(atsr + 1),
4242                                 (void *)atsr + atsr->header.length,
4243                                 &atsru->devices_cnt);
4244                 if (atsru->devices_cnt && atsru->devices == NULL) {
4245                         kfree(atsru);
4246                         return -ENOMEM;
4247                 }
4248         }
4249
4250         list_add_rcu(&atsru->list, &dmar_atsr_units);
4251
4252         return 0;
4253 }
4254
4255 static void intel_iommu_free_atsr(struct dmar_atsr_unit *atsru)
4256 {
4257         dmar_free_dev_scope(&atsru->devices, &atsru->devices_cnt);
4258         kfree(atsru);
4259 }
4260
4261 int dmar_release_one_atsr(struct acpi_dmar_header *hdr, void *arg)
4262 {
4263         struct acpi_dmar_atsr *atsr;
4264         struct dmar_atsr_unit *atsru;
4265
4266         atsr = container_of(hdr, struct acpi_dmar_atsr, header);
4267         atsru = dmar_find_atsr(atsr);
4268         if (atsru) {
4269                 list_del_rcu(&atsru->list);
4270                 synchronize_rcu();
4271                 intel_iommu_free_atsr(atsru);
4272         }
4273
4274         return 0;
4275 }
4276
4277 int dmar_check_one_atsr(struct acpi_dmar_header *hdr, void *arg)
4278 {
4279         int i;
4280         struct device *dev;
4281         struct acpi_dmar_atsr *atsr;
4282         struct dmar_atsr_unit *atsru;
4283
4284         atsr = container_of(hdr, struct acpi_dmar_atsr, header);
4285         atsru = dmar_find_atsr(atsr);
4286         if (!atsru)
4287                 return 0;
4288
4289         if (!atsru->include_all && atsru->devices && atsru->devices_cnt) {
4290                 for_each_active_dev_scope(atsru->devices, atsru->devices_cnt,
4291                                           i, dev)
4292                         return -EBUSY;
4293         }
4294
4295         return 0;
4296 }
4297
4298 static int intel_iommu_add(struct dmar_drhd_unit *dmaru)
4299 {
4300         int sp, ret = 0;
4301         struct intel_iommu *iommu = dmaru->iommu;
4302
4303         if (g_iommus[iommu->seq_id])
4304                 return 0;
4305
4306         if (hw_pass_through && !ecap_pass_through(iommu->ecap)) {
4307                 pr_warn("%s: Doesn't support hardware pass through.\n",
4308                         iommu->name);
4309                 return -ENXIO;
4310         }
4311         if (!ecap_sc_support(iommu->ecap) &&
4312             domain_update_iommu_snooping(iommu)) {
4313                 pr_warn("%s: Doesn't support snooping.\n",
4314                         iommu->name);
4315                 return -ENXIO;
4316         }
4317         sp = domain_update_iommu_superpage(iommu) - 1;
4318         if (sp >= 0 && !(cap_super_page_val(iommu->cap) & (1 << sp))) {
4319                 pr_warn("%s: Doesn't support large page.\n",
4320                         iommu->name);
4321                 return -ENXIO;
4322         }
4323
4324         /*
4325          * Disable translation if already enabled prior to OS handover.
4326          */
4327         if (iommu->gcmd & DMA_GCMD_TE)
4328                 iommu_disable_translation(iommu);
4329
4330         g_iommus[iommu->seq_id] = iommu;
4331         ret = iommu_init_domains(iommu);
4332         if (ret == 0)
4333                 ret = iommu_alloc_root_entry(iommu);
4334         if (ret)
4335                 goto out;
4336
4337 #ifdef CONFIG_INTEL_IOMMU_SVM
4338         if (pasid_enabled(iommu))
4339                 intel_svm_init(iommu);
4340 #endif
4341
4342         if (dmaru->ignored) {
4343                 /*
4344                  * we always have to disable PMRs or DMA may fail on this device
4345                  */
4346                 if (force_on)
4347                         iommu_disable_protect_mem_regions(iommu);
4348                 return 0;
4349         }
4350
4351         intel_iommu_init_qi(iommu);
4352         iommu_flush_write_buffer(iommu);
4353
4354 #ifdef CONFIG_INTEL_IOMMU_SVM
4355         if (pasid_enabled(iommu) && ecap_prs(iommu->ecap)) {
4356                 ret = intel_svm_enable_prq(iommu);
4357                 if (ret)
4358                         goto disable_iommu;
4359         }
4360 #endif
4361         ret = dmar_set_interrupt(iommu);
4362         if (ret)
4363                 goto disable_iommu;
4364
4365         iommu_set_root_entry(iommu);
4366         iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
4367         iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
4368         iommu_enable_translation(iommu);
4369
4370         iommu_disable_protect_mem_regions(iommu);
4371         return 0;
4372
4373 disable_iommu:
4374         disable_dmar_iommu(iommu);
4375 out:
4376         free_dmar_iommu(iommu);
4377         return ret;
4378 }
4379
4380 int dmar_iommu_hotplug(struct dmar_drhd_unit *dmaru, bool insert)
4381 {
4382         int ret = 0;
4383         struct intel_iommu *iommu = dmaru->iommu;
4384
4385         if (!intel_iommu_enabled)
4386                 return 0;
4387         if (iommu == NULL)
4388                 return -EINVAL;
4389
4390         if (insert) {
4391                 ret = intel_iommu_add(dmaru);
4392         } else {
4393                 disable_dmar_iommu(iommu);
4394                 free_dmar_iommu(iommu);
4395         }
4396
4397         return ret;
4398 }
4399
4400 static void intel_iommu_free_dmars(void)
4401 {
4402         struct dmar_rmrr_unit *rmrru, *rmrr_n;
4403         struct dmar_atsr_unit *atsru, *atsr_n;
4404
4405         list_for_each_entry_safe(rmrru, rmrr_n, &dmar_rmrr_units, list) {
4406                 list_del(&rmrru->list);
4407                 dmar_free_dev_scope(&rmrru->devices, &rmrru->devices_cnt);
4408                 kfree(rmrru->resv);
4409                 kfree(rmrru);
4410         }
4411
4412         list_for_each_entry_safe(atsru, atsr_n, &dmar_atsr_units, list) {
4413                 list_del(&atsru->list);
4414                 intel_iommu_free_atsr(atsru);
4415         }
4416 }
4417
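/*
 * dmar_find_matched_atsr_unit - decide whether ATS may be used for @dev:
 * integrated (root-bus) devices are always allowed, devices behind a
 * conventional PCI bridge never are, and anything else is allowed only if
 * its root port is listed in (or covered by an include-all) ATSR.
 */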
4418 int dmar_find_matched_atsr_unit(struct pci_dev *dev)
4419 {
4420         int i, ret = 1;
4421         struct pci_bus *bus;
4422         struct pci_dev *bridge = NULL;
4423         struct device *tmp;
4424         struct acpi_dmar_atsr *atsr;
4425         struct dmar_atsr_unit *atsru;
4426
4427         dev = pci_physfn(dev);
4428         for (bus = dev->bus; bus; bus = bus->parent) {
4429                 bridge = bus->self;
4430                 /* If it's an integrated device, allow ATS */
4431                 if (!bridge)
4432                         return 1;
4433                 /* Connected via non-PCIe: no ATS */
4434                 if (!pci_is_pcie(bridge) ||
4435                     pci_pcie_type(bridge) == PCI_EXP_TYPE_PCI_BRIDGE)
4436                         return 0;
4437                 /* If we found the root port, look it up in the ATSR */
4438                 if (pci_pcie_type(bridge) == PCI_EXP_TYPE_ROOT_PORT)
4439                         break;
4440         }
4441
4442         rcu_read_lock();
4443         list_for_each_entry_rcu(atsru, &dmar_atsr_units, list) {
4444                 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
4445                 if (atsr->segment != pci_domain_nr(dev->bus))
4446                         continue;
4447
4448                 for_each_dev_scope(atsru->devices, atsru->devices_cnt, i, tmp)
4449                         if (tmp == &bridge->dev)
4450                                 goto out;
4451
4452                 if (atsru->include_all)
4453                         goto out;
4454         }
4455         ret = 0;
4456 out:
4457         rcu_read_unlock();
4458
4459         return ret;
4460 }
4461
4462 int dmar_iommu_notify_scope_dev(struct dmar_pci_notify_info *info)
4463 {
4464         int ret = 0;
4465         struct dmar_rmrr_unit *rmrru;
4466         struct dmar_atsr_unit *atsru;
4467         struct acpi_dmar_atsr *atsr;
4468         struct acpi_dmar_reserved_memory *rmrr;
4469
4470         if (!intel_iommu_enabled && system_state >= SYSTEM_RUNNING)
4471                 return 0;
4472
4473         list_for_each_entry(rmrru, &dmar_rmrr_units, list) {
4474                 rmrr = container_of(rmrru->hdr,
4475                                     struct acpi_dmar_reserved_memory, header);
4476                 if (info->event == BUS_NOTIFY_ADD_DEVICE) {
4477                         ret = dmar_insert_dev_scope(info, (void *)(rmrr + 1),
4478                                 ((void *)rmrr) + rmrr->header.length,
4479                                 rmrr->segment, rmrru->devices,
4480                                 rmrru->devices_cnt);
4481                         if (ret < 0)
4482                                 return ret;
4483                 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
4484                         dmar_remove_dev_scope(info, rmrr->segment,
4485                                 rmrru->devices, rmrru->devices_cnt);
4486                 }
4487         }
4488
4489         list_for_each_entry(atsru, &dmar_atsr_units, list) {
4490                 if (atsru->include_all)
4491                         continue;
4492
4493                 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
4494                 if (info->event == BUS_NOTIFY_ADD_DEVICE) {
4495                         ret = dmar_insert_dev_scope(info, (void *)(atsr + 1),
4496                                         (void *)atsr + atsr->header.length,
4497                                         atsr->segment, atsru->devices,
4498                                         atsru->devices_cnt);
4499                         if (ret > 0)
4500                                 break;
4501                         else if (ret < 0)
4502                                 return ret;
4503                 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
4504                         if (dmar_remove_dev_scope(info, atsr->segment,
4505                                         atsru->devices, atsru->devices_cnt))
4506                                 break;
4507                 }
4508         }
4509
4510         return 0;
4511 }
4512
4513 /*
4514  * Here we only respond to a device being unbound from its driver.
4515  *
4516  * A newly added device is not attached to its DMAR domain here yet; that
4517  * happens when the device is first mapped to an iova.
4518  */
4519 static int device_notifier(struct notifier_block *nb,
4520                                   unsigned long action, void *data)
4521 {
4522         struct device *dev = data;
4523         struct dmar_domain *domain;
4524
4525         if (iommu_dummy(dev))
4526                 return 0;
4527
4528         if (action != BUS_NOTIFY_REMOVED_DEVICE)
4529                 return 0;
4530
4531         domain = find_domain(dev);
4532         if (!domain)
4533                 return 0;
4534
4535         dmar_remove_one_dev_info(domain, dev);
4536         if (!domain_type_is_vm_or_si(domain) && list_empty(&domain->devices))
4537                 domain_exit(domain);
4538
4539         return 0;
4540 }
4541
4542 static struct notifier_block device_nb = {
4543         .notifier_call = device_notifier,
4544 };
4545
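/*
 * Memory hotplug notifier: extend the si_domain identity map when memory
 * goes online, and tear the corresponding mappings, IOVAs and IOTLB
 * entries back down when it goes offline again.
 */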
4546 static int intel_iommu_memory_notifier(struct notifier_block *nb,
4547                                        unsigned long val, void *v)
4548 {
4549         struct memory_notify *mhp = v;
4550         unsigned long long start, end;
4551         unsigned long start_vpfn, last_vpfn;
4552
4553         switch (val) {
4554         case MEM_GOING_ONLINE:
4555                 start = mhp->start_pfn << PAGE_SHIFT;
4556                 end = ((mhp->start_pfn + mhp->nr_pages) << PAGE_SHIFT) - 1;
4557                 if (iommu_domain_identity_map(si_domain, start, end)) {
4558                         pr_warn("Failed to build identity map for [%llx-%llx]\n",
4559                                 start, end);
4560                         return NOTIFY_BAD;
4561                 }
4562                 break;
4563
4564         case MEM_OFFLINE:
4565         case MEM_CANCEL_ONLINE:
4566                 start_vpfn = mm_to_dma_pfn(mhp->start_pfn);
4567                 last_vpfn = mm_to_dma_pfn(mhp->start_pfn + mhp->nr_pages - 1);
4568                 while (start_vpfn <= last_vpfn) {
4569                         struct iova *iova;
4570                         struct dmar_drhd_unit *drhd;
4571                         struct intel_iommu *iommu;
4572                         struct page *freelist;
4573
4574                         iova = find_iova(&si_domain->iovad, start_vpfn);
4575                         if (iova == NULL) {
4576                                 pr_debug("Failed get IOVA for PFN %lx\n",
4577                                          start_vpfn);
4578                                 break;
4579                         }
4580
4581                         iova = split_and_remove_iova(&si_domain->iovad, iova,
4582                                                      start_vpfn, last_vpfn);
4583                         if (iova == NULL) {
4584                                 pr_warn("Failed to split IOVA PFN [%lx-%lx]\n",
4585                                         start_vpfn, last_vpfn);
4586                                 return NOTIFY_BAD;
4587                         }
4588
4589                         freelist = domain_unmap(si_domain, iova->pfn_lo,
4590                                                iova->pfn_hi);
4591
4592                         rcu_read_lock();
4593                         for_each_active_iommu(iommu, drhd)
4594                                 iommu_flush_iotlb_psi(iommu, si_domain,
4595                                         iova->pfn_lo, iova_size(iova),
4596                                         !freelist, 0);
4597                         rcu_read_unlock();
4598                         dma_free_pagelist(freelist);
4599
4600                         start_vpfn = iova->pfn_hi + 1;
4601                         free_iova_mem(iova);
4602                 }
4603                 break;
4604         }
4605
4606         return NOTIFY_OK;
4607 }
4608
4609 static struct notifier_block intel_iommu_memory_nb = {
4610         .notifier_call = intel_iommu_memory_notifier,
4611         .priority = 0
4612 };
4613
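/* Drop the per-CPU IOVA caches of every domain for the given CPU. */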
4614 static void free_all_cpu_cached_iovas(unsigned int cpu)
4615 {
4616         int i;
4617
4618         for (i = 0; i < g_num_of_iommus; i++) {
4619                 struct intel_iommu *iommu = g_iommus[i];
4620                 struct dmar_domain *domain;
4621                 int did;
4622
4623                 if (!iommu)
4624                         continue;
4625
4626                 for (did = 0; did < cap_ndoms(iommu->cap); did++) {
4627                         domain = get_iommu_domain(iommu, (u16)did);
4628
4629                         if (!domain)
4630                                 continue;
4631                         free_cpu_cached_iovas(cpu, &domain->iovad);
4632                 }
4633         }
4634 }
4635
4636 static int intel_iommu_cpu_dead(unsigned int cpu)
4637 {
4638         free_all_cpu_cached_iovas(cpu);
4639         return 0;
4640 }
4641
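/* Disable DMA translation hardware on every IOMMU in the system. */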
4642 static void intel_disable_iommus(void)
4643 {
4644         struct intel_iommu *iommu = NULL;
4645         struct dmar_drhd_unit *drhd;
4646
4647         for_each_iommu(iommu, drhd)
4648                 iommu_disable_translation(iommu);
4649 }
4650
4651 static inline struct intel_iommu *dev_to_intel_iommu(struct device *dev)
4652 {
4653         struct iommu_device *iommu_dev = dev_to_iommu_device(dev);
4654
4655         return container_of(iommu_dev, struct intel_iommu, iommu);
4656 }
4657
4658 static ssize_t intel_iommu_show_version(struct device *dev,
4659                                         struct device_attribute *attr,
4660                                         char *buf)
4661 {
4662         struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4663         u32 ver = readl(iommu->reg + DMAR_VER_REG);
4664         return sprintf(buf, "%d:%d\n",
4665                        DMAR_VER_MAJOR(ver), DMAR_VER_MINOR(ver));
4666 }
4667 static DEVICE_ATTR(version, S_IRUGO, intel_iommu_show_version, NULL);
4668
4669 static ssize_t intel_iommu_show_address(struct device *dev,
4670                                         struct device_attribute *attr,
4671                                         char *buf)
4672 {
4673         struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4674         return sprintf(buf, "%llx\n", iommu->reg_phys);
4675 }
4676 static DEVICE_ATTR(address, S_IRUGO, intel_iommu_show_address, NULL);
4677
4678 static ssize_t intel_iommu_show_cap(struct device *dev,
4679                                     struct device_attribute *attr,
4680                                     char *buf)
4681 {
4682         struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4683         return sprintf(buf, "%llx\n", iommu->cap);
4684 }
4685 static DEVICE_ATTR(cap, S_IRUGO, intel_iommu_show_cap, NULL);
4686
4687 static ssize_t intel_iommu_show_ecap(struct device *dev,
4688                                     struct device_attribute *attr,
4689                                     char *buf)
4690 {
4691         struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4692         return sprintf(buf, "%llx\n", iommu->ecap);
4693 }
4694 static DEVICE_ATTR(ecap, S_IRUGO, intel_iommu_show_ecap, NULL);
4695
4696 static ssize_t intel_iommu_show_ndoms(struct device *dev,
4697                                       struct device_attribute *attr,
4698                                       char *buf)
4699 {
4700         struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4701         return sprintf(buf, "%ld\n", cap_ndoms(iommu->cap));
4702 }
4703 static DEVICE_ATTR(domains_supported, S_IRUGO, intel_iommu_show_ndoms, NULL);
4704
4705 static ssize_t intel_iommu_show_ndoms_used(struct device *dev,
4706                                            struct device_attribute *attr,
4707                                            char *buf)
4708 {
4709         struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4710         return sprintf(buf, "%d\n", bitmap_weight(iommu->domain_ids,
4711                                                   cap_ndoms(iommu->cap)));
4712 }
4713 static DEVICE_ATTR(domains_used, S_IRUGO, intel_iommu_show_ndoms_used, NULL);
4714
4715 static struct attribute *intel_iommu_attrs[] = {
4716         &dev_attr_version.attr,
4717         &dev_attr_address.attr,
4718         &dev_attr_cap.attr,
4719         &dev_attr_ecap.attr,
4720         &dev_attr_domains_supported.attr,
4721         &dev_attr_domains_used.attr,
4722         NULL,
4723 };
4724
4725 static struct attribute_group intel_iommu_group = {
4726         .name = "intel-iommu",
4727         .attrs = intel_iommu_attrs,
4728 };
4729
4730 const struct attribute_group *intel_iommu_groups[] = {
4731         &intel_iommu_group,
4732         NULL,
4733 };
4734
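/*
 * If the platform has opted in to IOMMU-based DMA protection and an
 * untrusted PCI device is present, force the IOMMU on even if it was
 * disabled on the command line.  Returns 1 if it was force enabled.
 */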
4735 static int __init platform_optin_force_iommu(void)
4736 {
4737         struct pci_dev *pdev = NULL;
4738         bool has_untrusted_dev = false;
4739
4740         if (!dmar_platform_optin() || no_platform_optin)
4741                 return 0;
4742
4743         for_each_pci_dev(pdev) {
4744                 if (pdev->untrusted) {
4745                         has_untrusted_dev = true;
4746                         break;
4747                 }
4748         }
4749
4750         if (!has_untrusted_dev)
4751                 return 0;
4752
4753         if (no_iommu || dmar_disabled)
4754                 pr_info("Intel-IOMMU force enabled due to platform opt in\n");
4755
4756         /*
4757          * If Intel-IOMMU is disabled by default, we will apply identity
4758          * map for all devices except those marked as being untrusted.
4759          */
4760         if (dmar_disabled)
4761                 iommu_identity_mapping |= IDENTMAP_ALL;
4762
4763         dmar_disabled = 0;
4764 #if defined(CONFIG_X86) && defined(CONFIG_SWIOTLB)
4765         swiotlb = 0;
4766 #endif
4767         no_iommu = 0;
4768
4769         return 1;
4770 }
4771
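/*
 * Main initialization entry point: parse the DMAR table and device
 * scopes, set up the IOMMUs and DMA ops, and register the sysfs, bus,
 * memory-hotplug and CPU-hotplug hooks.
 */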
4772 int __init intel_iommu_init(void)
4773 {
4774         int ret = -ENODEV;
4775         struct dmar_drhd_unit *drhd;
4776         struct intel_iommu *iommu;
4777
4778         /*
4779          * Intel IOMMU is required for a TXT/tboot launch or platform
4780          * opt in, so enforce that.
4781          */
4782         force_on = tboot_force_iommu() || platform_optin_force_iommu();
4783
4784         if (iommu_init_mempool()) {
4785                 if (force_on)
4786                         panic("tboot: Failed to initialize iommu memory\n");
4787                 return -ENOMEM;
4788         }
4789
4790         down_write(&dmar_global_lock);
4791         if (dmar_table_init()) {
4792                 if (force_on)
4793                         panic("tboot: Failed to initialize DMAR table\n");
4794                 goto out_free_dmar;
4795         }
4796
4797         if (dmar_dev_scope_init() < 0) {
4798                 if (force_on)
4799                         panic("tboot: Failed to initialize DMAR device scope\n");
4800                 goto out_free_dmar;
4801         }
4802
4803         up_write(&dmar_global_lock);
4804
4805         /*
4806          * The bus notifier takes the dmar_global_lock, so lockdep would
4807          * complain if we registered it under the lock; do it outside.
4808          */
4809         dmar_register_bus_notifier();
4810
4811         down_write(&dmar_global_lock);
4812
4813         if (no_iommu || dmar_disabled) {
4814                 /*
4815                  * We exit the function here to ensure the IOMMU's remapping and
4816                  * mempool aren't set up, which means that the IOMMU's PMRs
4817                  * won't be disabled via the call to init_dmars(). So disable
4818                  * them explicitly here. The PMRs were set up by tboot prior to
4819                  * calling SENTER, but the kernel is expected to reset/tear
4820                  * down the PMRs.
4821                  */
4822                 if (intel_iommu_tboot_noforce) {
4823                         for_each_iommu(iommu, drhd)
4824                                 iommu_disable_protect_mem_regions(iommu);
4825                 }
4826
4827                 /*
4828                  * Make sure the IOMMUs are switched off, even when we
4829                  * boot into a kexec kernel and the previous kernel left
4830                  * them enabled.
4831                  */
4832                 intel_disable_iommus();
4833                 goto out_free_dmar;
4834         }
4835
4836         if (list_empty(&dmar_rmrr_units))
4837                 pr_info("No RMRR found\n");
4838
4839         if (list_empty(&dmar_atsr_units))
4840                 pr_info("No ATSR found\n");
4841
4842         if (dmar_init_reserved_ranges()) {
4843                 if (force_on)
4844                         panic("tboot: Failed to reserve iommu ranges\n");
4845                 goto out_free_reserved_range;
4846         }
4847
4848         init_no_remapping_devices();
4849
4850         ret = init_dmars();
4851         if (ret) {
4852                 if (force_on)
4853                         panic("tboot: Failed to initialize DMARs\n");
4854                 pr_err("Initialization failed\n");
4855                 goto out_free_reserved_range;
4856         }
4857         up_write(&dmar_global_lock);
4858         pr_info("Intel(R) Virtualization Technology for Directed I/O\n");
4859
4860 #if defined(CONFIG_X86) && defined(CONFIG_SWIOTLB)
4861         swiotlb = 0;
4862 #endif
4863         dma_ops = &intel_dma_ops;
4864
4865         init_iommu_pm_ops();
4866
4867         for_each_active_iommu(iommu, drhd) {
4868                 iommu_device_sysfs_add(&iommu->iommu, NULL,
4869                                        intel_iommu_groups,
4870                                        "%s", iommu->name);
4871                 iommu_device_set_ops(&iommu->iommu, &intel_iommu_ops);
4872                 iommu_device_register(&iommu->iommu);
4873         }
4874
4875         bus_set_iommu(&pci_bus_type, &intel_iommu_ops);
4876         bus_register_notifier(&pci_bus_type, &device_nb);
4877         if (si_domain && !hw_pass_through)
4878                 register_memory_notifier(&intel_iommu_memory_nb);
4879         cpuhp_setup_state(CPUHP_IOMMU_INTEL_DEAD, "iommu/intel:dead", NULL,
4880                           intel_iommu_cpu_dead);
4881         intel_iommu_enabled = 1;
4882         intel_iommu_debugfs_init();
4883
4884         return 0;
4885
4886 out_free_reserved_range:
4887         put_iova_domain(&reserved_iova_list);
4888 out_free_dmar:
4889         intel_iommu_free_dmars();
4890         up_write(&dmar_global_lock);
4891         iommu_exit_mempool();
4892         return ret;
4893 }
4894
4895 static int domain_context_clear_one_cb(struct pci_dev *pdev, u16 alias, void *opaque)
4896 {
4897         struct intel_iommu *iommu = opaque;
4898
4899         domain_context_clear_one(iommu, PCI_BUS_NUM(alias), alias & 0xff);
4900         return 0;
4901 }
4902
4903 /*
4904  * NB - intel-iommu lacks any sort of reference counting for the users of
4905  * dependent devices.  If multiple endpoints have intersecting dependent
4906  * devices, unbinding the driver from any one of them will possibly leave
4907  * the others unable to operate.
4908  */
4909 static void domain_context_clear(struct intel_iommu *iommu, struct device *dev)
4910 {
4911         if (!iommu || !dev || !dev_is_pci(dev))
4912                 return;
4913
4914         pci_for_each_dma_alias(to_pci_dev(dev), &domain_context_clear_one_cb, iommu);
4915 }
4916
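/*
 * Tear down one device's translation state: disable its device IOTLB,
 * clear its context entries, free its PASID table and detach it from its
 * domain.  Must be called with device_domain_lock held.
 */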
4917 static void __dmar_remove_one_dev_info(struct device_domain_info *info)
4918 {
4919         struct intel_iommu *iommu;
4920         unsigned long flags;
4921
4922         assert_spin_locked(&device_domain_lock);
4923
4924         if (WARN_ON(!info))
4925                 return;
4926
4927         iommu = info->iommu;
4928
4929         if (info->dev) {
4930                 iommu_disable_dev_iotlb(info);
4931                 domain_context_clear(iommu, info->dev);
4932                 intel_pasid_free_table(info->dev);
4933         }
4934
4935         unlink_domain_info(info);
4936
4937         spin_lock_irqsave(&iommu->lock, flags);
4938         domain_detach_iommu(info->domain, iommu);
4939         spin_unlock_irqrestore(&iommu->lock, flags);
4940
4941         free_devinfo_mem(info);
4942 }
4943
4944 static void dmar_remove_one_dev_info(struct dmar_domain *domain,
4945                                      struct device *dev)
4946 {
4947         struct device_domain_info *info;
4948         unsigned long flags;
4949
4950         spin_lock_irqsave(&device_domain_lock, flags);
4951         info = dev->archdata.iommu;
4952         __dmar_remove_one_dev_info(info);
4953         spin_unlock_irqrestore(&device_domain_lock, flags);
4954 }
4955
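/*
 * Minimal setup for domains allocated through the generic IOMMU API:
 * initialize the IOVA allocator, derive the AGAW from the requested
 * guest address width and allocate the top-level page table.
 */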
4956 static int md_domain_init(struct dmar_domain *domain, int guest_width)
4957 {
4958         int adjust_width;
4959
4960         init_iova_domain(&domain->iovad, VTD_PAGE_SIZE, IOVA_START_PFN);
4961         domain_reserve_special_ranges(domain);
4962
4963         /* calculate AGAW */
4964         domain->gaw = guest_width;
4965         adjust_width = guestwidth_to_adjustwidth(guest_width);
4966         domain->agaw = width_to_agaw(adjust_width);
4967
4968         domain->iommu_coherency = 0;
4969         domain->iommu_snooping = 0;
4970         domain->iommu_superpage = 0;
4971         domain->max_addr = 0;
4972
4973         /* always allocate the top pgd */
4974         domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
4975         if (!domain->pgd)
4976                 return -ENOMEM;
4977         domain_flush_cache(domain, domain->pgd, PAGE_SIZE);
4978         return 0;
4979 }
4980
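/* Allocate a virtual-machine domain for the IOMMU API and set up its geometry. */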
4981 static struct iommu_domain *intel_iommu_domain_alloc(unsigned type)
4982 {
4983         struct dmar_domain *dmar_domain;
4984         struct iommu_domain *domain;
4985
4986         if (type != IOMMU_DOMAIN_UNMANAGED)
4987                 return NULL;
4988
4989         dmar_domain = alloc_domain(DOMAIN_FLAG_VIRTUAL_MACHINE);
4990         if (!dmar_domain) {
4991                 pr_err("Can't allocate dmar_domain\n");
4992                 return NULL;
4993         }
4994         if (md_domain_init(dmar_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
4995                 pr_err("Domain initialization failed\n");
4996                 domain_exit(dmar_domain);
4997                 return NULL;
4998         }
4999         domain_update_iommu_cap(dmar_domain);
5000
5001         domain = &dmar_domain->domain;
5002         domain->geometry.aperture_start = 0;
5003         domain->geometry.aperture_end   = __DOMAIN_MAX_ADDR(dmar_domain->gaw);
5004         domain->geometry.force_aperture = true;
5005
5006         return domain;
5007 }
5008
5009 static void intel_iommu_domain_free(struct iommu_domain *domain)
5010 {
5011         domain_exit(to_dmar_domain(domain));
5012 }
5013
5014 static int intel_iommu_attach_device(struct iommu_domain *domain,
5015                                      struct device *dev)
5016 {
5017         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5018         struct intel_iommu *iommu;
5019         int addr_width;
5020         u8 bus, devfn;
5021
5022         if (device_is_rmrr_locked(dev)) {
5023                 dev_warn(dev, "Device is ineligible for IOMMU domain attach due to platform RMRR requirement.  Contact your platform vendor.\n");
5024                 return -EPERM;
5025         }
5026
5027         /* normally dev is not mapped */
5028         if (unlikely(domain_context_mapped(dev))) {
5029                 struct dmar_domain *old_domain;
5030
5031                 old_domain = find_domain(dev);
5032                 if (old_domain) {
5033                         rcu_read_lock();
5034                         dmar_remove_one_dev_info(old_domain, dev);
5035                         rcu_read_unlock();
5036
5037                         if (!domain_type_is_vm_or_si(old_domain) &&
5038                              list_empty(&old_domain->devices))
5039                                 domain_exit(old_domain);
5040                 }
5041         }
5042
5043         iommu = device_to_iommu(dev, &bus, &devfn);
5044         if (!iommu)
5045                 return -ENODEV;
5046
5047         /* check if this iommu agaw is sufficient for max mapped address */
5048         addr_width = agaw_to_width(iommu->agaw);
5049         if (addr_width > cap_mgaw(iommu->cap))
5050                 addr_width = cap_mgaw(iommu->cap);
5051
5052         if (dmar_domain->max_addr > (1LL << addr_width)) {
5053                 pr_err("%s: iommu width (%d) is not "
5054                        "sufficient for the mapped address (%llx)\n",
5055                        __func__, addr_width, dmar_domain->max_addr);
5056                 return -EFAULT;
5057         }
5058         dmar_domain->gaw = addr_width;
5059
5060         /*
5061          * Knock out extra levels of page tables if necessary
5062          */
5063         while (iommu->agaw < dmar_domain->agaw) {
5064                 struct dma_pte *pte;
5065
5066                 pte = dmar_domain->pgd;
5067                 if (dma_pte_present(pte)) {
5068                         dmar_domain->pgd = (struct dma_pte *)
5069                                 phys_to_virt(dma_pte_addr(pte));
5070                         free_pgtable_page(pte);
5071                 }
5072                 dmar_domain->agaw--;
5073         }
5074
5075         return domain_add_dev_info(dmar_domain, dev);
5076 }
5077
5078 static void intel_iommu_detach_device(struct iommu_domain *domain,
5079                                       struct device *dev)
5080 {
5081         dmar_remove_one_dev_info(to_dmar_domain(domain), dev);
5082 }
5083
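/*
 * Map a physically contiguous range into the domain's page tables,
 * translating IOMMU_READ/WRITE/CACHE into the corresponding DMA PTE
 * bits and growing the domain's max_addr if necessary.
 */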
5084 static int intel_iommu_map(struct iommu_domain *domain,
5085                            unsigned long iova, phys_addr_t hpa,
5086                            size_t size, int iommu_prot)
5087 {
5088         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5089         u64 max_addr;
5090         int prot = 0;
5091         int ret;
5092
5093         if (iommu_prot & IOMMU_READ)
5094                 prot |= DMA_PTE_READ;
5095         if (iommu_prot & IOMMU_WRITE)
5096                 prot |= DMA_PTE_WRITE;
5097         if ((iommu_prot & IOMMU_CACHE) && dmar_domain->iommu_snooping)
5098                 prot |= DMA_PTE_SNP;
5099
5100         max_addr = iova + size;
5101         if (dmar_domain->max_addr < max_addr) {
5102                 u64 end;
5103
5104                 /* check if minimum agaw is sufficient for mapped address */
5105                 end = __DOMAIN_MAX_ADDR(dmar_domain->gaw) + 1;
5106                 if (end < max_addr) {
5107                         pr_err("%s: iommu width (%d) is not "
5108                                "sufficient for the mapped address (%llx)\n",
5109                                __func__, dmar_domain->gaw, max_addr);
5110                         return -EFAULT;
5111                 }
5112                 dmar_domain->max_addr = max_addr;
5113         }
5114         /* Round up size to next multiple of PAGE_SIZE, if it and
5115            the low bits of hpa would take us onto the next page */
5116         size = aligned_nrpages(hpa, size);
5117         ret = domain_pfn_mapping(dmar_domain, iova >> VTD_PAGE_SHIFT,
5118                                  hpa >> VTD_PAGE_SHIFT, size, prot);
5119         return ret;
5120 }
5121
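/*
 * Unmap a range from the domain, flush the IOTLB on every IOMMU the
 * domain is attached to, and only then free the page-table pages that
 * were removed.
 */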
5122 static size_t intel_iommu_unmap(struct iommu_domain *domain,
5123                                 unsigned long iova, size_t size)
5124 {
5125         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5126         struct page *freelist = NULL;
5127         unsigned long start_pfn, last_pfn;
5128         unsigned int npages;
5129         int iommu_id, level = 0;
5130
5131         /* Cope with horrid API which requires us to unmap more than the
5132            size argument if it happens to be a large-page mapping. */
5133         BUG_ON(!pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level));
5134
5135         if (size < VTD_PAGE_SIZE << level_to_offset_bits(level))
5136                 size = VTD_PAGE_SIZE << level_to_offset_bits(level);
5137
5138         start_pfn = iova >> VTD_PAGE_SHIFT;
5139         last_pfn = (iova + size - 1) >> VTD_PAGE_SHIFT;
5140
5141         freelist = domain_unmap(dmar_domain, start_pfn, last_pfn);
5142
5143         npages = last_pfn - start_pfn + 1;
5144
5145         for_each_domain_iommu(iommu_id, dmar_domain)
5146                 iommu_flush_iotlb_psi(g_iommus[iommu_id], dmar_domain,
5147                                       start_pfn, npages, !freelist, 0);
5148
5149         dma_free_pagelist(freelist);
5150
5151         if (dmar_domain->max_addr == iova + size)
5152                 dmar_domain->max_addr = iova;
5153
5154         return size;
5155 }
5156
5157 static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
5158                                             dma_addr_t iova)
5159 {
5160         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5161         struct dma_pte *pte;
5162         int level = 0;
5163         u64 phys = 0;
5164
5165         pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level);
5166         if (pte)
5167                 phys = dma_pte_addr(pte);
5168
5169         return phys;
5170 }
5171
5172 static bool intel_iommu_capable(enum iommu_cap cap)
5173 {
5174         if (cap == IOMMU_CAP_CACHE_COHERENCY)
5175                 return domain_update_iommu_snooping(NULL) == 1;
5176         if (cap == IOMMU_CAP_INTR_REMAP)
5177                 return irq_remapping_enabled == 1;
5178
5179         return false;
5180 }
5181
5182 static int intel_iommu_add_device(struct device *dev)
5183 {
5184         struct intel_iommu *iommu;
5185         struct iommu_group *group;
5186         u8 bus, devfn;
5187
5188         iommu = device_to_iommu(dev, &bus, &devfn);
5189         if (!iommu)
5190                 return -ENODEV;
5191
5192         iommu_device_link(&iommu->iommu, dev);
5193
5194         group = iommu_group_get_for_dev(dev);
5195         if (IS_ERR(group)) {
5196                 iommu_device_unlink(&iommu->iommu, dev);
5197                 return PTR_ERR(group);
5198         }
5199         iommu_group_put(group);
5200         return 0;
5201 }
5202
5203 static void intel_iommu_remove_device(struct device *dev)
5204 {
5205         struct intel_iommu *iommu;
5206         u8 bus, devfn;
5207
5208         iommu = device_to_iommu(dev, &bus, &devfn);
5209         if (!iommu)
5210                 return;
5211
5212         iommu_group_remove_device(dev);
5213
5214         iommu_device_unlink(&iommu->iommu, dev);
5215 }
5216
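/*
 * Report the reserved regions for a device: any RMRRs whose device scope
 * includes it, plus the IOAPIC/MSI range common to all devices.
 */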
5217 static void intel_iommu_get_resv_regions(struct device *device,
5218                                          struct list_head *head)
5219 {
5220         struct iommu_resv_region *reg;
5221         struct dmar_rmrr_unit *rmrr;
5222         struct device *i_dev;
5223         int i;
5224
5225         rcu_read_lock();
5226         for_each_rmrr_units(rmrr) {
5227                 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
5228                                           i, i_dev) {
5229                         if (i_dev != device)
5230                                 continue;
5231
5232                         list_add_tail(&rmrr->resv->list, head);
5233                 }
5234         }
5235         rcu_read_unlock();
5236
5237         reg = iommu_alloc_resv_region(IOAPIC_RANGE_START,
5238                                       IOAPIC_RANGE_END - IOAPIC_RANGE_START + 1,
5239                                       0, IOMMU_RESV_MSI);
5240         if (!reg)
5241                 return;
5242         list_add_tail(&reg->list, head);
5243 }
5244
5245 static void intel_iommu_put_resv_regions(struct device *dev,
5246                                          struct list_head *head)
5247 {
5248         struct iommu_resv_region *entry, *next;
5249
5250         list_for_each_entry_safe(entry, next, head, list) {
5251                 if (entry->type == IOMMU_RESV_RESERVED)
5252                         kfree(entry);
5253         }
5254 }
5255
5256 #ifdef CONFIG_INTEL_IOMMU_SVM
5257 #define MAX_NR_PASID_BITS (20)
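/*
 * Compute the PASID table size (PTS) field for the extended context
 * entry from the device's maximum PASID; the field encodes the number
 * of table entries as a power of two offset by 5, hence the subtraction
 * below.
 */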
5258 static inline unsigned long intel_iommu_get_pts(struct device *dev)
5259 {
5260         int pts, max_pasid;
5261
5262         max_pasid = intel_pasid_get_dev_max_id(dev);
5263         pts = find_first_bit((unsigned long *)&max_pasid, MAX_NR_PASID_BITS);
5264         if (pts < 5)
5265                 return 0;
5266
5267         return pts - 5;
5268 }
5269
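/*
 * Enable PASID support (and the device IOTLB where available) in the
 * context entry for @sdev's device, so that requests-with-PASID from
 * the device can be translated.
 */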
5270 int intel_iommu_enable_pasid(struct intel_iommu *iommu, struct intel_svm_dev *sdev)
5271 {
5272         struct device_domain_info *info;
5273         struct context_entry *context;
5274         struct dmar_domain *domain;
5275         unsigned long flags;
5276         u64 ctx_lo;
5277         int ret;
5278
5279         domain = get_valid_domain_for_dev(sdev->dev);
5280         if (!domain)
5281                 return -EINVAL;
5282
5283         spin_lock_irqsave(&device_domain_lock, flags);
5284         spin_lock(&iommu->lock);
5285
5286         ret = -EINVAL;
5287         info = sdev->dev->archdata.iommu;
5288         if (!info || !info->pasid_supported)
5289                 goto out;
5290
5291         context = iommu_context_addr(iommu, info->bus, info->devfn, 0);
5292         if (WARN_ON(!context))
5293                 goto out;
5294
5295         ctx_lo = context[0].lo;
5296
5297         sdev->did = domain->iommu_did[iommu->seq_id];
5298         sdev->sid = PCI_DEVID(info->bus, info->devfn);
5299
5300         if (!(ctx_lo & CONTEXT_PASIDE)) {
5301                 if (iommu->pasid_state_table)
5302                         context[1].hi = (u64)virt_to_phys(iommu->pasid_state_table);
5303                 context[1].lo = (u64)virt_to_phys(info->pasid_table->table) |
5304                         intel_iommu_get_pts(sdev->dev);
5305
5306                 wmb();
5307                 /* CONTEXT_TT_MULTI_LEVEL and CONTEXT_TT_DEV_IOTLB are both
5308                  * extended to permit requests-with-PASID if the PASIDE bit
5309                  * is set, which makes sense. For CONTEXT_TT_PASS_THROUGH,
5310                  * however, the PASIDE bit is ignored and requests-with-PASID
5311                  * are unconditionally blocked, which makes less sense.
5312                  * So convert from CONTEXT_TT_PASS_THROUGH to one of the new
5313                  * "guest mode" translation types, depending on whether ATS
5314                  * is available or not. Annoyingly, we can't use the new
5315                  * modes *unless* PASIDE is set. */
5316                 if ((ctx_lo & CONTEXT_TT_MASK) == (CONTEXT_TT_PASS_THROUGH << 2)) {
5317                         ctx_lo &= ~CONTEXT_TT_MASK;
5318                         if (info->ats_supported)
5319                                 ctx_lo |= CONTEXT_TT_PT_PASID_DEV_IOTLB << 2;
5320                         else
5321                                 ctx_lo |= CONTEXT_TT_PT_PASID << 2;
5322                 }
5323                 ctx_lo |= CONTEXT_PASIDE;
5324                 if (iommu->pasid_state_table)
5325                         ctx_lo |= CONTEXT_DINVE;
5326                 if (info->pri_supported)
5327                         ctx_lo |= CONTEXT_PRS;
5328                 context[0].lo = ctx_lo;
5329                 wmb();
5330                 iommu->flush.flush_context(iommu, sdev->did, sdev->sid,
5331                                            DMA_CCMD_MASK_NOBIT,
5332                                            DMA_CCMD_DEVICE_INVL);
5333         }
5334
5335         /* Enable PASID support in the device, if it wasn't already */
5336         if (!info->pasid_enabled)
5337                 iommu_enable_dev_iotlb(info);
5338
5339         if (info->ats_enabled) {
5340                 sdev->dev_iotlb = 1;
5341                 sdev->qdep = info->ats_qdep;
5342                 if (sdev->qdep >= QI_DEV_EIOTLB_MAX_INVS)
5343                         sdev->qdep = 0;
5344         }
5345         ret = 0;
5346
5347  out:
5348         spin_unlock(&iommu->lock);
5349         spin_unlock_irqrestore(&device_domain_lock, flags);
5350
5351         return ret;
5352 }
5353
5354 struct intel_iommu *intel_svm_device_to_iommu(struct device *dev)
5355 {
5356         struct intel_iommu *iommu;
5357         u8 bus, devfn;
5358
5359         if (iommu_dummy(dev)) {
5360                 dev_warn(dev,
5361                          "No IOMMU translation for device; cannot enable SVM\n");
5362                 return NULL;
5363         }
5364
5365         iommu = device_to_iommu(dev, &bus, &devfn);
5366         if (!iommu) {
5367                 dev_err(dev, "No IOMMU for device; cannot enable SVM\n");
5368                 return NULL;
5369         }
5370
5371         return iommu;
5372 }
5373 #endif /* CONFIG_INTEL_IOMMU_SVM */
5374
5375 const struct iommu_ops intel_iommu_ops = {
5376         .capable                = intel_iommu_capable,
5377         .domain_alloc           = intel_iommu_domain_alloc,
5378         .domain_free            = intel_iommu_domain_free,
5379         .attach_dev             = intel_iommu_attach_device,
5380         .detach_dev             = intel_iommu_detach_device,
5381         .map                    = intel_iommu_map,
5382         .unmap                  = intel_iommu_unmap,
5383         .iova_to_phys           = intel_iommu_iova_to_phys,
5384         .add_device             = intel_iommu_add_device,
5385         .remove_device          = intel_iommu_remove_device,
5386         .get_resv_regions       = intel_iommu_get_resv_regions,
5387         .put_resv_regions       = intel_iommu_put_resv_regions,
5388         .device_group           = pci_device_group,
5389         .pgsize_bitmap          = INTEL_IOMMU_PGSIZES,
5390 };
5391
5392 static void quirk_iommu_g4x_gfx(struct pci_dev *dev)
5393 {
5394         /* G4x/GM45 integrated gfx dmar support is totally busted. */
5395         pr_info("Disabling IOMMU for graphics on this chipset\n");
5396         dmar_map_gfx = 0;
5397 }
5398
5399 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_g4x_gfx);
5400 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_g4x_gfx);
5401 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_g4x_gfx);
5402 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_g4x_gfx);
5403 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_g4x_gfx);
5404 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_g4x_gfx);
5405 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_g4x_gfx);
5406
5407 static void quirk_iommu_rwbf(struct pci_dev *dev)
5408 {
5409         /*
5410          * Mobile 4 Series Chipset neglects to set RWBF capability,
5411          * but needs it. Same seems to hold for the desktop versions.
5412          */
5413         pr_info("Forcing write-buffer flush capability\n");
5414         rwbf_quirk = 1;
5415 }
5416
5417 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_rwbf);
5418 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_rwbf);
5419 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_rwbf);
5420 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_rwbf);
5421 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_rwbf);
5422 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_rwbf);
5423 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_rwbf);
5424
5425 #define GGC 0x52
5426 #define GGC_MEMORY_SIZE_MASK    (0xf << 8)
5427 #define GGC_MEMORY_SIZE_NONE    (0x0 << 8)
5428 #define GGC_MEMORY_SIZE_1M      (0x1 << 8)
5429 #define GGC_MEMORY_SIZE_2M      (0x3 << 8)
5430 #define GGC_MEMORY_VT_ENABLED   (0x8 << 8)
5431 #define GGC_MEMORY_SIZE_2M_VT   (0x9 << 8)
5432 #define GGC_MEMORY_SIZE_3M_VT   (0xa << 8)
5433 #define GGC_MEMORY_SIZE_4M_VT   (0xb << 8)
5434
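/*
 * If the BIOS has not set the "VT enabled" size in the GGC register, the
 * graphics device has no shadow GTT, so keep the IOMMU away from gfx;
 * otherwise force strict (unbatched) IOTLB flushing on these Ironlake
 * chipsets.
 */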
5435 static void quirk_calpella_no_shadow_gtt(struct pci_dev *dev)
5436 {
5437         unsigned short ggc;
5438
5439         if (pci_read_config_word(dev, GGC, &ggc))
5440                 return;
5441
5442         if (!(ggc & GGC_MEMORY_VT_ENABLED)) {
5443                 pr_info("BIOS has allocated no shadow GTT; disabling IOMMU for graphics\n");
5444                 dmar_map_gfx = 0;
5445         } else if (dmar_map_gfx) {
5446                 /* we have to ensure the gfx device is idle before we flush */
5447                 pr_info("Disabling batched IOTLB flush on Ironlake\n");
5448                 intel_iommu_strict = 1;
5449         }
5450 }
5451 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0040, quirk_calpella_no_shadow_gtt);
5452 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0044, quirk_calpella_no_shadow_gtt);
5453 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0062, quirk_calpella_no_shadow_gtt);
5454 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x006a, quirk_calpella_no_shadow_gtt);
5455
5456 /* On Tylersburg chipsets, some BIOSes have been known to enable the
5457    ISOCH DMAR unit for the Azalia sound device, but not give it any
5458    TLB entries, which causes it to deadlock. Check for that.  We do
5459    this in a function called from init_dmars(), instead of in a PCI
5460    quirk, because we don't want to print the obnoxious "BIOS broken"
5461    message if VT-d is actually disabled.
5462 */
5463 static void __init check_tylersburg_isoch(void)
5464 {
5465         struct pci_dev *pdev;
5466         uint32_t vtisochctrl;
5467
5468         /* If there's no Azalia in the system anyway, forget it. */
5469         pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x3a3e, NULL);
5470         if (!pdev)
5471                 return;
5472         pci_dev_put(pdev);
5473
5474         /* System Management Registers. Might be hidden, in which case
5475            we can't do the sanity check. But that's OK, because the
5476            known-broken BIOSes _don't_ actually hide it, so far. */
5477         pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x342e, NULL);
5478         if (!pdev)
5479                 return;
5480
5481         if (pci_read_config_dword(pdev, 0x188, &vtisochctrl)) {
5482                 pci_dev_put(pdev);
5483                 return;
5484         }
5485
5486         pci_dev_put(pdev);
5487
5488         /* If Azalia DMA is routed to the non-isoch DMAR unit, fine. */
5489         if (vtisochctrl & 1)
5490                 return;
5491
5492         /* Drop all bits other than the number of TLB entries */
5493         vtisochctrl &= 0x1c;
5494
5495         /* If we have the recommended number of TLB entries (16), fine. */
5496         if (vtisochctrl == 0x10)
5497                 return;
5498
5499         /* Zero TLB entries? You get to ride the short bus to school. */
5500         if (!vtisochctrl) {
5501                 WARN(1, "Your BIOS is broken; DMA routed to ISOCH DMAR unit but no TLB space.\n"
5502                      "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
5503                      dmi_get_system_info(DMI_BIOS_VENDOR),
5504                      dmi_get_system_info(DMI_BIOS_VERSION),
5505                      dmi_get_system_info(DMI_PRODUCT_VERSION));
5506                 iommu_identity_mapping |= IDENTMAP_AZALIA;
5507                 return;
5508         }
5509
5510         pr_warn("Recommended TLB entries for ISOCH unit is 16; your BIOS set %d\n",
5511                vtisochctrl);
5512 }