OSDN Git Service

smb3: Add defines for new information level, FileIdInformation
[tomoyo/tomoyo-test1.git] / drivers / iommu / intel-iommu.c
1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * Copyright © 2006-2014 Intel Corporation.
4  *
5  * Authors: David Woodhouse <dwmw2@infradead.org>,
6  *          Ashok Raj <ashok.raj@intel.com>,
7  *          Shaohua Li <shaohua.li@intel.com>,
8  *          Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>,
9  *          Fenghua Yu <fenghua.yu@intel.com>
10  *          Joerg Roedel <jroedel@suse.de>
11  */
12
13 #define pr_fmt(fmt)     "DMAR: " fmt
14 #define dev_fmt(fmt)    pr_fmt(fmt)
15
16 #include <linux/init.h>
17 #include <linux/bitmap.h>
18 #include <linux/debugfs.h>
19 #include <linux/export.h>
20 #include <linux/slab.h>
21 #include <linux/irq.h>
22 #include <linux/interrupt.h>
23 #include <linux/spinlock.h>
24 #include <linux/pci.h>
25 #include <linux/dmar.h>
26 #include <linux/dma-mapping.h>
27 #include <linux/mempool.h>
28 #include <linux/memory.h>
29 #include <linux/cpu.h>
30 #include <linux/timer.h>
31 #include <linux/io.h>
32 #include <linux/iova.h>
33 #include <linux/iommu.h>
34 #include <linux/intel-iommu.h>
35 #include <linux/syscore_ops.h>
36 #include <linux/tboot.h>
37 #include <linux/dmi.h>
38 #include <linux/pci-ats.h>
39 #include <linux/memblock.h>
40 #include <linux/dma-contiguous.h>
41 #include <linux/dma-direct.h>
42 #include <linux/crash_dump.h>
43 #include <linux/numa.h>
44 #include <linux/swiotlb.h>
45 #include <asm/irq_remapping.h>
46 #include <asm/cacheflush.h>
47 #include <asm/iommu.h>
48 #include <trace/events/intel_iommu.h>
49
50 #include "irq_remapping.h"
51 #include "intel-pasid.h"
52
53 #define ROOT_SIZE               VTD_PAGE_SIZE
54 #define CONTEXT_SIZE            VTD_PAGE_SIZE
55
56 #define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
57 #define IS_USB_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_SERIAL_USB)
58 #define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
59 #define IS_AZALIA(pdev) ((pdev)->vendor == 0x8086 && (pdev)->device == 0x3a3e)
60
61 #define IOAPIC_RANGE_START      (0xfee00000)
62 #define IOAPIC_RANGE_END        (0xfeefffff)
63 #define IOVA_START_ADDR         (0x1000)
64
65 #define DEFAULT_DOMAIN_ADDRESS_WIDTH 57
66
67 #define MAX_AGAW_WIDTH 64
68 #define MAX_AGAW_PFN_WIDTH      (MAX_AGAW_WIDTH - VTD_PAGE_SHIFT)
69
70 #define __DOMAIN_MAX_PFN(gaw)  ((((uint64_t)1) << (gaw-VTD_PAGE_SHIFT)) - 1)
71 #define __DOMAIN_MAX_ADDR(gaw) ((((uint64_t)1) << gaw) - 1)
72
73 /* We limit DOMAIN_MAX_PFN to fit in an unsigned long, and DOMAIN_MAX_ADDR
74    to match. That way, we can use 'unsigned long' for PFNs with impunity. */
75 #define DOMAIN_MAX_PFN(gaw)     ((unsigned long) min_t(uint64_t, \
76                                 __DOMAIN_MAX_PFN(gaw), (unsigned long)-1))
77 #define DOMAIN_MAX_ADDR(gaw)    (((uint64_t)__DOMAIN_MAX_PFN(gaw)) << VTD_PAGE_SHIFT)
78
79 /* IO virtual address start page frame number */
80 #define IOVA_START_PFN          (1)
81
82 #define IOVA_PFN(addr)          ((addr) >> PAGE_SHIFT)
83
84 /* page table handling */
85 #define LEVEL_STRIDE            (9)
86 #define LEVEL_MASK              (((u64)1 << LEVEL_STRIDE) - 1)
87
88 /*
89  * This bitmap is used to advertise the page sizes our hardware support
90  * to the IOMMU core, which will then use this information to split
91  * physically contiguous memory regions it is mapping into page sizes
92  * that we support.
93  *
94  * Traditionally the IOMMU core just handed us the mappings directly,
95  * after making sure the size is an order of a 4KiB page and that the
96  * mapping has natural alignment.
97  *
98  * To retain this behavior, we currently advertise that we support
99  * all page sizes that are an order of 4KiB.
100  *
101  * If at some point we'd like to utilize the IOMMU core's new behavior,
102  * we could change this to advertise the real page sizes we support.
103  */
104 #define INTEL_IOMMU_PGSIZES     (~0xFFFUL)
105
106 static inline int agaw_to_level(int agaw)
107 {
108         return agaw + 2;
109 }
110
111 static inline int agaw_to_width(int agaw)
112 {
113         return min_t(int, 30 + agaw * LEVEL_STRIDE, MAX_AGAW_WIDTH);
114 }
115
116 static inline int width_to_agaw(int width)
117 {
118         return DIV_ROUND_UP(width - 30, LEVEL_STRIDE);
119 }
120
121 static inline unsigned int level_to_offset_bits(int level)
122 {
123         return (level - 1) * LEVEL_STRIDE;
124 }
125
126 static inline int pfn_level_offset(unsigned long pfn, int level)
127 {
128         return (pfn >> level_to_offset_bits(level)) & LEVEL_MASK;
129 }
130
131 static inline unsigned long level_mask(int level)
132 {
133         return -1UL << level_to_offset_bits(level);
134 }
135
136 static inline unsigned long level_size(int level)
137 {
138         return 1UL << level_to_offset_bits(level);
139 }
140
141 static inline unsigned long align_to_level(unsigned long pfn, int level)
142 {
143         return (pfn + level_size(level) - 1) & level_mask(level);
144 }
145
146 static inline unsigned long lvl_to_nr_pages(unsigned int lvl)
147 {
148         return  1 << min_t(int, (lvl - 1) * LEVEL_STRIDE, MAX_AGAW_PFN_WIDTH);
149 }
150
151 /* VT-d pages must always be _smaller_ than MM pages. Otherwise things
152    are never going to work. */
153 static inline unsigned long dma_to_mm_pfn(unsigned long dma_pfn)
154 {
155         return dma_pfn >> (PAGE_SHIFT - VTD_PAGE_SHIFT);
156 }
157
158 static inline unsigned long mm_to_dma_pfn(unsigned long mm_pfn)
159 {
160         return mm_pfn << (PAGE_SHIFT - VTD_PAGE_SHIFT);
161 }
162 static inline unsigned long page_to_dma_pfn(struct page *pg)
163 {
164         return mm_to_dma_pfn(page_to_pfn(pg));
165 }
166 static inline unsigned long virt_to_dma_pfn(void *p)
167 {
168         return page_to_dma_pfn(virt_to_page(p));
169 }
170
171 /* global iommu list, set NULL for ignored DMAR units */
172 static struct intel_iommu **g_iommus;
173
174 static void __init check_tylersburg_isoch(void);
175 static int rwbf_quirk;
176
177 /*
178  * set to 1 to panic kernel if can't successfully enable VT-d
179  * (used when kernel is launched w/ TXT)
180  */
181 static int force_on = 0;
182 int intel_iommu_tboot_noforce;
183 static int no_platform_optin;
184
185 #define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))
186
187 /*
188  * Take a root_entry and return the Lower Context Table Pointer (LCTP)
189  * if marked present.
190  */
191 static phys_addr_t root_entry_lctp(struct root_entry *re)
192 {
193         if (!(re->lo & 1))
194                 return 0;
195
196         return re->lo & VTD_PAGE_MASK;
197 }
198
199 /*
200  * Take a root_entry and return the Upper Context Table Pointer (UCTP)
201  * if marked present.
202  */
203 static phys_addr_t root_entry_uctp(struct root_entry *re)
204 {
205         if (!(re->hi & 1))
206                 return 0;
207
208         return re->hi & VTD_PAGE_MASK;
209 }
210
211 static inline void context_clear_pasid_enable(struct context_entry *context)
212 {
213         context->lo &= ~(1ULL << 11);
214 }
215
216 static inline bool context_pasid_enabled(struct context_entry *context)
217 {
218         return !!(context->lo & (1ULL << 11));
219 }
220
221 static inline void context_set_copied(struct context_entry *context)
222 {
223         context->hi |= (1ull << 3);
224 }
225
226 static inline bool context_copied(struct context_entry *context)
227 {
228         return !!(context->hi & (1ULL << 3));
229 }
230
231 static inline bool __context_present(struct context_entry *context)
232 {
233         return (context->lo & 1);
234 }
235
236 bool context_present(struct context_entry *context)
237 {
238         return context_pasid_enabled(context) ?
239              __context_present(context) :
240              __context_present(context) && !context_copied(context);
241 }
242
243 static inline void context_set_present(struct context_entry *context)
244 {
245         context->lo |= 1;
246 }
247
248 static inline void context_set_fault_enable(struct context_entry *context)
249 {
250         context->lo &= (((u64)-1) << 2) | 1;
251 }
252
253 static inline void context_set_translation_type(struct context_entry *context,
254                                                 unsigned long value)
255 {
256         context->lo &= (((u64)-1) << 4) | 3;
257         context->lo |= (value & 3) << 2;
258 }
259
260 static inline void context_set_address_root(struct context_entry *context,
261                                             unsigned long value)
262 {
263         context->lo &= ~VTD_PAGE_MASK;
264         context->lo |= value & VTD_PAGE_MASK;
265 }
266
267 static inline void context_set_address_width(struct context_entry *context,
268                                              unsigned long value)
269 {
270         context->hi |= value & 7;
271 }
272
273 static inline void context_set_domain_id(struct context_entry *context,
274                                          unsigned long value)
275 {
276         context->hi |= (value & ((1 << 16) - 1)) << 8;
277 }
278
279 static inline int context_domain_id(struct context_entry *c)
280 {
281         return((c->hi >> 8) & 0xffff);
282 }
283
284 static inline void context_clear_entry(struct context_entry *context)
285 {
286         context->lo = 0;
287         context->hi = 0;
288 }
289
290 /*
291  * This domain is a statically identity mapping domain.
292  *      1. This domain creats a static 1:1 mapping to all usable memory.
293  *      2. It maps to each iommu if successful.
294  *      3. Each iommu mapps to this domain if successful.
295  */
296 static struct dmar_domain *si_domain;
297 static int hw_pass_through = 1;
298
299 /* si_domain contains mulitple devices */
300 #define DOMAIN_FLAG_STATIC_IDENTITY             BIT(0)
301
302 /*
303  * This is a DMA domain allocated through the iommu domain allocation
304  * interface. But one or more devices belonging to this domain have
305  * been chosen to use a private domain. We should avoid to use the
306  * map/unmap/iova_to_phys APIs on it.
307  */
308 #define DOMAIN_FLAG_LOSE_CHILDREN               BIT(1)
309
310 #define for_each_domain_iommu(idx, domain)                      \
311         for (idx = 0; idx < g_num_of_iommus; idx++)             \
312                 if (domain->iommu_refcnt[idx])
313
314 struct dmar_rmrr_unit {
315         struct list_head list;          /* list of rmrr units   */
316         struct acpi_dmar_header *hdr;   /* ACPI header          */
317         u64     base_address;           /* reserved base address*/
318         u64     end_address;            /* reserved end address */
319         struct dmar_dev_scope *devices; /* target devices */
320         int     devices_cnt;            /* target device count */
321 };
322
323 struct dmar_atsr_unit {
324         struct list_head list;          /* list of ATSR units */
325         struct acpi_dmar_header *hdr;   /* ACPI header */
326         struct dmar_dev_scope *devices; /* target devices */
327         int devices_cnt;                /* target device count */
328         u8 include_all:1;               /* include all ports */
329 };
330
331 static LIST_HEAD(dmar_atsr_units);
332 static LIST_HEAD(dmar_rmrr_units);
333
334 #define for_each_rmrr_units(rmrr) \
335         list_for_each_entry(rmrr, &dmar_rmrr_units, list)
336
337 /* bitmap for indexing intel_iommus */
338 static int g_num_of_iommus;
339
340 static void domain_exit(struct dmar_domain *domain);
341 static void domain_remove_dev_info(struct dmar_domain *domain);
342 static void dmar_remove_one_dev_info(struct device *dev);
343 static void __dmar_remove_one_dev_info(struct device_domain_info *info);
344 static void domain_context_clear(struct intel_iommu *iommu,
345                                  struct device *dev);
346 static int domain_detach_iommu(struct dmar_domain *domain,
347                                struct intel_iommu *iommu);
348 static bool device_is_rmrr_locked(struct device *dev);
349 static int intel_iommu_attach_device(struct iommu_domain *domain,
350                                      struct device *dev);
351 static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
352                                             dma_addr_t iova);
353
354 #ifdef CONFIG_INTEL_IOMMU_DEFAULT_ON
355 int dmar_disabled = 0;
356 #else
357 int dmar_disabled = 1;
358 #endif /*CONFIG_INTEL_IOMMU_DEFAULT_ON*/
359
360 int intel_iommu_sm;
361 int intel_iommu_enabled = 0;
362 EXPORT_SYMBOL_GPL(intel_iommu_enabled);
363
364 static int dmar_map_gfx = 1;
365 static int dmar_forcedac;
366 static int intel_iommu_strict;
367 static int intel_iommu_superpage = 1;
368 static int iommu_identity_mapping;
369 static int intel_no_bounce;
370
371 #define IDENTMAP_ALL            1
372 #define IDENTMAP_GFX            2
373 #define IDENTMAP_AZALIA         4
374
375 int intel_iommu_gfx_mapped;
376 EXPORT_SYMBOL_GPL(intel_iommu_gfx_mapped);
377
378 #define DUMMY_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-1))
379 #define DEFER_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-2))
380 static DEFINE_SPINLOCK(device_domain_lock);
381 static LIST_HEAD(device_domain_list);
382
383 #define device_needs_bounce(d) (!intel_no_bounce && dev_is_pci(d) &&    \
384                                 to_pci_dev(d)->untrusted)
385
386 /*
387  * Iterate over elements in device_domain_list and call the specified
388  * callback @fn against each element.
389  */
390 int for_each_device_domain(int (*fn)(struct device_domain_info *info,
391                                      void *data), void *data)
392 {
393         int ret = 0;
394         unsigned long flags;
395         struct device_domain_info *info;
396
397         spin_lock_irqsave(&device_domain_lock, flags);
398         list_for_each_entry(info, &device_domain_list, global) {
399                 ret = fn(info, data);
400                 if (ret) {
401                         spin_unlock_irqrestore(&device_domain_lock, flags);
402                         return ret;
403                 }
404         }
405         spin_unlock_irqrestore(&device_domain_lock, flags);
406
407         return 0;
408 }
409
410 const struct iommu_ops intel_iommu_ops;
411
412 static bool translation_pre_enabled(struct intel_iommu *iommu)
413 {
414         return (iommu->flags & VTD_FLAG_TRANS_PRE_ENABLED);
415 }
416
417 static void clear_translation_pre_enabled(struct intel_iommu *iommu)
418 {
419         iommu->flags &= ~VTD_FLAG_TRANS_PRE_ENABLED;
420 }
421
422 static void init_translation_status(struct intel_iommu *iommu)
423 {
424         u32 gsts;
425
426         gsts = readl(iommu->reg + DMAR_GSTS_REG);
427         if (gsts & DMA_GSTS_TES)
428                 iommu->flags |= VTD_FLAG_TRANS_PRE_ENABLED;
429 }
430
431 /* Convert generic 'struct iommu_domain to private struct dmar_domain */
432 static struct dmar_domain *to_dmar_domain(struct iommu_domain *dom)
433 {
434         return container_of(dom, struct dmar_domain, domain);
435 }
436
437 static int __init intel_iommu_setup(char *str)
438 {
439         if (!str)
440                 return -EINVAL;
441         while (*str) {
442                 if (!strncmp(str, "on", 2)) {
443                         dmar_disabled = 0;
444                         pr_info("IOMMU enabled\n");
445                 } else if (!strncmp(str, "off", 3)) {
446                         dmar_disabled = 1;
447                         no_platform_optin = 1;
448                         pr_info("IOMMU disabled\n");
449                 } else if (!strncmp(str, "igfx_off", 8)) {
450                         dmar_map_gfx = 0;
451                         pr_info("Disable GFX device mapping\n");
452                 } else if (!strncmp(str, "forcedac", 8)) {
453                         pr_info("Forcing DAC for PCI devices\n");
454                         dmar_forcedac = 1;
455                 } else if (!strncmp(str, "strict", 6)) {
456                         pr_info("Disable batched IOTLB flush\n");
457                         intel_iommu_strict = 1;
458                 } else if (!strncmp(str, "sp_off", 6)) {
459                         pr_info("Disable supported super page\n");
460                         intel_iommu_superpage = 0;
461                 } else if (!strncmp(str, "sm_on", 5)) {
462                         pr_info("Intel-IOMMU: scalable mode supported\n");
463                         intel_iommu_sm = 1;
464                 } else if (!strncmp(str, "tboot_noforce", 13)) {
465                         printk(KERN_INFO
466                                 "Intel-IOMMU: not forcing on after tboot. This could expose security risk for tboot\n");
467                         intel_iommu_tboot_noforce = 1;
468                 } else if (!strncmp(str, "nobounce", 8)) {
469                         pr_info("Intel-IOMMU: No bounce buffer. This could expose security risks of DMA attacks\n");
470                         intel_no_bounce = 1;
471                 }
472
473                 str += strcspn(str, ",");
474                 while (*str == ',')
475                         str++;
476         }
477         return 0;
478 }
479 __setup("intel_iommu=", intel_iommu_setup);
480
481 static struct kmem_cache *iommu_domain_cache;
482 static struct kmem_cache *iommu_devinfo_cache;
483
484 static struct dmar_domain* get_iommu_domain(struct intel_iommu *iommu, u16 did)
485 {
486         struct dmar_domain **domains;
487         int idx = did >> 8;
488
489         domains = iommu->domains[idx];
490         if (!domains)
491                 return NULL;
492
493         return domains[did & 0xff];
494 }
495
496 static void set_iommu_domain(struct intel_iommu *iommu, u16 did,
497                              struct dmar_domain *domain)
498 {
499         struct dmar_domain **domains;
500         int idx = did >> 8;
501
502         if (!iommu->domains[idx]) {
503                 size_t size = 256 * sizeof(struct dmar_domain *);
504                 iommu->domains[idx] = kzalloc(size, GFP_ATOMIC);
505         }
506
507         domains = iommu->domains[idx];
508         if (WARN_ON(!domains))
509                 return;
510         else
511                 domains[did & 0xff] = domain;
512 }
513
514 void *alloc_pgtable_page(int node)
515 {
516         struct page *page;
517         void *vaddr = NULL;
518
519         page = alloc_pages_node(node, GFP_ATOMIC | __GFP_ZERO, 0);
520         if (page)
521                 vaddr = page_address(page);
522         return vaddr;
523 }
524
525 void free_pgtable_page(void *vaddr)
526 {
527         free_page((unsigned long)vaddr);
528 }
529
530 static inline void *alloc_domain_mem(void)
531 {
532         return kmem_cache_alloc(iommu_domain_cache, GFP_ATOMIC);
533 }
534
535 static void free_domain_mem(void *vaddr)
536 {
537         kmem_cache_free(iommu_domain_cache, vaddr);
538 }
539
540 static inline void * alloc_devinfo_mem(void)
541 {
542         return kmem_cache_alloc(iommu_devinfo_cache, GFP_ATOMIC);
543 }
544
545 static inline void free_devinfo_mem(void *vaddr)
546 {
547         kmem_cache_free(iommu_devinfo_cache, vaddr);
548 }
549
550 static inline int domain_type_is_si(struct dmar_domain *domain)
551 {
552         return domain->flags & DOMAIN_FLAG_STATIC_IDENTITY;
553 }
554
555 static inline int domain_pfn_supported(struct dmar_domain *domain,
556                                        unsigned long pfn)
557 {
558         int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
559
560         return !(addr_width < BITS_PER_LONG && pfn >> addr_width);
561 }
562
563 static int __iommu_calculate_agaw(struct intel_iommu *iommu, int max_gaw)
564 {
565         unsigned long sagaw;
566         int agaw = -1;
567
568         sagaw = cap_sagaw(iommu->cap);
569         for (agaw = width_to_agaw(max_gaw);
570              agaw >= 0; agaw--) {
571                 if (test_bit(agaw, &sagaw))
572                         break;
573         }
574
575         return agaw;
576 }
577
578 /*
579  * Calculate max SAGAW for each iommu.
580  */
581 int iommu_calculate_max_sagaw(struct intel_iommu *iommu)
582 {
583         return __iommu_calculate_agaw(iommu, MAX_AGAW_WIDTH);
584 }
585
586 /*
587  * calculate agaw for each iommu.
588  * "SAGAW" may be different across iommus, use a default agaw, and
589  * get a supported less agaw for iommus that don't support the default agaw.
590  */
591 int iommu_calculate_agaw(struct intel_iommu *iommu)
592 {
593         return __iommu_calculate_agaw(iommu, DEFAULT_DOMAIN_ADDRESS_WIDTH);
594 }
595
596 /* This functionin only returns single iommu in a domain */
597 struct intel_iommu *domain_get_iommu(struct dmar_domain *domain)
598 {
599         int iommu_id;
600
601         /* si_domain and vm domain should not get here. */
602         if (WARN_ON(domain->domain.type != IOMMU_DOMAIN_DMA))
603                 return NULL;
604
605         for_each_domain_iommu(iommu_id, domain)
606                 break;
607
608         if (iommu_id < 0 || iommu_id >= g_num_of_iommus)
609                 return NULL;
610
611         return g_iommus[iommu_id];
612 }
613
614 static void domain_update_iommu_coherency(struct dmar_domain *domain)
615 {
616         struct dmar_drhd_unit *drhd;
617         struct intel_iommu *iommu;
618         bool found = false;
619         int i;
620
621         domain->iommu_coherency = 1;
622
623         for_each_domain_iommu(i, domain) {
624                 found = true;
625                 if (!ecap_coherent(g_iommus[i]->ecap)) {
626                         domain->iommu_coherency = 0;
627                         break;
628                 }
629         }
630         if (found)
631                 return;
632
633         /* No hardware attached; use lowest common denominator */
634         rcu_read_lock();
635         for_each_active_iommu(iommu, drhd) {
636                 if (!ecap_coherent(iommu->ecap)) {
637                         domain->iommu_coherency = 0;
638                         break;
639                 }
640         }
641         rcu_read_unlock();
642 }
643
644 static int domain_update_iommu_snooping(struct intel_iommu *skip)
645 {
646         struct dmar_drhd_unit *drhd;
647         struct intel_iommu *iommu;
648         int ret = 1;
649
650         rcu_read_lock();
651         for_each_active_iommu(iommu, drhd) {
652                 if (iommu != skip) {
653                         if (!ecap_sc_support(iommu->ecap)) {
654                                 ret = 0;
655                                 break;
656                         }
657                 }
658         }
659         rcu_read_unlock();
660
661         return ret;
662 }
663
664 static int domain_update_iommu_superpage(struct intel_iommu *skip)
665 {
666         struct dmar_drhd_unit *drhd;
667         struct intel_iommu *iommu;
668         int mask = 0xf;
669
670         if (!intel_iommu_superpage) {
671                 return 0;
672         }
673
674         /* set iommu_superpage to the smallest common denominator */
675         rcu_read_lock();
676         for_each_active_iommu(iommu, drhd) {
677                 if (iommu != skip) {
678                         mask &= cap_super_page_val(iommu->cap);
679                         if (!mask)
680                                 break;
681                 }
682         }
683         rcu_read_unlock();
684
685         return fls(mask);
686 }
687
688 /* Some capabilities may be different across iommus */
689 static void domain_update_iommu_cap(struct dmar_domain *domain)
690 {
691         domain_update_iommu_coherency(domain);
692         domain->iommu_snooping = domain_update_iommu_snooping(NULL);
693         domain->iommu_superpage = domain_update_iommu_superpage(NULL);
694 }
695
696 struct context_entry *iommu_context_addr(struct intel_iommu *iommu, u8 bus,
697                                          u8 devfn, int alloc)
698 {
699         struct root_entry *root = &iommu->root_entry[bus];
700         struct context_entry *context;
701         u64 *entry;
702
703         entry = &root->lo;
704         if (sm_supported(iommu)) {
705                 if (devfn >= 0x80) {
706                         devfn -= 0x80;
707                         entry = &root->hi;
708                 }
709                 devfn *= 2;
710         }
711         if (*entry & 1)
712                 context = phys_to_virt(*entry & VTD_PAGE_MASK);
713         else {
714                 unsigned long phy_addr;
715                 if (!alloc)
716                         return NULL;
717
718                 context = alloc_pgtable_page(iommu->node);
719                 if (!context)
720                         return NULL;
721
722                 __iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE);
723                 phy_addr = virt_to_phys((void *)context);
724                 *entry = phy_addr | 1;
725                 __iommu_flush_cache(iommu, entry, sizeof(*entry));
726         }
727         return &context[devfn];
728 }
729
730 static int iommu_dummy(struct device *dev)
731 {
732         return dev->archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO;
733 }
734
735 /**
736  * is_downstream_to_pci_bridge - test if a device belongs to the PCI
737  *                               sub-hierarchy of a candidate PCI-PCI bridge
738  * @dev: candidate PCI device belonging to @bridge PCI sub-hierarchy
739  * @bridge: the candidate PCI-PCI bridge
740  *
741  * Return: true if @dev belongs to @bridge PCI sub-hierarchy, else false.
742  */
743 static bool
744 is_downstream_to_pci_bridge(struct device *dev, struct device *bridge)
745 {
746         struct pci_dev *pdev, *pbridge;
747
748         if (!dev_is_pci(dev) || !dev_is_pci(bridge))
749                 return false;
750
751         pdev = to_pci_dev(dev);
752         pbridge = to_pci_dev(bridge);
753
754         if (pbridge->subordinate &&
755             pbridge->subordinate->number <= pdev->bus->number &&
756             pbridge->subordinate->busn_res.end >= pdev->bus->number)
757                 return true;
758
759         return false;
760 }
761
762 static struct intel_iommu *device_to_iommu(struct device *dev, u8 *bus, u8 *devfn)
763 {
764         struct dmar_drhd_unit *drhd = NULL;
765         struct intel_iommu *iommu;
766         struct device *tmp;
767         struct pci_dev *pdev = NULL;
768         u16 segment = 0;
769         int i;
770
771         if (iommu_dummy(dev))
772                 return NULL;
773
774         if (dev_is_pci(dev)) {
775                 struct pci_dev *pf_pdev;
776
777                 pdev = pci_real_dma_dev(to_pci_dev(dev));
778
779                 /* VFs aren't listed in scope tables; we need to look up
780                  * the PF instead to find the IOMMU. */
781                 pf_pdev = pci_physfn(pdev);
782                 dev = &pf_pdev->dev;
783                 segment = pci_domain_nr(pdev->bus);
784         } else if (has_acpi_companion(dev))
785                 dev = &ACPI_COMPANION(dev)->dev;
786
787         rcu_read_lock();
788         for_each_active_iommu(iommu, drhd) {
789                 if (pdev && segment != drhd->segment)
790                         continue;
791
792                 for_each_active_dev_scope(drhd->devices,
793                                           drhd->devices_cnt, i, tmp) {
794                         if (tmp == dev) {
795                                 /* For a VF use its original BDF# not that of the PF
796                                  * which we used for the IOMMU lookup. Strictly speaking
797                                  * we could do this for all PCI devices; we only need to
798                                  * get the BDF# from the scope table for ACPI matches. */
799                                 if (pdev && pdev->is_virtfn)
800                                         goto got_pdev;
801
802                                 *bus = drhd->devices[i].bus;
803                                 *devfn = drhd->devices[i].devfn;
804                                 goto out;
805                         }
806
807                         if (is_downstream_to_pci_bridge(dev, tmp))
808                                 goto got_pdev;
809                 }
810
811                 if (pdev && drhd->include_all) {
812                 got_pdev:
813                         *bus = pdev->bus->number;
814                         *devfn = pdev->devfn;
815                         goto out;
816                 }
817         }
818         iommu = NULL;
819  out:
820         rcu_read_unlock();
821
822         return iommu;
823 }
824
825 static void domain_flush_cache(struct dmar_domain *domain,
826                                void *addr, int size)
827 {
828         if (!domain->iommu_coherency)
829                 clflush_cache_range(addr, size);
830 }
831
832 static int device_context_mapped(struct intel_iommu *iommu, u8 bus, u8 devfn)
833 {
834         struct context_entry *context;
835         int ret = 0;
836         unsigned long flags;
837
838         spin_lock_irqsave(&iommu->lock, flags);
839         context = iommu_context_addr(iommu, bus, devfn, 0);
840         if (context)
841                 ret = context_present(context);
842         spin_unlock_irqrestore(&iommu->lock, flags);
843         return ret;
844 }
845
846 static void free_context_table(struct intel_iommu *iommu)
847 {
848         int i;
849         unsigned long flags;
850         struct context_entry *context;
851
852         spin_lock_irqsave(&iommu->lock, flags);
853         if (!iommu->root_entry) {
854                 goto out;
855         }
856         for (i = 0; i < ROOT_ENTRY_NR; i++) {
857                 context = iommu_context_addr(iommu, i, 0, 0);
858                 if (context)
859                         free_pgtable_page(context);
860
861                 if (!sm_supported(iommu))
862                         continue;
863
864                 context = iommu_context_addr(iommu, i, 0x80, 0);
865                 if (context)
866                         free_pgtable_page(context);
867
868         }
869         free_pgtable_page(iommu->root_entry);
870         iommu->root_entry = NULL;
871 out:
872         spin_unlock_irqrestore(&iommu->lock, flags);
873 }
874
875 static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain,
876                                       unsigned long pfn, int *target_level)
877 {
878         struct dma_pte *parent, *pte;
879         int level = agaw_to_level(domain->agaw);
880         int offset;
881
882         BUG_ON(!domain->pgd);
883
884         if (!domain_pfn_supported(domain, pfn))
885                 /* Address beyond IOMMU's addressing capabilities. */
886                 return NULL;
887
888         parent = domain->pgd;
889
890         while (1) {
891                 void *tmp_page;
892
893                 offset = pfn_level_offset(pfn, level);
894                 pte = &parent[offset];
895                 if (!*target_level && (dma_pte_superpage(pte) || !dma_pte_present(pte)))
896                         break;
897                 if (level == *target_level)
898                         break;
899
900                 if (!dma_pte_present(pte)) {
901                         uint64_t pteval;
902
903                         tmp_page = alloc_pgtable_page(domain->nid);
904
905                         if (!tmp_page)
906                                 return NULL;
907
908                         domain_flush_cache(domain, tmp_page, VTD_PAGE_SIZE);
909                         pteval = ((uint64_t)virt_to_dma_pfn(tmp_page) << VTD_PAGE_SHIFT) | DMA_PTE_READ | DMA_PTE_WRITE;
910                         if (cmpxchg64(&pte->val, 0ULL, pteval))
911                                 /* Someone else set it while we were thinking; use theirs. */
912                                 free_pgtable_page(tmp_page);
913                         else
914                                 domain_flush_cache(domain, pte, sizeof(*pte));
915                 }
916                 if (level == 1)
917                         break;
918
919                 parent = phys_to_virt(dma_pte_addr(pte));
920                 level--;
921         }
922
923         if (!*target_level)
924                 *target_level = level;
925
926         return pte;
927 }
928
929 /* return address's pte at specific level */
930 static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain,
931                                          unsigned long pfn,
932                                          int level, int *large_page)
933 {
934         struct dma_pte *parent, *pte;
935         int total = agaw_to_level(domain->agaw);
936         int offset;
937
938         parent = domain->pgd;
939         while (level <= total) {
940                 offset = pfn_level_offset(pfn, total);
941                 pte = &parent[offset];
942                 if (level == total)
943                         return pte;
944
945                 if (!dma_pte_present(pte)) {
946                         *large_page = total;
947                         break;
948                 }
949
950                 if (dma_pte_superpage(pte)) {
951                         *large_page = total;
952                         return pte;
953                 }
954
955                 parent = phys_to_virt(dma_pte_addr(pte));
956                 total--;
957         }
958         return NULL;
959 }
960
961 /* clear last level pte, a tlb flush should be followed */
962 static void dma_pte_clear_range(struct dmar_domain *domain,
963                                 unsigned long start_pfn,
964                                 unsigned long last_pfn)
965 {
966         unsigned int large_page;
967         struct dma_pte *first_pte, *pte;
968
969         BUG_ON(!domain_pfn_supported(domain, start_pfn));
970         BUG_ON(!domain_pfn_supported(domain, last_pfn));
971         BUG_ON(start_pfn > last_pfn);
972
973         /* we don't need lock here; nobody else touches the iova range */
974         do {
975                 large_page = 1;
976                 first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1, &large_page);
977                 if (!pte) {
978                         start_pfn = align_to_level(start_pfn + 1, large_page + 1);
979                         continue;
980                 }
981                 do {
982                         dma_clear_pte(pte);
983                         start_pfn += lvl_to_nr_pages(large_page);
984                         pte++;
985                 } while (start_pfn <= last_pfn && !first_pte_in_page(pte));
986
987                 domain_flush_cache(domain, first_pte,
988                                    (void *)pte - (void *)first_pte);
989
990         } while (start_pfn && start_pfn <= last_pfn);
991 }
992
993 static void dma_pte_free_level(struct dmar_domain *domain, int level,
994                                int retain_level, struct dma_pte *pte,
995                                unsigned long pfn, unsigned long start_pfn,
996                                unsigned long last_pfn)
997 {
998         pfn = max(start_pfn, pfn);
999         pte = &pte[pfn_level_offset(pfn, level)];
1000
1001         do {
1002                 unsigned long level_pfn;
1003                 struct dma_pte *level_pte;
1004
1005                 if (!dma_pte_present(pte) || dma_pte_superpage(pte))
1006                         goto next;
1007
1008                 level_pfn = pfn & level_mask(level);
1009                 level_pte = phys_to_virt(dma_pte_addr(pte));
1010
1011                 if (level > 2) {
1012                         dma_pte_free_level(domain, level - 1, retain_level,
1013                                            level_pte, level_pfn, start_pfn,
1014                                            last_pfn);
1015                 }
1016
1017                 /*
1018                  * Free the page table if we're below the level we want to
1019                  * retain and the range covers the entire table.
1020                  */
1021                 if (level < retain_level && !(start_pfn > level_pfn ||
1022                       last_pfn < level_pfn + level_size(level) - 1)) {
1023                         dma_clear_pte(pte);
1024                         domain_flush_cache(domain, pte, sizeof(*pte));
1025                         free_pgtable_page(level_pte);
1026                 }
1027 next:
1028                 pfn += level_size(level);
1029         } while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1030 }
1031
1032 /*
1033  * clear last level (leaf) ptes and free page table pages below the
1034  * level we wish to keep intact.
1035  */
1036 static void dma_pte_free_pagetable(struct dmar_domain *domain,
1037                                    unsigned long start_pfn,
1038                                    unsigned long last_pfn,
1039                                    int retain_level)
1040 {
1041         BUG_ON(!domain_pfn_supported(domain, start_pfn));
1042         BUG_ON(!domain_pfn_supported(domain, last_pfn));
1043         BUG_ON(start_pfn > last_pfn);
1044
1045         dma_pte_clear_range(domain, start_pfn, last_pfn);
1046
1047         /* We don't need lock here; nobody else touches the iova range */
1048         dma_pte_free_level(domain, agaw_to_level(domain->agaw), retain_level,
1049                            domain->pgd, 0, start_pfn, last_pfn);
1050
1051         /* free pgd */
1052         if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1053                 free_pgtable_page(domain->pgd);
1054                 domain->pgd = NULL;
1055         }
1056 }
1057
1058 /* When a page at a given level is being unlinked from its parent, we don't
1059    need to *modify* it at all. All we need to do is make a list of all the
1060    pages which can be freed just as soon as we've flushed the IOTLB and we
1061    know the hardware page-walk will no longer touch them.
1062    The 'pte' argument is the *parent* PTE, pointing to the page that is to
1063    be freed. */
1064 static struct page *dma_pte_list_pagetables(struct dmar_domain *domain,
1065                                             int level, struct dma_pte *pte,
1066                                             struct page *freelist)
1067 {
1068         struct page *pg;
1069
1070         pg = pfn_to_page(dma_pte_addr(pte) >> PAGE_SHIFT);
1071         pg->freelist = freelist;
1072         freelist = pg;
1073
1074         if (level == 1)
1075                 return freelist;
1076
1077         pte = page_address(pg);
1078         do {
1079                 if (dma_pte_present(pte) && !dma_pte_superpage(pte))
1080                         freelist = dma_pte_list_pagetables(domain, level - 1,
1081                                                            pte, freelist);
1082                 pte++;
1083         } while (!first_pte_in_page(pte));
1084
1085         return freelist;
1086 }
1087
1088 static struct page *dma_pte_clear_level(struct dmar_domain *domain, int level,
1089                                         struct dma_pte *pte, unsigned long pfn,
1090                                         unsigned long start_pfn,
1091                                         unsigned long last_pfn,
1092                                         struct page *freelist)
1093 {
1094         struct dma_pte *first_pte = NULL, *last_pte = NULL;
1095
1096         pfn = max(start_pfn, pfn);
1097         pte = &pte[pfn_level_offset(pfn, level)];
1098
1099         do {
1100                 unsigned long level_pfn;
1101
1102                 if (!dma_pte_present(pte))
1103                         goto next;
1104
1105                 level_pfn = pfn & level_mask(level);
1106
1107                 /* If range covers entire pagetable, free it */
1108                 if (start_pfn <= level_pfn &&
1109                     last_pfn >= level_pfn + level_size(level) - 1) {
1110                         /* These suborbinate page tables are going away entirely. Don't
1111                            bother to clear them; we're just going to *free* them. */
1112                         if (level > 1 && !dma_pte_superpage(pte))
1113                                 freelist = dma_pte_list_pagetables(domain, level - 1, pte, freelist);
1114
1115                         dma_clear_pte(pte);
1116                         if (!first_pte)
1117                                 first_pte = pte;
1118                         last_pte = pte;
1119                 } else if (level > 1) {
1120                         /* Recurse down into a level that isn't *entirely* obsolete */
1121                         freelist = dma_pte_clear_level(domain, level - 1,
1122                                                        phys_to_virt(dma_pte_addr(pte)),
1123                                                        level_pfn, start_pfn, last_pfn,
1124                                                        freelist);
1125                 }
1126 next:
1127                 pfn += level_size(level);
1128         } while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1129
1130         if (first_pte)
1131                 domain_flush_cache(domain, first_pte,
1132                                    (void *)++last_pte - (void *)first_pte);
1133
1134         return freelist;
1135 }
1136
1137 /* We can't just free the pages because the IOMMU may still be walking
1138    the page tables, and may have cached the intermediate levels. The
1139    pages can only be freed after the IOTLB flush has been done. */
1140 static struct page *domain_unmap(struct dmar_domain *domain,
1141                                  unsigned long start_pfn,
1142                                  unsigned long last_pfn)
1143 {
1144         struct page *freelist;
1145
1146         BUG_ON(!domain_pfn_supported(domain, start_pfn));
1147         BUG_ON(!domain_pfn_supported(domain, last_pfn));
1148         BUG_ON(start_pfn > last_pfn);
1149
1150         /* we don't need lock here; nobody else touches the iova range */
1151         freelist = dma_pte_clear_level(domain, agaw_to_level(domain->agaw),
1152                                        domain->pgd, 0, start_pfn, last_pfn, NULL);
1153
1154         /* free pgd */
1155         if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1156                 struct page *pgd_page = virt_to_page(domain->pgd);
1157                 pgd_page->freelist = freelist;
1158                 freelist = pgd_page;
1159
1160                 domain->pgd = NULL;
1161         }
1162
1163         return freelist;
1164 }
1165
1166 static void dma_free_pagelist(struct page *freelist)
1167 {
1168         struct page *pg;
1169
1170         while ((pg = freelist)) {
1171                 freelist = pg->freelist;
1172                 free_pgtable_page(page_address(pg));
1173         }
1174 }
1175
1176 static void iova_entry_free(unsigned long data)
1177 {
1178         struct page *freelist = (struct page *)data;
1179
1180         dma_free_pagelist(freelist);
1181 }
1182
1183 /* iommu handling */
1184 static int iommu_alloc_root_entry(struct intel_iommu *iommu)
1185 {
1186         struct root_entry *root;
1187         unsigned long flags;
1188
1189         root = (struct root_entry *)alloc_pgtable_page(iommu->node);
1190         if (!root) {
1191                 pr_err("Allocating root entry for %s failed\n",
1192                         iommu->name);
1193                 return -ENOMEM;
1194         }
1195
1196         __iommu_flush_cache(iommu, root, ROOT_SIZE);
1197
1198         spin_lock_irqsave(&iommu->lock, flags);
1199         iommu->root_entry = root;
1200         spin_unlock_irqrestore(&iommu->lock, flags);
1201
1202         return 0;
1203 }
1204
1205 static void iommu_set_root_entry(struct intel_iommu *iommu)
1206 {
1207         u64 addr;
1208         u32 sts;
1209         unsigned long flag;
1210
1211         addr = virt_to_phys(iommu->root_entry);
1212         if (sm_supported(iommu))
1213                 addr |= DMA_RTADDR_SMT;
1214
1215         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1216         dmar_writeq(iommu->reg + DMAR_RTADDR_REG, addr);
1217
1218         writel(iommu->gcmd | DMA_GCMD_SRTP, iommu->reg + DMAR_GCMD_REG);
1219
1220         /* Make sure hardware complete it */
1221         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1222                       readl, (sts & DMA_GSTS_RTPS), sts);
1223
1224         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1225 }
1226
1227 void iommu_flush_write_buffer(struct intel_iommu *iommu)
1228 {
1229         u32 val;
1230         unsigned long flag;
1231
1232         if (!rwbf_quirk && !cap_rwbf(iommu->cap))
1233                 return;
1234
1235         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1236         writel(iommu->gcmd | DMA_GCMD_WBF, iommu->reg + DMAR_GCMD_REG);
1237
1238         /* Make sure hardware complete it */
1239         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1240                       readl, (!(val & DMA_GSTS_WBFS)), val);
1241
1242         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1243 }
1244
1245 /* return value determine if we need a write buffer flush */
1246 static void __iommu_flush_context(struct intel_iommu *iommu,
1247                                   u16 did, u16 source_id, u8 function_mask,
1248                                   u64 type)
1249 {
1250         u64 val = 0;
1251         unsigned long flag;
1252
1253         switch (type) {
1254         case DMA_CCMD_GLOBAL_INVL:
1255                 val = DMA_CCMD_GLOBAL_INVL;
1256                 break;
1257         case DMA_CCMD_DOMAIN_INVL:
1258                 val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
1259                 break;
1260         case DMA_CCMD_DEVICE_INVL:
1261                 val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
1262                         | DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
1263                 break;
1264         default:
1265                 BUG();
1266         }
1267         val |= DMA_CCMD_ICC;
1268
1269         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1270         dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);
1271
1272         /* Make sure hardware complete it */
1273         IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
1274                 dmar_readq, (!(val & DMA_CCMD_ICC)), val);
1275
1276         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1277 }
1278
1279 /* return value determine if we need a write buffer flush */
1280 static void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
1281                                 u64 addr, unsigned int size_order, u64 type)
1282 {
1283         int tlb_offset = ecap_iotlb_offset(iommu->ecap);
1284         u64 val = 0, val_iva = 0;
1285         unsigned long flag;
1286
1287         switch (type) {
1288         case DMA_TLB_GLOBAL_FLUSH:
1289                 /* global flush doesn't need set IVA_REG */
1290                 val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
1291                 break;
1292         case DMA_TLB_DSI_FLUSH:
1293                 val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1294                 break;
1295         case DMA_TLB_PSI_FLUSH:
1296                 val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1297                 /* IH bit is passed in as part of address */
1298                 val_iva = size_order | addr;
1299                 break;
1300         default:
1301                 BUG();
1302         }
1303         /* Note: set drain read/write */
1304 #if 0
1305         /*
1306          * This is probably to be super secure.. Looks like we can
1307          * ignore it without any impact.
1308          */
1309         if (cap_read_drain(iommu->cap))
1310                 val |= DMA_TLB_READ_DRAIN;
1311 #endif
1312         if (cap_write_drain(iommu->cap))
1313                 val |= DMA_TLB_WRITE_DRAIN;
1314
1315         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1316         /* Note: Only uses first TLB reg currently */
1317         if (val_iva)
1318                 dmar_writeq(iommu->reg + tlb_offset, val_iva);
1319         dmar_writeq(iommu->reg + tlb_offset + 8, val);
1320
1321         /* Make sure hardware complete it */
1322         IOMMU_WAIT_OP(iommu, tlb_offset + 8,
1323                 dmar_readq, (!(val & DMA_TLB_IVT)), val);
1324
1325         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1326
1327         /* check IOTLB invalidation granularity */
1328         if (DMA_TLB_IAIG(val) == 0)
1329                 pr_err("Flush IOTLB failed\n");
1330         if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
1331                 pr_debug("TLB flush request %Lx, actual %Lx\n",
1332                         (unsigned long long)DMA_TLB_IIRG(type),
1333                         (unsigned long long)DMA_TLB_IAIG(val));
1334 }
1335
1336 static struct device_domain_info *
1337 iommu_support_dev_iotlb (struct dmar_domain *domain, struct intel_iommu *iommu,
1338                          u8 bus, u8 devfn)
1339 {
1340         struct device_domain_info *info;
1341
1342         assert_spin_locked(&device_domain_lock);
1343
1344         if (!iommu->qi)
1345                 return NULL;
1346
1347         list_for_each_entry(info, &domain->devices, link)
1348                 if (info->iommu == iommu && info->bus == bus &&
1349                     info->devfn == devfn) {
1350                         if (info->ats_supported && info->dev)
1351                                 return info;
1352                         break;
1353                 }
1354
1355         return NULL;
1356 }
1357
1358 static void domain_update_iotlb(struct dmar_domain *domain)
1359 {
1360         struct device_domain_info *info;
1361         bool has_iotlb_device = false;
1362
1363         assert_spin_locked(&device_domain_lock);
1364
1365         list_for_each_entry(info, &domain->devices, link) {
1366                 struct pci_dev *pdev;
1367
1368                 if (!info->dev || !dev_is_pci(info->dev))
1369                         continue;
1370
1371                 pdev = to_pci_dev(info->dev);
1372                 if (pdev->ats_enabled) {
1373                         has_iotlb_device = true;
1374                         break;
1375                 }
1376         }
1377
1378         domain->has_iotlb_device = has_iotlb_device;
1379 }
1380
1381 static void iommu_enable_dev_iotlb(struct device_domain_info *info)
1382 {
1383         struct pci_dev *pdev;
1384
1385         assert_spin_locked(&device_domain_lock);
1386
1387         if (!info || !dev_is_pci(info->dev))
1388                 return;
1389
1390         pdev = to_pci_dev(info->dev);
1391         /* For IOMMU that supports device IOTLB throttling (DIT), we assign
1392          * PFSID to the invalidation desc of a VF such that IOMMU HW can gauge
1393          * queue depth at PF level. If DIT is not set, PFSID will be treated as
1394          * reserved, which should be set to 0.
1395          */
1396         if (!ecap_dit(info->iommu->ecap))
1397                 info->pfsid = 0;
1398         else {
1399                 struct pci_dev *pf_pdev;
1400
1401                 /* pdev will be returned if device is not a vf */
1402                 pf_pdev = pci_physfn(pdev);
1403                 info->pfsid = pci_dev_id(pf_pdev);
1404         }
1405
1406 #ifdef CONFIG_INTEL_IOMMU_SVM
1407         /* The PCIe spec, in its wisdom, declares that the behaviour of
1408            the device if you enable PASID support after ATS support is
1409            undefined. So always enable PASID support on devices which
1410            have it, even if we can't yet know if we're ever going to
1411            use it. */
1412         if (info->pasid_supported && !pci_enable_pasid(pdev, info->pasid_supported & ~1))
1413                 info->pasid_enabled = 1;
1414
1415         if (info->pri_supported &&
1416             (info->pasid_enabled ? pci_prg_resp_pasid_required(pdev) : 1)  &&
1417             !pci_reset_pri(pdev) && !pci_enable_pri(pdev, 32))
1418                 info->pri_enabled = 1;
1419 #endif
1420         if (!pdev->untrusted && info->ats_supported &&
1421             pci_ats_page_aligned(pdev) &&
1422             !pci_enable_ats(pdev, VTD_PAGE_SHIFT)) {
1423                 info->ats_enabled = 1;
1424                 domain_update_iotlb(info->domain);
1425                 info->ats_qdep = pci_ats_queue_depth(pdev);
1426         }
1427 }
1428
1429 static void iommu_disable_dev_iotlb(struct device_domain_info *info)
1430 {
1431         struct pci_dev *pdev;
1432
1433         assert_spin_locked(&device_domain_lock);
1434
1435         if (!dev_is_pci(info->dev))
1436                 return;
1437
1438         pdev = to_pci_dev(info->dev);
1439
1440         if (info->ats_enabled) {
1441                 pci_disable_ats(pdev);
1442                 info->ats_enabled = 0;
1443                 domain_update_iotlb(info->domain);
1444         }
1445 #ifdef CONFIG_INTEL_IOMMU_SVM
1446         if (info->pri_enabled) {
1447                 pci_disable_pri(pdev);
1448                 info->pri_enabled = 0;
1449         }
1450         if (info->pasid_enabled) {
1451                 pci_disable_pasid(pdev);
1452                 info->pasid_enabled = 0;
1453         }
1454 #endif
1455 }
1456
1457 static void iommu_flush_dev_iotlb(struct dmar_domain *domain,
1458                                   u64 addr, unsigned mask)
1459 {
1460         u16 sid, qdep;
1461         unsigned long flags;
1462         struct device_domain_info *info;
1463
1464         if (!domain->has_iotlb_device)
1465                 return;
1466
1467         spin_lock_irqsave(&device_domain_lock, flags);
1468         list_for_each_entry(info, &domain->devices, link) {
1469                 if (!info->ats_enabled)
1470                         continue;
1471
1472                 sid = info->bus << 8 | info->devfn;
1473                 qdep = info->ats_qdep;
1474                 qi_flush_dev_iotlb(info->iommu, sid, info->pfsid,
1475                                 qdep, addr, mask);
1476         }
1477         spin_unlock_irqrestore(&device_domain_lock, flags);
1478 }
1479
1480 static void iommu_flush_iotlb_psi(struct intel_iommu *iommu,
1481                                   struct dmar_domain *domain,
1482                                   unsigned long pfn, unsigned int pages,
1483                                   int ih, int map)
1484 {
1485         unsigned int mask = ilog2(__roundup_pow_of_two(pages));
1486         uint64_t addr = (uint64_t)pfn << VTD_PAGE_SHIFT;
1487         u16 did = domain->iommu_did[iommu->seq_id];
1488
1489         BUG_ON(pages == 0);
1490
1491         if (ih)
1492                 ih = 1 << 6;
1493         /*
1494          * Fallback to domain selective flush if no PSI support or the size is
1495          * too big.
1496          * PSI requires page size to be 2 ^ x, and the base address is naturally
1497          * aligned to the size
1498          */
1499         if (!cap_pgsel_inv(iommu->cap) || mask > cap_max_amask_val(iommu->cap))
1500                 iommu->flush.flush_iotlb(iommu, did, 0, 0,
1501                                                 DMA_TLB_DSI_FLUSH);
1502         else
1503                 iommu->flush.flush_iotlb(iommu, did, addr | ih, mask,
1504                                                 DMA_TLB_PSI_FLUSH);
1505
1506         /*
1507          * In caching mode, changes of pages from non-present to present require
1508          * flush. However, device IOTLB doesn't need to be flushed in this case.
1509          */
1510         if (!cap_caching_mode(iommu->cap) || !map)
1511                 iommu_flush_dev_iotlb(domain, addr, mask);
1512 }
1513
1514 /* Notification for newly created mappings */
1515 static inline void __mapping_notify_one(struct intel_iommu *iommu,
1516                                         struct dmar_domain *domain,
1517                                         unsigned long pfn, unsigned int pages)
1518 {
1519         /* It's a non-present to present mapping. Only flush if caching mode */
1520         if (cap_caching_mode(iommu->cap))
1521                 iommu_flush_iotlb_psi(iommu, domain, pfn, pages, 0, 1);
1522         else
1523                 iommu_flush_write_buffer(iommu);
1524 }
1525
1526 static void iommu_flush_iova(struct iova_domain *iovad)
1527 {
1528         struct dmar_domain *domain;
1529         int idx;
1530
1531         domain = container_of(iovad, struct dmar_domain, iovad);
1532
1533         for_each_domain_iommu(idx, domain) {
1534                 struct intel_iommu *iommu = g_iommus[idx];
1535                 u16 did = domain->iommu_did[iommu->seq_id];
1536
1537                 iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH);
1538
1539                 if (!cap_caching_mode(iommu->cap))
1540                         iommu_flush_dev_iotlb(get_iommu_domain(iommu, did),
1541                                               0, MAX_AGAW_PFN_WIDTH);
1542         }
1543 }
1544
1545 static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
1546 {
1547         u32 pmen;
1548         unsigned long flags;
1549
1550         if (!cap_plmr(iommu->cap) && !cap_phmr(iommu->cap))
1551                 return;
1552
1553         raw_spin_lock_irqsave(&iommu->register_lock, flags);
1554         pmen = readl(iommu->reg + DMAR_PMEN_REG);
1555         pmen &= ~DMA_PMEN_EPM;
1556         writel(pmen, iommu->reg + DMAR_PMEN_REG);
1557
1558         /* wait for the protected region status bit to clear */
1559         IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
1560                 readl, !(pmen & DMA_PMEN_PRS), pmen);
1561
1562         raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1563 }
1564
1565 static void iommu_enable_translation(struct intel_iommu *iommu)
1566 {
1567         u32 sts;
1568         unsigned long flags;
1569
1570         raw_spin_lock_irqsave(&iommu->register_lock, flags);
1571         iommu->gcmd |= DMA_GCMD_TE;
1572         writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1573
1574         /* Make sure hardware complete it */
1575         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1576                       readl, (sts & DMA_GSTS_TES), sts);
1577
1578         raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1579 }
1580
1581 static void iommu_disable_translation(struct intel_iommu *iommu)
1582 {
1583         u32 sts;
1584         unsigned long flag;
1585
1586         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1587         iommu->gcmd &= ~DMA_GCMD_TE;
1588         writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1589
1590         /* Make sure hardware complete it */
1591         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1592                       readl, (!(sts & DMA_GSTS_TES)), sts);
1593
1594         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1595 }
1596
1597 static int iommu_init_domains(struct intel_iommu *iommu)
1598 {
1599         u32 ndomains, nlongs;
1600         size_t size;
1601
1602         ndomains = cap_ndoms(iommu->cap);
1603         pr_debug("%s: Number of Domains supported <%d>\n",
1604                  iommu->name, ndomains);
1605         nlongs = BITS_TO_LONGS(ndomains);
1606
1607         spin_lock_init(&iommu->lock);
1608
1609         iommu->domain_ids = kcalloc(nlongs, sizeof(unsigned long), GFP_KERNEL);
1610         if (!iommu->domain_ids) {
1611                 pr_err("%s: Allocating domain id array failed\n",
1612                        iommu->name);
1613                 return -ENOMEM;
1614         }
1615
1616         size = (ALIGN(ndomains, 256) >> 8) * sizeof(struct dmar_domain **);
1617         iommu->domains = kzalloc(size, GFP_KERNEL);
1618
1619         if (iommu->domains) {
1620                 size = 256 * sizeof(struct dmar_domain *);
1621                 iommu->domains[0] = kzalloc(size, GFP_KERNEL);
1622         }
1623
1624         if (!iommu->domains || !iommu->domains[0]) {
1625                 pr_err("%s: Allocating domain array failed\n",
1626                        iommu->name);
1627                 kfree(iommu->domain_ids);
1628                 kfree(iommu->domains);
1629                 iommu->domain_ids = NULL;
1630                 iommu->domains    = NULL;
1631                 return -ENOMEM;
1632         }
1633
1634         /*
1635          * If Caching mode is set, then invalid translations are tagged
1636          * with domain-id 0, hence we need to pre-allocate it. We also
1637          * use domain-id 0 as a marker for non-allocated domain-id, so
1638          * make sure it is not used for a real domain.
1639          */
1640         set_bit(0, iommu->domain_ids);
1641
1642         /*
1643          * Vt-d spec rev3.0 (section 6.2.3.1) requires that each pasid
1644          * entry for first-level or pass-through translation modes should
1645          * be programmed with a domain id different from those used for
1646          * second-level or nested translation. We reserve a domain id for
1647          * this purpose.
1648          */
1649         if (sm_supported(iommu))
1650                 set_bit(FLPT_DEFAULT_DID, iommu->domain_ids);
1651
1652         return 0;
1653 }
1654
1655 static void disable_dmar_iommu(struct intel_iommu *iommu)
1656 {
1657         struct device_domain_info *info, *tmp;
1658         unsigned long flags;
1659
1660         if (!iommu->domains || !iommu->domain_ids)
1661                 return;
1662
1663         spin_lock_irqsave(&device_domain_lock, flags);
1664         list_for_each_entry_safe(info, tmp, &device_domain_list, global) {
1665                 if (info->iommu != iommu)
1666                         continue;
1667
1668                 if (!info->dev || !info->domain)
1669                         continue;
1670
1671                 __dmar_remove_one_dev_info(info);
1672         }
1673         spin_unlock_irqrestore(&device_domain_lock, flags);
1674
1675         if (iommu->gcmd & DMA_GCMD_TE)
1676                 iommu_disable_translation(iommu);
1677 }
1678
1679 static void free_dmar_iommu(struct intel_iommu *iommu)
1680 {
1681         if ((iommu->domains) && (iommu->domain_ids)) {
1682                 int elems = ALIGN(cap_ndoms(iommu->cap), 256) >> 8;
1683                 int i;
1684
1685                 for (i = 0; i < elems; i++)
1686                         kfree(iommu->domains[i]);
1687                 kfree(iommu->domains);
1688                 kfree(iommu->domain_ids);
1689                 iommu->domains = NULL;
1690                 iommu->domain_ids = NULL;
1691         }
1692
1693         g_iommus[iommu->seq_id] = NULL;
1694
1695         /* free context mapping */
1696         free_context_table(iommu);
1697
1698 #ifdef CONFIG_INTEL_IOMMU_SVM
1699         if (pasid_supported(iommu)) {
1700                 if (ecap_prs(iommu->ecap))
1701                         intel_svm_finish_prq(iommu);
1702         }
1703 #endif
1704 }
1705
1706 static struct dmar_domain *alloc_domain(int flags)
1707 {
1708         struct dmar_domain *domain;
1709
1710         domain = alloc_domain_mem();
1711         if (!domain)
1712                 return NULL;
1713
1714         memset(domain, 0, sizeof(*domain));
1715         domain->nid = NUMA_NO_NODE;
1716         domain->flags = flags;
1717         domain->has_iotlb_device = false;
1718         INIT_LIST_HEAD(&domain->devices);
1719
1720         return domain;
1721 }
1722
1723 /* Must be called with iommu->lock */
1724 static int domain_attach_iommu(struct dmar_domain *domain,
1725                                struct intel_iommu *iommu)
1726 {
1727         unsigned long ndomains;
1728         int num;
1729
1730         assert_spin_locked(&device_domain_lock);
1731         assert_spin_locked(&iommu->lock);
1732
1733         domain->iommu_refcnt[iommu->seq_id] += 1;
1734         domain->iommu_count += 1;
1735         if (domain->iommu_refcnt[iommu->seq_id] == 1) {
1736                 ndomains = cap_ndoms(iommu->cap);
1737                 num      = find_first_zero_bit(iommu->domain_ids, ndomains);
1738
1739                 if (num >= ndomains) {
1740                         pr_err("%s: No free domain ids\n", iommu->name);
1741                         domain->iommu_refcnt[iommu->seq_id] -= 1;
1742                         domain->iommu_count -= 1;
1743                         return -ENOSPC;
1744                 }
1745
1746                 set_bit(num, iommu->domain_ids);
1747                 set_iommu_domain(iommu, num, domain);
1748
1749                 domain->iommu_did[iommu->seq_id] = num;
1750                 domain->nid                      = iommu->node;
1751
1752                 domain_update_iommu_cap(domain);
1753         }
1754
1755         return 0;
1756 }
1757
1758 static int domain_detach_iommu(struct dmar_domain *domain,
1759                                struct intel_iommu *iommu)
1760 {
1761         int num, count;
1762
1763         assert_spin_locked(&device_domain_lock);
1764         assert_spin_locked(&iommu->lock);
1765
1766         domain->iommu_refcnt[iommu->seq_id] -= 1;
1767         count = --domain->iommu_count;
1768         if (domain->iommu_refcnt[iommu->seq_id] == 0) {
1769                 num = domain->iommu_did[iommu->seq_id];
1770                 clear_bit(num, iommu->domain_ids);
1771                 set_iommu_domain(iommu, num, NULL);
1772
1773                 domain_update_iommu_cap(domain);
1774                 domain->iommu_did[iommu->seq_id] = 0;
1775         }
1776
1777         return count;
1778 }
1779
1780 static struct iova_domain reserved_iova_list;
1781 static struct lock_class_key reserved_rbtree_key;
1782
1783 static int dmar_init_reserved_ranges(void)
1784 {
1785         struct pci_dev *pdev = NULL;
1786         struct iova *iova;
1787         int i;
1788
1789         init_iova_domain(&reserved_iova_list, VTD_PAGE_SIZE, IOVA_START_PFN);
1790
1791         lockdep_set_class(&reserved_iova_list.iova_rbtree_lock,
1792                 &reserved_rbtree_key);
1793
1794         /* IOAPIC ranges shouldn't be accessed by DMA */
1795         iova = reserve_iova(&reserved_iova_list, IOVA_PFN(IOAPIC_RANGE_START),
1796                 IOVA_PFN(IOAPIC_RANGE_END));
1797         if (!iova) {
1798                 pr_err("Reserve IOAPIC range failed\n");
1799                 return -ENODEV;
1800         }
1801
1802         /* Reserve all PCI MMIO to avoid peer-to-peer access */
1803         for_each_pci_dev(pdev) {
1804                 struct resource *r;
1805
1806                 for (i = 0; i < PCI_NUM_RESOURCES; i++) {
1807                         r = &pdev->resource[i];
1808                         if (!r->flags || !(r->flags & IORESOURCE_MEM))
1809                                 continue;
1810                         iova = reserve_iova(&reserved_iova_list,
1811                                             IOVA_PFN(r->start),
1812                                             IOVA_PFN(r->end));
1813                         if (!iova) {
1814                                 pci_err(pdev, "Reserve iova for %pR failed\n", r);
1815                                 return -ENODEV;
1816                         }
1817                 }
1818         }
1819         return 0;
1820 }
1821
1822 static void domain_reserve_special_ranges(struct dmar_domain *domain)
1823 {
1824         copy_reserved_iova(&reserved_iova_list, &domain->iovad);
1825 }
1826
1827 static inline int guestwidth_to_adjustwidth(int gaw)
1828 {
1829         int agaw;
1830         int r = (gaw - 12) % 9;
1831
1832         if (r == 0)
1833                 agaw = gaw;
1834         else
1835                 agaw = gaw + 9 - r;
1836         if (agaw > 64)
1837                 agaw = 64;
1838         return agaw;
1839 }
1840
1841 static int domain_init(struct dmar_domain *domain, struct intel_iommu *iommu,
1842                        int guest_width)
1843 {
1844         int adjust_width, agaw;
1845         unsigned long sagaw;
1846         int err;
1847
1848         init_iova_domain(&domain->iovad, VTD_PAGE_SIZE, IOVA_START_PFN);
1849
1850         err = init_iova_flush_queue(&domain->iovad,
1851                                     iommu_flush_iova, iova_entry_free);
1852         if (err)
1853                 return err;
1854
1855         domain_reserve_special_ranges(domain);
1856
1857         /* calculate AGAW */
1858         if (guest_width > cap_mgaw(iommu->cap))
1859                 guest_width = cap_mgaw(iommu->cap);
1860         domain->gaw = guest_width;
1861         adjust_width = guestwidth_to_adjustwidth(guest_width);
1862         agaw = width_to_agaw(adjust_width);
1863         sagaw = cap_sagaw(iommu->cap);
1864         if (!test_bit(agaw, &sagaw)) {
1865                 /* hardware doesn't support it, choose a bigger one */
1866                 pr_debug("Hardware doesn't support agaw %d\n", agaw);
1867                 agaw = find_next_bit(&sagaw, 5, agaw);
1868                 if (agaw >= 5)
1869                         return -ENODEV;
1870         }
1871         domain->agaw = agaw;
1872
1873         if (ecap_coherent(iommu->ecap))
1874                 domain->iommu_coherency = 1;
1875         else
1876                 domain->iommu_coherency = 0;
1877
1878         if (ecap_sc_support(iommu->ecap))
1879                 domain->iommu_snooping = 1;
1880         else
1881                 domain->iommu_snooping = 0;
1882
1883         if (intel_iommu_superpage)
1884                 domain->iommu_superpage = fls(cap_super_page_val(iommu->cap));
1885         else
1886                 domain->iommu_superpage = 0;
1887
1888         domain->nid = iommu->node;
1889
1890         /* always allocate the top pgd */
1891         domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
1892         if (!domain->pgd)
1893                 return -ENOMEM;
1894         __iommu_flush_cache(iommu, domain->pgd, PAGE_SIZE);
1895         return 0;
1896 }
1897
1898 static void domain_exit(struct dmar_domain *domain)
1899 {
1900
1901         /* Remove associated devices and clear attached or cached domains */
1902         domain_remove_dev_info(domain);
1903
1904         /* destroy iovas */
1905         put_iova_domain(&domain->iovad);
1906
1907         if (domain->pgd) {
1908                 struct page *freelist;
1909
1910                 freelist = domain_unmap(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
1911                 dma_free_pagelist(freelist);
1912         }
1913
1914         free_domain_mem(domain);
1915 }
1916
1917 /*
1918  * Get the PASID directory size for scalable mode context entry.
1919  * Value of X in the PDTS field of a scalable mode context entry
1920  * indicates PASID directory with 2^(X + 7) entries.
1921  */
1922 static inline unsigned long context_get_sm_pds(struct pasid_table *table)
1923 {
1924         int pds, max_pde;
1925
1926         max_pde = table->max_pasid >> PASID_PDE_SHIFT;
1927         pds = find_first_bit((unsigned long *)&max_pde, MAX_NR_PASID_BITS);
1928         if (pds < 7)
1929                 return 0;
1930
1931         return pds - 7;
1932 }
1933
1934 /*
1935  * Set the RID_PASID field of a scalable mode context entry. The
1936  * IOMMU hardware will use the PASID value set in this field for
1937  * DMA translations of DMA requests without PASID.
1938  */
1939 static inline void
1940 context_set_sm_rid2pasid(struct context_entry *context, unsigned long pasid)
1941 {
1942         context->hi |= pasid & ((1 << 20) - 1);
1943         context->hi |= (1 << 20);
1944 }
1945
1946 /*
1947  * Set the DTE(Device-TLB Enable) field of a scalable mode context
1948  * entry.
1949  */
1950 static inline void context_set_sm_dte(struct context_entry *context)
1951 {
1952         context->lo |= (1 << 2);
1953 }
1954
1955 /*
1956  * Set the PRE(Page Request Enable) field of a scalable mode context
1957  * entry.
1958  */
1959 static inline void context_set_sm_pre(struct context_entry *context)
1960 {
1961         context->lo |= (1 << 4);
1962 }
1963
1964 /* Convert value to context PASID directory size field coding. */
1965 #define context_pdts(pds)       (((pds) & 0x7) << 9)
1966
1967 static int domain_context_mapping_one(struct dmar_domain *domain,
1968                                       struct intel_iommu *iommu,
1969                                       struct pasid_table *table,
1970                                       u8 bus, u8 devfn)
1971 {
1972         u16 did = domain->iommu_did[iommu->seq_id];
1973         int translation = CONTEXT_TT_MULTI_LEVEL;
1974         struct device_domain_info *info = NULL;
1975         struct context_entry *context;
1976         unsigned long flags;
1977         int ret;
1978
1979         WARN_ON(did == 0);
1980
1981         if (hw_pass_through && domain_type_is_si(domain))
1982                 translation = CONTEXT_TT_PASS_THROUGH;
1983
1984         pr_debug("Set context mapping for %02x:%02x.%d\n",
1985                 bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
1986
1987         BUG_ON(!domain->pgd);
1988
1989         spin_lock_irqsave(&device_domain_lock, flags);
1990         spin_lock(&iommu->lock);
1991
1992         ret = -ENOMEM;
1993         context = iommu_context_addr(iommu, bus, devfn, 1);
1994         if (!context)
1995                 goto out_unlock;
1996
1997         ret = 0;
1998         if (context_present(context))
1999                 goto out_unlock;
2000
2001         /*
2002          * For kdump cases, old valid entries may be cached due to the
2003          * in-flight DMA and copied pgtable, but there is no unmapping
2004          * behaviour for them, thus we need an explicit cache flush for
2005          * the newly-mapped device. For kdump, at this point, the device
2006          * is supposed to finish reset at its driver probe stage, so no
2007          * in-flight DMA will exist, and we don't need to worry anymore
2008          * hereafter.
2009          */
2010         if (context_copied(context)) {
2011                 u16 did_old = context_domain_id(context);
2012
2013                 if (did_old < cap_ndoms(iommu->cap)) {
2014                         iommu->flush.flush_context(iommu, did_old,
2015                                                    (((u16)bus) << 8) | devfn,
2016                                                    DMA_CCMD_MASK_NOBIT,
2017                                                    DMA_CCMD_DEVICE_INVL);
2018                         iommu->flush.flush_iotlb(iommu, did_old, 0, 0,
2019                                                  DMA_TLB_DSI_FLUSH);
2020                 }
2021         }
2022
2023         context_clear_entry(context);
2024
2025         if (sm_supported(iommu)) {
2026                 unsigned long pds;
2027
2028                 WARN_ON(!table);
2029
2030                 /* Setup the PASID DIR pointer: */
2031                 pds = context_get_sm_pds(table);
2032                 context->lo = (u64)virt_to_phys(table->table) |
2033                                 context_pdts(pds);
2034
2035                 /* Setup the RID_PASID field: */
2036                 context_set_sm_rid2pasid(context, PASID_RID2PASID);
2037
2038                 /*
2039                  * Setup the Device-TLB enable bit and Page request
2040                  * Enable bit:
2041                  */
2042                 info = iommu_support_dev_iotlb(domain, iommu, bus, devfn);
2043                 if (info && info->ats_supported)
2044                         context_set_sm_dte(context);
2045                 if (info && info->pri_supported)
2046                         context_set_sm_pre(context);
2047         } else {
2048                 struct dma_pte *pgd = domain->pgd;
2049                 int agaw;
2050
2051                 context_set_domain_id(context, did);
2052
2053                 if (translation != CONTEXT_TT_PASS_THROUGH) {
2054                         /*
2055                          * Skip top levels of page tables for iommu which has
2056                          * less agaw than default. Unnecessary for PT mode.
2057                          */
2058                         for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) {
2059                                 ret = -ENOMEM;
2060                                 pgd = phys_to_virt(dma_pte_addr(pgd));
2061                                 if (!dma_pte_present(pgd))
2062                                         goto out_unlock;
2063                         }
2064
2065                         info = iommu_support_dev_iotlb(domain, iommu, bus, devfn);
2066                         if (info && info->ats_supported)
2067                                 translation = CONTEXT_TT_DEV_IOTLB;
2068                         else
2069                                 translation = CONTEXT_TT_MULTI_LEVEL;
2070
2071                         context_set_address_root(context, virt_to_phys(pgd));
2072                         context_set_address_width(context, agaw);
2073                 } else {
2074                         /*
2075                          * In pass through mode, AW must be programmed to
2076                          * indicate the largest AGAW value supported by
2077                          * hardware. And ASR is ignored by hardware.
2078                          */
2079                         context_set_address_width(context, iommu->msagaw);
2080                 }
2081
2082                 context_set_translation_type(context, translation);
2083         }
2084
2085         context_set_fault_enable(context);
2086         context_set_present(context);
2087         domain_flush_cache(domain, context, sizeof(*context));
2088
2089         /*
2090          * It's a non-present to present mapping. If hardware doesn't cache
2091          * non-present entry we only need to flush the write-buffer. If the
2092          * _does_ cache non-present entries, then it does so in the special
2093          * domain #0, which we have to flush:
2094          */
2095         if (cap_caching_mode(iommu->cap)) {
2096                 iommu->flush.flush_context(iommu, 0,
2097                                            (((u16)bus) << 8) | devfn,
2098                                            DMA_CCMD_MASK_NOBIT,
2099                                            DMA_CCMD_DEVICE_INVL);
2100                 iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH);
2101         } else {
2102                 iommu_flush_write_buffer(iommu);
2103         }
2104         iommu_enable_dev_iotlb(info);
2105
2106         ret = 0;
2107
2108 out_unlock:
2109         spin_unlock(&iommu->lock);
2110         spin_unlock_irqrestore(&device_domain_lock, flags);
2111
2112         return ret;
2113 }
2114
2115 struct domain_context_mapping_data {
2116         struct dmar_domain *domain;
2117         struct intel_iommu *iommu;
2118         struct pasid_table *table;
2119 };
2120
2121 static int domain_context_mapping_cb(struct pci_dev *pdev,
2122                                      u16 alias, void *opaque)
2123 {
2124         struct domain_context_mapping_data *data = opaque;
2125
2126         return domain_context_mapping_one(data->domain, data->iommu,
2127                                           data->table, PCI_BUS_NUM(alias),
2128                                           alias & 0xff);
2129 }
2130
2131 static int
2132 domain_context_mapping(struct dmar_domain *domain, struct device *dev)
2133 {
2134         struct domain_context_mapping_data data;
2135         struct pasid_table *table;
2136         struct intel_iommu *iommu;
2137         u8 bus, devfn;
2138
2139         iommu = device_to_iommu(dev, &bus, &devfn);
2140         if (!iommu)
2141                 return -ENODEV;
2142
2143         table = intel_pasid_get_table(dev);
2144
2145         if (!dev_is_pci(dev))
2146                 return domain_context_mapping_one(domain, iommu, table,
2147                                                   bus, devfn);
2148
2149         data.domain = domain;
2150         data.iommu = iommu;
2151         data.table = table;
2152
2153         return pci_for_each_dma_alias(to_pci_dev(dev),
2154                                       &domain_context_mapping_cb, &data);
2155 }
2156
2157 static int domain_context_mapped_cb(struct pci_dev *pdev,
2158                                     u16 alias, void *opaque)
2159 {
2160         struct intel_iommu *iommu = opaque;
2161
2162         return !device_context_mapped(iommu, PCI_BUS_NUM(alias), alias & 0xff);
2163 }
2164
2165 static int domain_context_mapped(struct device *dev)
2166 {
2167         struct intel_iommu *iommu;
2168         u8 bus, devfn;
2169
2170         iommu = device_to_iommu(dev, &bus, &devfn);
2171         if (!iommu)
2172                 return -ENODEV;
2173
2174         if (!dev_is_pci(dev))
2175                 return device_context_mapped(iommu, bus, devfn);
2176
2177         return !pci_for_each_dma_alias(to_pci_dev(dev),
2178                                        domain_context_mapped_cb, iommu);
2179 }
2180
2181 /* Returns a number of VTD pages, but aligned to MM page size */
2182 static inline unsigned long aligned_nrpages(unsigned long host_addr,
2183                                             size_t size)
2184 {
2185         host_addr &= ~PAGE_MASK;
2186         return PAGE_ALIGN(host_addr + size) >> VTD_PAGE_SHIFT;
2187 }
2188
2189 /* Return largest possible superpage level for a given mapping */
2190 static inline int hardware_largepage_caps(struct dmar_domain *domain,
2191                                           unsigned long iov_pfn,
2192                                           unsigned long phy_pfn,
2193                                           unsigned long pages)
2194 {
2195         int support, level = 1;
2196         unsigned long pfnmerge;
2197
2198         support = domain->iommu_superpage;
2199
2200         /* To use a large page, the virtual *and* physical addresses
2201            must be aligned to 2MiB/1GiB/etc. Lower bits set in either
2202            of them will mean we have to use smaller pages. So just
2203            merge them and check both at once. */
2204         pfnmerge = iov_pfn | phy_pfn;
2205
2206         while (support && !(pfnmerge & ~VTD_STRIDE_MASK)) {
2207                 pages >>= VTD_STRIDE_SHIFT;
2208                 if (!pages)
2209                         break;
2210                 pfnmerge >>= VTD_STRIDE_SHIFT;
2211                 level++;
2212                 support--;
2213         }
2214         return level;
2215 }
2216
2217 static int __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2218                             struct scatterlist *sg, unsigned long phys_pfn,
2219                             unsigned long nr_pages, int prot)
2220 {
2221         struct dma_pte *first_pte = NULL, *pte = NULL;
2222         phys_addr_t uninitialized_var(pteval);
2223         unsigned long sg_res = 0;
2224         unsigned int largepage_lvl = 0;
2225         unsigned long lvl_pages = 0;
2226
2227         BUG_ON(!domain_pfn_supported(domain, iov_pfn + nr_pages - 1));
2228
2229         if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
2230                 return -EINVAL;
2231
2232         prot &= DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP;
2233
2234         if (!sg) {
2235                 sg_res = nr_pages;
2236                 pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | prot;
2237         }
2238
2239         while (nr_pages > 0) {
2240                 uint64_t tmp;
2241
2242                 if (!sg_res) {
2243                         unsigned int pgoff = sg->offset & ~PAGE_MASK;
2244
2245                         sg_res = aligned_nrpages(sg->offset, sg->length);
2246                         sg->dma_address = ((dma_addr_t)iov_pfn << VTD_PAGE_SHIFT) + pgoff;
2247                         sg->dma_length = sg->length;
2248                         pteval = (sg_phys(sg) - pgoff) | prot;
2249                         phys_pfn = pteval >> VTD_PAGE_SHIFT;
2250                 }
2251
2252                 if (!pte) {
2253                         largepage_lvl = hardware_largepage_caps(domain, iov_pfn, phys_pfn, sg_res);
2254
2255                         first_pte = pte = pfn_to_dma_pte(domain, iov_pfn, &largepage_lvl);
2256                         if (!pte)
2257                                 return -ENOMEM;
2258                         /* It is large page*/
2259                         if (largepage_lvl > 1) {
2260                                 unsigned long nr_superpages, end_pfn;
2261
2262                                 pteval |= DMA_PTE_LARGE_PAGE;
2263                                 lvl_pages = lvl_to_nr_pages(largepage_lvl);
2264
2265                                 nr_superpages = sg_res / lvl_pages;
2266                                 end_pfn = iov_pfn + nr_superpages * lvl_pages - 1;
2267
2268                                 /*
2269                                  * Ensure that old small page tables are
2270                                  * removed to make room for superpage(s).
2271                                  * We're adding new large pages, so make sure
2272                                  * we don't remove their parent tables.
2273                                  */
2274                                 dma_pte_free_pagetable(domain, iov_pfn, end_pfn,
2275                                                        largepage_lvl + 1);
2276                         } else {
2277                                 pteval &= ~(uint64_t)DMA_PTE_LARGE_PAGE;
2278                         }
2279
2280                 }
2281                 /* We don't need lock here, nobody else
2282                  * touches the iova range
2283                  */
2284                 tmp = cmpxchg64_local(&pte->val, 0ULL, pteval);
2285                 if (tmp) {
2286                         static int dumps = 5;
2287                         pr_crit("ERROR: DMA PTE for vPFN 0x%lx already set (to %llx not %llx)\n",
2288                                 iov_pfn, tmp, (unsigned long long)pteval);
2289                         if (dumps) {
2290                                 dumps--;
2291                                 debug_dma_dump_mappings(NULL);
2292                         }
2293                         WARN_ON(1);
2294                 }
2295
2296                 lvl_pages = lvl_to_nr_pages(largepage_lvl);
2297
2298                 BUG_ON(nr_pages < lvl_pages);
2299                 BUG_ON(sg_res < lvl_pages);
2300
2301                 nr_pages -= lvl_pages;
2302                 iov_pfn += lvl_pages;
2303                 phys_pfn += lvl_pages;
2304                 pteval += lvl_pages * VTD_PAGE_SIZE;
2305                 sg_res -= lvl_pages;
2306
2307                 /* If the next PTE would be the first in a new page, then we
2308                    need to flush the cache on the entries we've just written.
2309                    And then we'll need to recalculate 'pte', so clear it and
2310                    let it get set again in the if (!pte) block above.
2311
2312                    If we're done (!nr_pages) we need to flush the cache too.
2313
2314                    Also if we've been setting superpages, we may need to
2315                    recalculate 'pte' and switch back to smaller pages for the
2316                    end of the mapping, if the trailing size is not enough to
2317                    use another superpage (i.e. sg_res < lvl_pages). */
2318                 pte++;
2319                 if (!nr_pages || first_pte_in_page(pte) ||
2320                     (largepage_lvl > 1 && sg_res < lvl_pages)) {
2321                         domain_flush_cache(domain, first_pte,
2322                                            (void *)pte - (void *)first_pte);
2323                         pte = NULL;
2324                 }
2325
2326                 if (!sg_res && nr_pages)
2327                         sg = sg_next(sg);
2328         }
2329         return 0;
2330 }
2331
2332 static int domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2333                           struct scatterlist *sg, unsigned long phys_pfn,
2334                           unsigned long nr_pages, int prot)
2335 {
2336         int iommu_id, ret;
2337         struct intel_iommu *iommu;
2338
2339         /* Do the real mapping first */
2340         ret = __domain_mapping(domain, iov_pfn, sg, phys_pfn, nr_pages, prot);
2341         if (ret)
2342                 return ret;
2343
2344         for_each_domain_iommu(iommu_id, domain) {
2345                 iommu = g_iommus[iommu_id];
2346                 __mapping_notify_one(iommu, domain, iov_pfn, nr_pages);
2347         }
2348
2349         return 0;
2350 }
2351
2352 static inline int domain_sg_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2353                                     struct scatterlist *sg, unsigned long nr_pages,
2354                                     int prot)
2355 {
2356         return domain_mapping(domain, iov_pfn, sg, 0, nr_pages, prot);
2357 }
2358
2359 static inline int domain_pfn_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2360                                      unsigned long phys_pfn, unsigned long nr_pages,
2361                                      int prot)
2362 {
2363         return domain_mapping(domain, iov_pfn, NULL, phys_pfn, nr_pages, prot);
2364 }
2365
2366 static void domain_context_clear_one(struct intel_iommu *iommu, u8 bus, u8 devfn)
2367 {
2368         unsigned long flags;
2369         struct context_entry *context;
2370         u16 did_old;
2371
2372         if (!iommu)
2373                 return;
2374
2375         spin_lock_irqsave(&iommu->lock, flags);
2376         context = iommu_context_addr(iommu, bus, devfn, 0);
2377         if (!context) {
2378                 spin_unlock_irqrestore(&iommu->lock, flags);
2379                 return;
2380         }
2381         did_old = context_domain_id(context);
2382         context_clear_entry(context);
2383         __iommu_flush_cache(iommu, context, sizeof(*context));
2384         spin_unlock_irqrestore(&iommu->lock, flags);
2385         iommu->flush.flush_context(iommu,
2386                                    did_old,
2387                                    (((u16)bus) << 8) | devfn,
2388                                    DMA_CCMD_MASK_NOBIT,
2389                                    DMA_CCMD_DEVICE_INVL);
2390         iommu->flush.flush_iotlb(iommu,
2391                                  did_old,
2392                                  0,
2393                                  0,
2394                                  DMA_TLB_DSI_FLUSH);
2395 }
2396
2397 static inline void unlink_domain_info(struct device_domain_info *info)
2398 {
2399         assert_spin_locked(&device_domain_lock);
2400         list_del(&info->link);
2401         list_del(&info->global);
2402         if (info->dev)
2403                 info->dev->archdata.iommu = NULL;
2404 }
2405
2406 static void domain_remove_dev_info(struct dmar_domain *domain)
2407 {
2408         struct device_domain_info *info, *tmp;
2409         unsigned long flags;
2410
2411         spin_lock_irqsave(&device_domain_lock, flags);
2412         list_for_each_entry_safe(info, tmp, &domain->devices, link)
2413                 __dmar_remove_one_dev_info(info);
2414         spin_unlock_irqrestore(&device_domain_lock, flags);
2415 }
2416
2417 static struct dmar_domain *find_domain(struct device *dev)
2418 {
2419         struct device_domain_info *info;
2420
2421         if (unlikely(dev->archdata.iommu == DEFER_DEVICE_DOMAIN_INFO ||
2422                      dev->archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO))
2423                 return NULL;
2424
2425         if (dev_is_pci(dev))
2426                 dev = &pci_real_dma_dev(to_pci_dev(dev))->dev;
2427
2428         /* No lock here, assumes no domain exit in normal case */
2429         info = dev->archdata.iommu;
2430         if (likely(info))
2431                 return info->domain;
2432
2433         return NULL;
2434 }
2435
2436 static struct dmar_domain *deferred_attach_domain(struct device *dev)
2437 {
2438         if (unlikely(dev->archdata.iommu == DEFER_DEVICE_DOMAIN_INFO)) {
2439                 struct iommu_domain *domain;
2440
2441                 dev->archdata.iommu = NULL;
2442                 domain = iommu_get_domain_for_dev(dev);
2443                 if (domain)
2444                         intel_iommu_attach_device(domain, dev);
2445         }
2446
2447         return find_domain(dev);
2448 }
2449
2450 static inline struct device_domain_info *
2451 dmar_search_domain_by_dev_info(int segment, int bus, int devfn)
2452 {
2453         struct device_domain_info *info;
2454
2455         list_for_each_entry(info, &device_domain_list, global)
2456                 if (info->iommu->segment == segment && info->bus == bus &&
2457                     info->devfn == devfn)
2458                         return info;
2459
2460         return NULL;
2461 }
2462
2463 static struct dmar_domain *dmar_insert_one_dev_info(struct intel_iommu *iommu,
2464                                                     int bus, int devfn,
2465                                                     struct device *dev,
2466                                                     struct dmar_domain *domain)
2467 {
2468         struct dmar_domain *found = NULL;
2469         struct device_domain_info *info;
2470         unsigned long flags;
2471         int ret;
2472
2473         info = alloc_devinfo_mem();
2474         if (!info)
2475                 return NULL;
2476
2477         info->bus = bus;
2478         info->devfn = devfn;
2479         info->ats_supported = info->pasid_supported = info->pri_supported = 0;
2480         info->ats_enabled = info->pasid_enabled = info->pri_enabled = 0;
2481         info->ats_qdep = 0;
2482         info->dev = dev;
2483         info->domain = domain;
2484         info->iommu = iommu;
2485         info->pasid_table = NULL;
2486         info->auxd_enabled = 0;
2487         INIT_LIST_HEAD(&info->auxiliary_domains);
2488
2489         if (dev && dev_is_pci(dev)) {
2490                 struct pci_dev *pdev = to_pci_dev(info->dev);
2491
2492                 if (!pdev->untrusted &&
2493                     !pci_ats_disabled() &&
2494                     ecap_dev_iotlb_support(iommu->ecap) &&
2495                     pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_ATS) &&
2496                     dmar_find_matched_atsr_unit(pdev))
2497                         info->ats_supported = 1;
2498
2499                 if (sm_supported(iommu)) {
2500                         if (pasid_supported(iommu)) {
2501                                 int features = pci_pasid_features(pdev);
2502                                 if (features >= 0)
2503                                         info->pasid_supported = features | 1;
2504                         }
2505
2506                         if (info->ats_supported && ecap_prs(iommu->ecap) &&
2507                             pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_PRI))
2508                                 info->pri_supported = 1;
2509                 }
2510         }
2511
2512         spin_lock_irqsave(&device_domain_lock, flags);
2513         if (dev)
2514                 found = find_domain(dev);
2515
2516         if (!found) {
2517                 struct device_domain_info *info2;
2518                 info2 = dmar_search_domain_by_dev_info(iommu->segment, bus, devfn);
2519                 if (info2) {
2520                         found      = info2->domain;
2521                         info2->dev = dev;
2522                 }
2523         }
2524
2525         if (found) {
2526                 spin_unlock_irqrestore(&device_domain_lock, flags);
2527                 free_devinfo_mem(info);
2528                 /* Caller must free the original domain */
2529                 return found;
2530         }
2531
2532         spin_lock(&iommu->lock);
2533         ret = domain_attach_iommu(domain, iommu);
2534         spin_unlock(&iommu->lock);
2535
2536         if (ret) {
2537                 spin_unlock_irqrestore(&device_domain_lock, flags);
2538                 free_devinfo_mem(info);
2539                 return NULL;
2540         }
2541
2542         list_add(&info->link, &domain->devices);
2543         list_add(&info->global, &device_domain_list);
2544         if (dev)
2545                 dev->archdata.iommu = info;
2546         spin_unlock_irqrestore(&device_domain_lock, flags);
2547
2548         /* PASID table is mandatory for a PCI device in scalable mode. */
2549         if (dev && dev_is_pci(dev) && sm_supported(iommu)) {
2550                 ret = intel_pasid_alloc_table(dev);
2551                 if (ret) {
2552                         dev_err(dev, "PASID table allocation failed\n");
2553                         dmar_remove_one_dev_info(dev);
2554                         return NULL;
2555                 }
2556
2557                 /* Setup the PASID entry for requests without PASID: */
2558                 spin_lock(&iommu->lock);
2559                 if (hw_pass_through && domain_type_is_si(domain))
2560                         ret = intel_pasid_setup_pass_through(iommu, domain,
2561                                         dev, PASID_RID2PASID);
2562                 else
2563                         ret = intel_pasid_setup_second_level(iommu, domain,
2564                                         dev, PASID_RID2PASID);
2565                 spin_unlock(&iommu->lock);
2566                 if (ret) {
2567                         dev_err(dev, "Setup RID2PASID failed\n");
2568                         dmar_remove_one_dev_info(dev);
2569                         return NULL;
2570                 }
2571         }
2572
2573         if (dev && domain_context_mapping(domain, dev)) {
2574                 dev_err(dev, "Domain context map failed\n");
2575                 dmar_remove_one_dev_info(dev);
2576                 return NULL;
2577         }
2578
2579         return domain;
2580 }
2581
2582 static int get_last_alias(struct pci_dev *pdev, u16 alias, void *opaque)
2583 {
2584         *(u16 *)opaque = alias;
2585         return 0;
2586 }
2587
2588 static struct dmar_domain *find_or_alloc_domain(struct device *dev, int gaw)
2589 {
2590         struct device_domain_info *info;
2591         struct dmar_domain *domain = NULL;
2592         struct intel_iommu *iommu;
2593         u16 dma_alias;
2594         unsigned long flags;
2595         u8 bus, devfn;
2596
2597         iommu = device_to_iommu(dev, &bus, &devfn);
2598         if (!iommu)
2599                 return NULL;
2600
2601         if (dev_is_pci(dev)) {
2602                 struct pci_dev *pdev = to_pci_dev(dev);
2603
2604                 pci_for_each_dma_alias(pdev, get_last_alias, &dma_alias);
2605
2606                 spin_lock_irqsave(&device_domain_lock, flags);
2607                 info = dmar_search_domain_by_dev_info(pci_domain_nr(pdev->bus),
2608                                                       PCI_BUS_NUM(dma_alias),
2609                                                       dma_alias & 0xff);
2610                 if (info) {
2611                         iommu = info->iommu;
2612                         domain = info->domain;
2613                 }
2614                 spin_unlock_irqrestore(&device_domain_lock, flags);
2615
2616                 /* DMA alias already has a domain, use it */
2617                 if (info)
2618                         goto out;
2619         }
2620
2621         /* Allocate and initialize new domain for the device */
2622         domain = alloc_domain(0);
2623         if (!domain)
2624                 return NULL;
2625         if (domain_init(domain, iommu, gaw)) {
2626                 domain_exit(domain);
2627                 return NULL;
2628         }
2629
2630 out:
2631         return domain;
2632 }
2633
2634 static struct dmar_domain *set_domain_for_dev(struct device *dev,
2635                                               struct dmar_domain *domain)
2636 {
2637         struct intel_iommu *iommu;
2638         struct dmar_domain *tmp;
2639         u16 req_id, dma_alias;
2640         u8 bus, devfn;
2641
2642         iommu = device_to_iommu(dev, &bus, &devfn);
2643         if (!iommu)
2644                 return NULL;
2645
2646         req_id = ((u16)bus << 8) | devfn;
2647
2648         if (dev_is_pci(dev)) {
2649                 struct pci_dev *pdev = to_pci_dev(dev);
2650
2651                 pci_for_each_dma_alias(pdev, get_last_alias, &dma_alias);
2652
2653                 /* register PCI DMA alias device */
2654                 if (req_id != dma_alias) {
2655                         tmp = dmar_insert_one_dev_info(iommu, PCI_BUS_NUM(dma_alias),
2656                                         dma_alias & 0xff, NULL, domain);
2657
2658                         if (!tmp || tmp != domain)
2659                                 return tmp;
2660                 }
2661         }
2662
2663         tmp = dmar_insert_one_dev_info(iommu, bus, devfn, dev, domain);
2664         if (!tmp || tmp != domain)
2665                 return tmp;
2666
2667         return domain;
2668 }
2669
2670 static int iommu_domain_identity_map(struct dmar_domain *domain,
2671                                      unsigned long long start,
2672                                      unsigned long long end)
2673 {
2674         unsigned long first_vpfn = start >> VTD_PAGE_SHIFT;
2675         unsigned long last_vpfn = end >> VTD_PAGE_SHIFT;
2676
2677         if (!reserve_iova(&domain->iovad, dma_to_mm_pfn(first_vpfn),
2678                           dma_to_mm_pfn(last_vpfn))) {
2679                 pr_err("Reserving iova failed\n");
2680                 return -ENOMEM;
2681         }
2682
2683         pr_debug("Mapping reserved region %llx-%llx\n", start, end);
2684         /*
2685          * RMRR range might have overlap with physical memory range,
2686          * clear it first
2687          */
2688         dma_pte_clear_range(domain, first_vpfn, last_vpfn);
2689
2690         return __domain_mapping(domain, first_vpfn, NULL,
2691                                 first_vpfn, last_vpfn - first_vpfn + 1,
2692                                 DMA_PTE_READ|DMA_PTE_WRITE);
2693 }
2694
2695 static int domain_prepare_identity_map(struct device *dev,
2696                                        struct dmar_domain *domain,
2697                                        unsigned long long start,
2698                                        unsigned long long end)
2699 {
2700         /* For _hardware_ passthrough, don't bother. But for software
2701            passthrough, we do it anyway -- it may indicate a memory
2702            range which is reserved in E820, so which didn't get set
2703            up to start with in si_domain */
2704         if (domain == si_domain && hw_pass_through) {
2705                 dev_warn(dev, "Ignoring identity map for HW passthrough [0x%Lx - 0x%Lx]\n",
2706                          start, end);
2707                 return 0;
2708         }
2709
2710         dev_info(dev, "Setting identity map [0x%Lx - 0x%Lx]\n", start, end);
2711
2712         if (end < start) {
2713                 WARN(1, "Your BIOS is broken; RMRR ends before it starts!\n"
2714                         "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
2715                         dmi_get_system_info(DMI_BIOS_VENDOR),
2716                         dmi_get_system_info(DMI_BIOS_VERSION),
2717                      dmi_get_system_info(DMI_PRODUCT_VERSION));
2718                 return -EIO;
2719         }
2720
2721         if (end >> agaw_to_width(domain->agaw)) {
2722                 WARN(1, "Your BIOS is broken; RMRR exceeds permitted address width (%d bits)\n"
2723                      "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
2724                      agaw_to_width(domain->agaw),
2725                      dmi_get_system_info(DMI_BIOS_VENDOR),
2726                      dmi_get_system_info(DMI_BIOS_VERSION),
2727                      dmi_get_system_info(DMI_PRODUCT_VERSION));
2728                 return -EIO;
2729         }
2730
2731         return iommu_domain_identity_map(domain, start, end);
2732 }
2733
2734 static int md_domain_init(struct dmar_domain *domain, int guest_width);
2735
2736 static int __init si_domain_init(int hw)
2737 {
2738         struct dmar_rmrr_unit *rmrr;
2739         struct device *dev;
2740         int i, nid, ret;
2741
2742         si_domain = alloc_domain(DOMAIN_FLAG_STATIC_IDENTITY);
2743         if (!si_domain)
2744                 return -EFAULT;
2745
2746         if (md_domain_init(si_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
2747                 domain_exit(si_domain);
2748                 return -EFAULT;
2749         }
2750
2751         if (hw)
2752                 return 0;
2753
2754         for_each_online_node(nid) {
2755                 unsigned long start_pfn, end_pfn;
2756                 int i;
2757
2758                 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
2759                         ret = iommu_domain_identity_map(si_domain,
2760                                         PFN_PHYS(start_pfn), PFN_PHYS(end_pfn));
2761                         if (ret)
2762                                 return ret;
2763                 }
2764         }
2765
2766         /*
2767          * Normally we use DMA domains for devices which have RMRRs. But we
2768          * loose this requirement for graphic and usb devices. Identity map
2769          * the RMRRs for graphic and USB devices so that they could use the
2770          * si_domain.
2771          */
2772         for_each_rmrr_units(rmrr) {
2773                 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
2774                                           i, dev) {
2775                         unsigned long long start = rmrr->base_address;
2776                         unsigned long long end = rmrr->end_address;
2777
2778                         if (device_is_rmrr_locked(dev))
2779                                 continue;
2780
2781                         if (WARN_ON(end < start ||
2782                                     end >> agaw_to_width(si_domain->agaw)))
2783                                 continue;
2784
2785                         ret = iommu_domain_identity_map(si_domain, start, end);
2786                         if (ret)
2787                                 return ret;
2788                 }
2789         }
2790
2791         return 0;
2792 }
2793
2794 static int identity_mapping(struct device *dev)
2795 {
2796         struct device_domain_info *info;
2797
2798         info = dev->archdata.iommu;
2799         if (info && info != DUMMY_DEVICE_DOMAIN_INFO && info != DEFER_DEVICE_DOMAIN_INFO)
2800                 return (info->domain == si_domain);
2801
2802         return 0;
2803 }
2804
2805 static int domain_add_dev_info(struct dmar_domain *domain, struct device *dev)
2806 {
2807         struct dmar_domain *ndomain;
2808         struct intel_iommu *iommu;
2809         u8 bus, devfn;
2810
2811         iommu = device_to_iommu(dev, &bus, &devfn);
2812         if (!iommu)
2813                 return -ENODEV;
2814
2815         ndomain = dmar_insert_one_dev_info(iommu, bus, devfn, dev, domain);
2816         if (ndomain != domain)
2817                 return -EBUSY;
2818
2819         return 0;
2820 }
2821
2822 static bool device_has_rmrr(struct device *dev)
2823 {
2824         struct dmar_rmrr_unit *rmrr;
2825         struct device *tmp;
2826         int i;
2827
2828         rcu_read_lock();
2829         for_each_rmrr_units(rmrr) {
2830                 /*
2831                  * Return TRUE if this RMRR contains the device that
2832                  * is passed in.
2833                  */
2834                 for_each_active_dev_scope(rmrr->devices,
2835                                           rmrr->devices_cnt, i, tmp)
2836                         if (tmp == dev ||
2837                             is_downstream_to_pci_bridge(dev, tmp)) {
2838                                 rcu_read_unlock();
2839                                 return true;
2840                         }
2841         }
2842         rcu_read_unlock();
2843         return false;
2844 }
2845
2846 /**
2847  * device_rmrr_is_relaxable - Test whether the RMRR of this device
2848  * is relaxable (ie. is allowed to be not enforced under some conditions)
2849  * @dev: device handle
2850  *
2851  * We assume that PCI USB devices with RMRRs have them largely
2852  * for historical reasons and that the RMRR space is not actively used post
2853  * boot.  This exclusion may change if vendors begin to abuse it.
2854  *
2855  * The same exception is made for graphics devices, with the requirement that
2856  * any use of the RMRR regions will be torn down before assigning the device
2857  * to a guest.
2858  *
2859  * Return: true if the RMRR is relaxable, false otherwise
2860  */
2861 static bool device_rmrr_is_relaxable(struct device *dev)
2862 {
2863         struct pci_dev *pdev;
2864
2865         if (!dev_is_pci(dev))
2866                 return false;
2867
2868         pdev = to_pci_dev(dev);
2869         if (IS_USB_DEVICE(pdev) || IS_GFX_DEVICE(pdev))
2870                 return true;
2871         else
2872                 return false;
2873 }
2874
2875 /*
2876  * There are a couple cases where we need to restrict the functionality of
2877  * devices associated with RMRRs.  The first is when evaluating a device for
2878  * identity mapping because problems exist when devices are moved in and out
2879  * of domains and their respective RMRR information is lost.  This means that
2880  * a device with associated RMRRs will never be in a "passthrough" domain.
2881  * The second is use of the device through the IOMMU API.  This interface
2882  * expects to have full control of the IOVA space for the device.  We cannot
2883  * satisfy both the requirement that RMRR access is maintained and have an
2884  * unencumbered IOVA space.  We also have no ability to quiesce the device's
2885  * use of the RMRR space or even inform the IOMMU API user of the restriction.
2886  * We therefore prevent devices associated with an RMRR from participating in
2887  * the IOMMU API, which eliminates them from device assignment.
2888  *
2889  * In both cases, devices which have relaxable RMRRs are not concerned by this
2890  * restriction. See device_rmrr_is_relaxable comment.
2891  */
2892 static bool device_is_rmrr_locked(struct device *dev)
2893 {
2894         if (!device_has_rmrr(dev))
2895                 return false;
2896
2897         if (device_rmrr_is_relaxable(dev))
2898                 return false;
2899
2900         return true;
2901 }
2902
2903 /*
2904  * Return the required default domain type for a specific device.
2905  *
2906  * @dev: the device in query
2907  * @startup: true if this is during early boot
2908  *
2909  * Returns:
2910  *  - IOMMU_DOMAIN_DMA: device requires a dynamic mapping domain
2911  *  - IOMMU_DOMAIN_IDENTITY: device requires an identical mapping domain
2912  *  - 0: both identity and dynamic domains work for this device
2913  */
2914 static int device_def_domain_type(struct device *dev)
2915 {
2916         if (dev_is_pci(dev)) {
2917                 struct pci_dev *pdev = to_pci_dev(dev);
2918
2919                 if (device_is_rmrr_locked(dev))
2920                         return IOMMU_DOMAIN_DMA;
2921
2922                 /*
2923                  * Prevent any device marked as untrusted from getting
2924                  * placed into the statically identity mapping domain.
2925                  */
2926                 if (pdev->untrusted)
2927                         return IOMMU_DOMAIN_DMA;
2928
2929                 if ((iommu_identity_mapping & IDENTMAP_AZALIA) && IS_AZALIA(pdev))
2930                         return IOMMU_DOMAIN_IDENTITY;
2931
2932                 if ((iommu_identity_mapping & IDENTMAP_GFX) && IS_GFX_DEVICE(pdev))
2933                         return IOMMU_DOMAIN_IDENTITY;
2934
2935                 /*
2936                  * We want to start off with all devices in the 1:1 domain, and
2937                  * take them out later if we find they can't access all of memory.
2938                  *
2939                  * However, we can't do this for PCI devices behind bridges,
2940                  * because all PCI devices behind the same bridge will end up
2941                  * with the same source-id on their transactions.
2942                  *
2943                  * Practically speaking, we can't change things around for these
2944                  * devices at run-time, because we can't be sure there'll be no
2945                  * DMA transactions in flight for any of their siblings.
2946                  *
2947                  * So PCI devices (unless they're on the root bus) as well as
2948                  * their parent PCI-PCI or PCIe-PCI bridges must be left _out_ of
2949                  * the 1:1 domain, just in _case_ one of their siblings turns out
2950                  * not to be able to map all of memory.
2951                  */
2952                 if (!pci_is_pcie(pdev)) {
2953                         if (!pci_is_root_bus(pdev->bus))
2954                                 return IOMMU_DOMAIN_DMA;
2955                         if (pdev->class >> 8 == PCI_CLASS_BRIDGE_PCI)
2956                                 return IOMMU_DOMAIN_DMA;
2957                 } else if (pci_pcie_type(pdev) == PCI_EXP_TYPE_PCI_BRIDGE)
2958                         return IOMMU_DOMAIN_DMA;
2959         } else {
2960                 if (device_has_rmrr(dev))
2961                         return IOMMU_DOMAIN_DMA;
2962         }
2963
2964         return (iommu_identity_mapping & IDENTMAP_ALL) ?
2965                         IOMMU_DOMAIN_IDENTITY : 0;
2966 }
2967
2968 static void intel_iommu_init_qi(struct intel_iommu *iommu)
2969 {
2970         /*
2971          * Start from the sane iommu hardware state.
2972          * If the queued invalidation is already initialized by us
2973          * (for example, while enabling interrupt-remapping) then
2974          * we got the things already rolling from a sane state.
2975          */
2976         if (!iommu->qi) {
2977                 /*
2978                  * Clear any previous faults.
2979                  */
2980                 dmar_fault(-1, iommu);
2981                 /*
2982                  * Disable queued invalidation if supported and already enabled
2983                  * before OS handover.
2984                  */
2985                 dmar_disable_qi(iommu);
2986         }
2987
2988         if (dmar_enable_qi(iommu)) {
2989                 /*
2990                  * Queued Invalidate not enabled, use Register Based Invalidate
2991                  */
2992                 iommu->flush.flush_context = __iommu_flush_context;
2993                 iommu->flush.flush_iotlb = __iommu_flush_iotlb;
2994                 pr_info("%s: Using Register based invalidation\n",
2995                         iommu->name);
2996         } else {
2997                 iommu->flush.flush_context = qi_flush_context;
2998                 iommu->flush.flush_iotlb = qi_flush_iotlb;
2999                 pr_info("%s: Using Queued invalidation\n", iommu->name);
3000         }
3001 }
3002
3003 static int copy_context_table(struct intel_iommu *iommu,
3004                               struct root_entry *old_re,
3005                               struct context_entry **tbl,
3006                               int bus, bool ext)
3007 {
3008         int tbl_idx, pos = 0, idx, devfn, ret = 0, did;
3009         struct context_entry *new_ce = NULL, ce;
3010         struct context_entry *old_ce = NULL;
3011         struct root_entry re;
3012         phys_addr_t old_ce_phys;
3013
3014         tbl_idx = ext ? bus * 2 : bus;
3015         memcpy(&re, old_re, sizeof(re));
3016
3017         for (devfn = 0; devfn < 256; devfn++) {
3018                 /* First calculate the correct index */
3019                 idx = (ext ? devfn * 2 : devfn) % 256;
3020
3021                 if (idx == 0) {
3022                         /* First save what we may have and clean up */
3023                         if (new_ce) {
3024                                 tbl[tbl_idx] = new_ce;
3025                                 __iommu_flush_cache(iommu, new_ce,
3026                                                     VTD_PAGE_SIZE);
3027                                 pos = 1;
3028                         }
3029
3030                         if (old_ce)
3031                                 memunmap(old_ce);
3032
3033                         ret = 0;
3034                         if (devfn < 0x80)
3035                                 old_ce_phys = root_entry_lctp(&re);
3036                         else
3037                                 old_ce_phys = root_entry_uctp(&re);
3038
3039                         if (!old_ce_phys) {
3040                                 if (ext && devfn == 0) {
3041                                         /* No LCTP, try UCTP */
3042                                         devfn = 0x7f;
3043                                         continue;
3044                                 } else {
3045                                         goto out;
3046                                 }
3047                         }
3048
3049                         ret = -ENOMEM;
3050                         old_ce = memremap(old_ce_phys, PAGE_SIZE,
3051                                         MEMREMAP_WB);
3052                         if (!old_ce)
3053                                 goto out;
3054
3055                         new_ce = alloc_pgtable_page(iommu->node);
3056                         if (!new_ce)
3057                                 goto out_unmap;
3058
3059                         ret = 0;
3060                 }
3061
3062                 /* Now copy the context entry */
3063                 memcpy(&ce, old_ce + idx, sizeof(ce));
3064
3065                 if (!__context_present(&ce))
3066                         continue;
3067
3068                 did = context_domain_id(&ce);
3069                 if (did >= 0 && did < cap_ndoms(iommu->cap))
3070                         set_bit(did, iommu->domain_ids);
3071
3072                 /*
3073                  * We need a marker for copied context entries. This
3074                  * marker needs to work for the old format as well as
3075                  * for extended context entries.
3076                  *
3077                  * Bit 67 of the context entry is used. In the old
3078                  * format this bit is available to software, in the
3079                  * extended format it is the PGE bit, but PGE is ignored
3080                  * by HW if PASIDs are disabled (and thus still
3081                  * available).
3082                  *
3083                  * So disable PASIDs first and then mark the entry
3084                  * copied. This means that we don't copy PASID
3085                  * translations from the old kernel, but this is fine as
3086                  * faults there are not fatal.
3087                  */
3088                 context_clear_pasid_enable(&ce);
3089                 context_set_copied(&ce);
3090
3091                 new_ce[idx] = ce;
3092         }
3093
3094         tbl[tbl_idx + pos] = new_ce;
3095
3096         __iommu_flush_cache(iommu, new_ce, VTD_PAGE_SIZE);
3097
3098 out_unmap:
3099         memunmap(old_ce);
3100
3101 out:
3102         return ret;
3103 }
3104
3105 static int copy_translation_tables(struct intel_iommu *iommu)
3106 {
3107         struct context_entry **ctxt_tbls;
3108         struct root_entry *old_rt;
3109         phys_addr_t old_rt_phys;
3110         int ctxt_table_entries;
3111         unsigned long flags;
3112         u64 rtaddr_reg;
3113         int bus, ret;
3114         bool new_ext, ext;
3115
3116         rtaddr_reg = dmar_readq(iommu->reg + DMAR_RTADDR_REG);
3117         ext        = !!(rtaddr_reg & DMA_RTADDR_RTT);
3118         new_ext    = !!ecap_ecs(iommu->ecap);
3119
3120         /*
3121          * The RTT bit can only be changed when translation is disabled,
3122          * but disabling translation means to open a window for data
3123          * corruption. So bail out and don't copy anything if we would
3124          * have to change the bit.
3125          */
3126         if (new_ext != ext)
3127                 return -EINVAL;
3128
3129         old_rt_phys = rtaddr_reg & VTD_PAGE_MASK;
3130         if (!old_rt_phys)
3131                 return -EINVAL;
3132
3133         old_rt = memremap(old_rt_phys, PAGE_SIZE, MEMREMAP_WB);
3134         if (!old_rt)
3135                 return -ENOMEM;
3136
3137         /* This is too big for the stack - allocate it from slab */
3138         ctxt_table_entries = ext ? 512 : 256;
3139         ret = -ENOMEM;
3140         ctxt_tbls = kcalloc(ctxt_table_entries, sizeof(void *), GFP_KERNEL);
3141         if (!ctxt_tbls)
3142                 goto out_unmap;
3143
3144         for (bus = 0; bus < 256; bus++) {
3145                 ret = copy_context_table(iommu, &old_rt[bus],
3146                                          ctxt_tbls, bus, ext);
3147                 if (ret) {
3148                         pr_err("%s: Failed to copy context table for bus %d\n",
3149                                 iommu->name, bus);
3150                         continue;
3151                 }
3152         }
3153
3154         spin_lock_irqsave(&iommu->lock, flags);
3155
3156         /* Context tables are copied, now write them to the root_entry table */
3157         for (bus = 0; bus < 256; bus++) {
3158                 int idx = ext ? bus * 2 : bus;
3159                 u64 val;
3160
3161                 if (ctxt_tbls[idx]) {
3162                         val = virt_to_phys(ctxt_tbls[idx]) | 1;
3163                         iommu->root_entry[bus].lo = val;
3164                 }
3165
3166                 if (!ext || !ctxt_tbls[idx + 1])
3167                         continue;
3168
3169                 val = virt_to_phys(ctxt_tbls[idx + 1]) | 1;
3170                 iommu->root_entry[bus].hi = val;
3171         }
3172
3173         spin_unlock_irqrestore(&iommu->lock, flags);
3174
3175         kfree(ctxt_tbls);
3176
3177         __iommu_flush_cache(iommu, iommu->root_entry, PAGE_SIZE);
3178
3179         ret = 0;
3180
3181 out_unmap:
3182         memunmap(old_rt);
3183
3184         return ret;
3185 }
3186
3187 static int __init init_dmars(void)
3188 {
3189         struct dmar_drhd_unit *drhd;
3190         struct intel_iommu *iommu;
3191         int ret;
3192
3193         /*
3194          * for each drhd
3195          *    allocate root
3196          *    initialize and program root entry to not present
3197          * endfor
3198          */
3199         for_each_drhd_unit(drhd) {
3200                 /*
3201                  * lock not needed as this is only incremented in the single
3202                  * threaded kernel __init code path all other access are read
3203                  * only
3204                  */
3205                 if (g_num_of_iommus < DMAR_UNITS_SUPPORTED) {
3206                         g_num_of_iommus++;
3207                         continue;
3208                 }
3209                 pr_err_once("Exceeded %d IOMMUs\n", DMAR_UNITS_SUPPORTED);
3210         }
3211
3212         /* Preallocate enough resources for IOMMU hot-addition */
3213         if (g_num_of_iommus < DMAR_UNITS_SUPPORTED)
3214                 g_num_of_iommus = DMAR_UNITS_SUPPORTED;
3215
3216         g_iommus = kcalloc(g_num_of_iommus, sizeof(struct intel_iommu *),
3217                         GFP_KERNEL);
3218         if (!g_iommus) {
3219                 pr_err("Allocating global iommu array failed\n");
3220                 ret = -ENOMEM;
3221                 goto error;
3222         }
3223
3224         for_each_iommu(iommu, drhd) {
3225                 if (drhd->ignored) {
3226                         iommu_disable_translation(iommu);
3227                         continue;
3228                 }
3229
3230                 /*
3231                  * Find the max pasid size of all IOMMU's in the system.
3232                  * We need to ensure the system pasid table is no bigger
3233                  * than the smallest supported.
3234                  */
3235                 if (pasid_supported(iommu)) {
3236                         u32 temp = 2 << ecap_pss(iommu->ecap);
3237
3238                         intel_pasid_max_id = min_t(u32, temp,
3239                                                    intel_pasid_max_id);
3240                 }
3241
3242                 g_iommus[iommu->seq_id] = iommu;
3243
3244                 intel_iommu_init_qi(iommu);
3245
3246                 ret = iommu_init_domains(iommu);
3247                 if (ret)
3248                         goto free_iommu;
3249
3250                 init_translation_status(iommu);
3251
3252                 if (translation_pre_enabled(iommu) && !is_kdump_kernel()) {
3253                         iommu_disable_translation(iommu);
3254                         clear_translation_pre_enabled(iommu);
3255                         pr_warn("Translation was enabled for %s but we are not in kdump mode\n",
3256                                 iommu->name);
3257                 }
3258
3259                 /*
3260                  * TBD:
3261                  * we could share the same root & context tables
3262                  * among all IOMMU's. Need to Split it later.
3263                  */
3264                 ret = iommu_alloc_root_entry(iommu);
3265                 if (ret)
3266                         goto free_iommu;
3267
3268                 if (translation_pre_enabled(iommu)) {
3269                         pr_info("Translation already enabled - trying to copy translation structures\n");
3270
3271                         ret = copy_translation_tables(iommu);
3272                         if (ret) {
3273                                 /*
3274                                  * We found the IOMMU with translation
3275                                  * enabled - but failed to copy over the
3276                                  * old root-entry table. Try to proceed
3277                                  * by disabling translation now and
3278                                  * allocating a clean root-entry table.
3279                                  * This might cause DMAR faults, but
3280                                  * probably the dump will still succeed.
3281                                  */
3282                                 pr_err("Failed to copy translation tables from previous kernel for %s\n",
3283                                        iommu->name);
3284                                 iommu_disable_translation(iommu);
3285                                 clear_translation_pre_enabled(iommu);
3286                         } else {
3287                                 pr_info("Copied translation tables from previous kernel for %s\n",
3288                                         iommu->name);
3289                         }
3290                 }
3291
3292                 if (!ecap_pass_through(iommu->ecap))
3293                         hw_pass_through = 0;
3294 #ifdef CONFIG_INTEL_IOMMU_SVM
3295                 if (pasid_supported(iommu))
3296                         intel_svm_init(iommu);
3297 #endif
3298         }
3299
3300         /*
3301          * Now that qi is enabled on all iommus, set the root entry and flush
3302          * caches. This is required on some Intel X58 chipsets, otherwise the
3303          * flush_context function will loop forever and the boot hangs.
3304          */
3305         for_each_active_iommu(iommu, drhd) {
3306                 iommu_flush_write_buffer(iommu);
3307                 iommu_set_root_entry(iommu);
3308                 iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
3309                 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
3310         }
3311
3312         if (iommu_default_passthrough())
3313                 iommu_identity_mapping |= IDENTMAP_ALL;
3314
3315 #ifdef CONFIG_INTEL_IOMMU_BROKEN_GFX_WA
3316         dmar_map_gfx = 0;
3317 #endif
3318
3319         if (!dmar_map_gfx)
3320                 iommu_identity_mapping |= IDENTMAP_GFX;
3321
3322         check_tylersburg_isoch();
3323
3324         ret = si_domain_init(hw_pass_through);
3325         if (ret)
3326                 goto free_iommu;
3327
3328         /*
3329          * for each drhd
3330          *   enable fault log
3331          *   global invalidate context cache
3332          *   global invalidate iotlb
3333          *   enable translation
3334          */
3335         for_each_iommu(iommu, drhd) {
3336                 if (drhd->ignored) {
3337                         /*
3338                          * we always have to disable PMRs or DMA may fail on
3339                          * this device
3340                          */
3341                         if (force_on)
3342                                 iommu_disable_protect_mem_regions(iommu);
3343                         continue;
3344                 }
3345
3346                 iommu_flush_write_buffer(iommu);
3347
3348 #ifdef CONFIG_INTEL_IOMMU_SVM
3349                 if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) {
3350                         /*
3351                          * Call dmar_alloc_hwirq() with dmar_global_lock held,
3352                          * could cause possible lock race condition.
3353                          */
3354                         up_write(&dmar_global_lock);
3355                         ret = intel_svm_enable_prq(iommu);
3356                         down_write(&dmar_global_lock);
3357                         if (ret)
3358                                 goto free_iommu;
3359                 }
3360 #endif
3361                 ret = dmar_set_interrupt(iommu);
3362                 if (ret)
3363                         goto free_iommu;
3364         }
3365
3366         return 0;
3367
3368 free_iommu:
3369         for_each_active_iommu(iommu, drhd) {
3370                 disable_dmar_iommu(iommu);
3371                 free_dmar_iommu(iommu);
3372         }
3373
3374         kfree(g_iommus);
3375
3376 error:
3377         return ret;
3378 }
3379
3380 /* This takes a number of _MM_ pages, not VTD pages */
3381 static unsigned long intel_alloc_iova(struct device *dev,
3382                                      struct dmar_domain *domain,
3383                                      unsigned long nrpages, uint64_t dma_mask)
3384 {
3385         unsigned long iova_pfn;
3386
3387         /* Restrict dma_mask to the width that the iommu can handle */
3388         dma_mask = min_t(uint64_t, DOMAIN_MAX_ADDR(domain->gaw), dma_mask);
3389         /* Ensure we reserve the whole size-aligned region */
3390         nrpages = __roundup_pow_of_two(nrpages);
3391
3392         if (!dmar_forcedac && dma_mask > DMA_BIT_MASK(32)) {
3393                 /*
3394                  * First try to allocate an io virtual address in
3395                  * DMA_BIT_MASK(32) and if that fails then try allocating
3396                  * from higher range
3397                  */
3398                 iova_pfn = alloc_iova_fast(&domain->iovad, nrpages,
3399                                            IOVA_PFN(DMA_BIT_MASK(32)), false);
3400                 if (iova_pfn)
3401                         return iova_pfn;
3402         }
3403         iova_pfn = alloc_iova_fast(&domain->iovad, nrpages,
3404                                    IOVA_PFN(dma_mask), true);
3405         if (unlikely(!iova_pfn)) {
3406                 dev_err(dev, "Allocating %ld-page iova failed", nrpages);
3407                 return 0;
3408         }
3409
3410         return iova_pfn;
3411 }
3412
3413 static struct dmar_domain *get_private_domain_for_dev(struct device *dev)
3414 {
3415         struct dmar_domain *domain, *tmp;
3416         struct dmar_rmrr_unit *rmrr;
3417         struct device *i_dev;
3418         int i, ret;
3419
3420         /* Device shouldn't be attached by any domains. */
3421         domain = find_domain(dev);
3422         if (domain)
3423                 return NULL;
3424
3425         domain = find_or_alloc_domain(dev, DEFAULT_DOMAIN_ADDRESS_WIDTH);
3426         if (!domain)
3427                 goto out;
3428
3429         /* We have a new domain - setup possible RMRRs for the device */
3430         rcu_read_lock();
3431         for_each_rmrr_units(rmrr) {
3432                 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
3433                                           i, i_dev) {
3434                         if (i_dev != dev)
3435                                 continue;
3436
3437                         ret = domain_prepare_identity_map(dev, domain,
3438                                                           rmrr->base_address,
3439                                                           rmrr->end_address);
3440                         if (ret)
3441                                 dev_err(dev, "Mapping reserved region failed\n");
3442                 }
3443         }
3444         rcu_read_unlock();
3445
3446         tmp = set_domain_for_dev(dev, domain);
3447         if (!tmp || domain != tmp) {
3448                 domain_exit(domain);
3449                 domain = tmp;
3450         }
3451
3452 out:
3453         if (!domain)
3454                 dev_err(dev, "Allocating domain failed\n");
3455         else
3456                 domain->domain.type = IOMMU_DOMAIN_DMA;
3457
3458         return domain;
3459 }
3460
3461 /* Check if the dev needs to go through non-identity map and unmap process.*/
3462 static bool iommu_need_mapping(struct device *dev)
3463 {
3464         int ret;
3465
3466         if (iommu_dummy(dev))
3467                 return false;
3468
3469         ret = identity_mapping(dev);
3470         if (ret) {
3471                 u64 dma_mask = *dev->dma_mask;
3472
3473                 if (dev->coherent_dma_mask && dev->coherent_dma_mask < dma_mask)
3474                         dma_mask = dev->coherent_dma_mask;
3475
3476                 if (dma_mask >= dma_direct_get_required_mask(dev))
3477                         return false;
3478
3479                 /*
3480                  * 32 bit DMA is removed from si_domain and fall back to
3481                  * non-identity mapping.
3482                  */
3483                 dmar_remove_one_dev_info(dev);
3484                 ret = iommu_request_dma_domain_for_dev(dev);
3485                 if (ret) {
3486                         struct iommu_domain *domain;
3487                         struct dmar_domain *dmar_domain;
3488
3489                         domain = iommu_get_domain_for_dev(dev);
3490                         if (domain) {
3491                                 dmar_domain = to_dmar_domain(domain);
3492                                 dmar_domain->flags |= DOMAIN_FLAG_LOSE_CHILDREN;
3493                         }
3494                         dmar_remove_one_dev_info(dev);
3495                         get_private_domain_for_dev(dev);
3496                 }
3497
3498                 dev_info(dev, "32bit DMA uses non-identity mapping\n");
3499         }
3500
3501         return true;
3502 }
3503
3504 static dma_addr_t __intel_map_single(struct device *dev, phys_addr_t paddr,
3505                                      size_t size, int dir, u64 dma_mask)
3506 {
3507         struct dmar_domain *domain;
3508         phys_addr_t start_paddr;
3509         unsigned long iova_pfn;
3510         int prot = 0;
3511         int ret;
3512         struct intel_iommu *iommu;
3513         unsigned long paddr_pfn = paddr >> PAGE_SHIFT;
3514
3515         BUG_ON(dir == DMA_NONE);
3516
3517         domain = deferred_attach_domain(dev);
3518         if (!domain)
3519                 return DMA_MAPPING_ERROR;
3520
3521         iommu = domain_get_iommu(domain);
3522         size = aligned_nrpages(paddr, size);
3523
3524         iova_pfn = intel_alloc_iova(dev, domain, dma_to_mm_pfn(size), dma_mask);
3525         if (!iova_pfn)
3526                 goto error;
3527
3528         /*
3529          * Check if DMAR supports zero-length reads on write only
3530          * mappings..
3531          */
3532         if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
3533                         !cap_zlr(iommu->cap))
3534                 prot |= DMA_PTE_READ;
3535         if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
3536                 prot |= DMA_PTE_WRITE;
3537         /*
3538          * paddr - (paddr + size) might be partial page, we should map the whole
3539          * page.  Note: if two part of one page are separately mapped, we
3540          * might have two guest_addr mapping to the same host paddr, but this
3541          * is not a big problem
3542          */
3543         ret = domain_pfn_mapping(domain, mm_to_dma_pfn(iova_pfn),
3544                                  mm_to_dma_pfn(paddr_pfn), size, prot);
3545         if (ret)
3546                 goto error;
3547
3548         start_paddr = (phys_addr_t)iova_pfn << PAGE_SHIFT;
3549         start_paddr += paddr & ~PAGE_MASK;
3550
3551         trace_map_single(dev, start_paddr, paddr, size << VTD_PAGE_SHIFT);
3552
3553         return start_paddr;
3554
3555 error:
3556         if (iova_pfn)
3557                 free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(size));
3558         dev_err(dev, "Device request: %zx@%llx dir %d --- failed\n",
3559                 size, (unsigned long long)paddr, dir);
3560         return DMA_MAPPING_ERROR;
3561 }
3562
3563 static dma_addr_t intel_map_page(struct device *dev, struct page *page,
3564                                  unsigned long offset, size_t size,
3565                                  enum dma_data_direction dir,
3566                                  unsigned long attrs)
3567 {
3568         if (iommu_need_mapping(dev))
3569                 return __intel_map_single(dev, page_to_phys(page) + offset,
3570                                 size, dir, *dev->dma_mask);
3571         return dma_direct_map_page(dev, page, offset, size, dir, attrs);
3572 }
3573
3574 static dma_addr_t intel_map_resource(struct device *dev, phys_addr_t phys_addr,
3575                                      size_t size, enum dma_data_direction dir,
3576                                      unsigned long attrs)
3577 {
3578         if (iommu_need_mapping(dev))
3579                 return __intel_map_single(dev, phys_addr, size, dir,
3580                                 *dev->dma_mask);
3581         return dma_direct_map_resource(dev, phys_addr, size, dir, attrs);
3582 }
3583
3584 static void intel_unmap(struct device *dev, dma_addr_t dev_addr, size_t size)
3585 {
3586         struct dmar_domain *domain;
3587         unsigned long start_pfn, last_pfn;
3588         unsigned long nrpages;
3589         unsigned long iova_pfn;
3590         struct intel_iommu *iommu;
3591         struct page *freelist;
3592         struct pci_dev *pdev = NULL;
3593
3594         domain = find_domain(dev);
3595         BUG_ON(!domain);
3596
3597         iommu = domain_get_iommu(domain);
3598
3599         iova_pfn = IOVA_PFN(dev_addr);
3600
3601         nrpages = aligned_nrpages(dev_addr, size);
3602         start_pfn = mm_to_dma_pfn(iova_pfn);
3603         last_pfn = start_pfn + nrpages - 1;
3604
3605         if (dev_is_pci(dev))
3606                 pdev = to_pci_dev(dev);
3607
3608         freelist = domain_unmap(domain, start_pfn, last_pfn);
3609         if (intel_iommu_strict || (pdev && pdev->untrusted) ||
3610                         !has_iova_flush_queue(&domain->iovad)) {
3611                 iommu_flush_iotlb_psi(iommu, domain, start_pfn,
3612                                       nrpages, !freelist, 0);
3613                 /* free iova */
3614                 free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(nrpages));
3615                 dma_free_pagelist(freelist);
3616         } else {
3617                 queue_iova(&domain->iovad, iova_pfn, nrpages,
3618                            (unsigned long)freelist);
3619                 /*
3620                  * queue up the release of the unmap to save the 1/6th of the
3621                  * cpu used up by the iotlb flush operation...
3622                  */
3623         }
3624
3625         trace_unmap_single(dev, dev_addr, size);
3626 }
3627
3628 static void intel_unmap_page(struct device *dev, dma_addr_t dev_addr,
3629                              size_t size, enum dma_data_direction dir,
3630                              unsigned long attrs)
3631 {
3632         if (iommu_need_mapping(dev))
3633                 intel_unmap(dev, dev_addr, size);
3634         else
3635                 dma_direct_unmap_page(dev, dev_addr, size, dir, attrs);
3636 }
3637
3638 static void intel_unmap_resource(struct device *dev, dma_addr_t dev_addr,
3639                 size_t size, enum dma_data_direction dir, unsigned long attrs)
3640 {
3641         if (iommu_need_mapping(dev))
3642                 intel_unmap(dev, dev_addr, size);
3643 }
3644
3645 static void *intel_alloc_coherent(struct device *dev, size_t size,
3646                                   dma_addr_t *dma_handle, gfp_t flags,
3647                                   unsigned long attrs)
3648 {
3649         struct page *page = NULL;
3650         int order;
3651
3652         if (!iommu_need_mapping(dev))
3653                 return dma_direct_alloc(dev, size, dma_handle, flags, attrs);
3654
3655         size = PAGE_ALIGN(size);
3656         order = get_order(size);
3657
3658         if (gfpflags_allow_blocking(flags)) {
3659                 unsigned int count = size >> PAGE_SHIFT;
3660
3661                 page = dma_alloc_from_contiguous(dev, count, order,
3662                                                  flags & __GFP_NOWARN);
3663         }
3664
3665         if (!page)
3666                 page = alloc_pages(flags, order);
3667         if (!page)
3668                 return NULL;
3669         memset(page_address(page), 0, size);
3670
3671         *dma_handle = __intel_map_single(dev, page_to_phys(page), size,
3672                                          DMA_BIDIRECTIONAL,
3673                                          dev->coherent_dma_mask);
3674         if (*dma_handle != DMA_MAPPING_ERROR)
3675                 return page_address(page);
3676         if (!dma_release_from_contiguous(dev, page, size >> PAGE_SHIFT))
3677                 __free_pages(page, order);
3678
3679         return NULL;
3680 }
3681
3682 static void intel_free_coherent(struct device *dev, size_t size, void *vaddr,
3683                                 dma_addr_t dma_handle, unsigned long attrs)
3684 {
3685         int order;
3686         struct page *page = virt_to_page(vaddr);
3687
3688         if (!iommu_need_mapping(dev))
3689                 return dma_direct_free(dev, size, vaddr, dma_handle, attrs);
3690
3691         size = PAGE_ALIGN(size);
3692         order = get_order(size);
3693
3694         intel_unmap(dev, dma_handle, size);
3695         if (!dma_release_from_contiguous(dev, page, size >> PAGE_SHIFT))
3696                 __free_pages(page, order);
3697 }
3698
3699 static void intel_unmap_sg(struct device *dev, struct scatterlist *sglist,
3700                            int nelems, enum dma_data_direction dir,
3701                            unsigned long attrs)
3702 {
3703         dma_addr_t startaddr = sg_dma_address(sglist) & PAGE_MASK;
3704         unsigned long nrpages = 0;
3705         struct scatterlist *sg;
3706         int i;
3707
3708         if (!iommu_need_mapping(dev))
3709                 return dma_direct_unmap_sg(dev, sglist, nelems, dir, attrs);
3710
3711         for_each_sg(sglist, sg, nelems, i) {
3712                 nrpages += aligned_nrpages(sg_dma_address(sg), sg_dma_len(sg));
3713         }
3714
3715         intel_unmap(dev, startaddr, nrpages << VTD_PAGE_SHIFT);
3716
3717         trace_unmap_sg(dev, startaddr, nrpages << VTD_PAGE_SHIFT);
3718 }
3719
3720 static int intel_map_sg(struct device *dev, struct scatterlist *sglist, int nelems,
3721                         enum dma_data_direction dir, unsigned long attrs)
3722 {
3723         int i;
3724         struct dmar_domain *domain;
3725         size_t size = 0;
3726         int prot = 0;
3727         unsigned long iova_pfn;
3728         int ret;
3729         struct scatterlist *sg;
3730         unsigned long start_vpfn;
3731         struct intel_iommu *iommu;
3732
3733         BUG_ON(dir == DMA_NONE);
3734         if (!iommu_need_mapping(dev))
3735                 return dma_direct_map_sg(dev, sglist, nelems, dir, attrs);
3736
3737         domain = deferred_attach_domain(dev);
3738         if (!domain)
3739                 return 0;
3740
3741         iommu = domain_get_iommu(domain);
3742
3743         for_each_sg(sglist, sg, nelems, i)
3744                 size += aligned_nrpages(sg->offset, sg->length);
3745
3746         iova_pfn = intel_alloc_iova(dev, domain, dma_to_mm_pfn(size),
3747                                 *dev->dma_mask);
3748         if (!iova_pfn) {
3749                 sglist->dma_length = 0;
3750                 return 0;
3751         }
3752
3753         /*
3754          * Check if DMAR supports zero-length reads on write only
3755          * mappings..
3756          */
3757         if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
3758                         !cap_zlr(iommu->cap))
3759                 prot |= DMA_PTE_READ;
3760         if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
3761                 prot |= DMA_PTE_WRITE;
3762
3763         start_vpfn = mm_to_dma_pfn(iova_pfn);
3764
3765         ret = domain_sg_mapping(domain, start_vpfn, sglist, size, prot);
3766         if (unlikely(ret)) {
3767                 dma_pte_free_pagetable(domain, start_vpfn,
3768                                        start_vpfn + size - 1,
3769                                        agaw_to_level(domain->agaw) + 1);
3770                 free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(size));
3771                 return 0;
3772         }
3773
3774         trace_map_sg(dev, iova_pfn << PAGE_SHIFT,
3775                      sg_phys(sglist), size << VTD_PAGE_SHIFT);
3776
3777         return nelems;
3778 }
3779
3780 static u64 intel_get_required_mask(struct device *dev)
3781 {
3782         if (!iommu_need_mapping(dev))
3783                 return dma_direct_get_required_mask(dev);
3784         return DMA_BIT_MASK(32);
3785 }
3786
3787 static const struct dma_map_ops intel_dma_ops = {
3788         .alloc = intel_alloc_coherent,
3789         .free = intel_free_coherent,
3790         .map_sg = intel_map_sg,
3791         .unmap_sg = intel_unmap_sg,
3792         .map_page = intel_map_page,
3793         .unmap_page = intel_unmap_page,
3794         .map_resource = intel_map_resource,
3795         .unmap_resource = intel_unmap_resource,
3796         .dma_supported = dma_direct_supported,
3797         .mmap = dma_common_mmap,
3798         .get_sgtable = dma_common_get_sgtable,
3799         .get_required_mask = intel_get_required_mask,
3800 };
3801
3802 static void
3803 bounce_sync_single(struct device *dev, dma_addr_t addr, size_t size,
3804                    enum dma_data_direction dir, enum dma_sync_target target)
3805 {
3806         struct dmar_domain *domain;
3807         phys_addr_t tlb_addr;
3808
3809         domain = find_domain(dev);
3810         if (WARN_ON(!domain))
3811                 return;
3812
3813         tlb_addr = intel_iommu_iova_to_phys(&domain->domain, addr);
3814         if (is_swiotlb_buffer(tlb_addr))
3815                 swiotlb_tbl_sync_single(dev, tlb_addr, size, dir, target);
3816 }
3817
3818 static dma_addr_t
3819 bounce_map_single(struct device *dev, phys_addr_t paddr, size_t size,
3820                   enum dma_data_direction dir, unsigned long attrs,
3821                   u64 dma_mask)
3822 {
3823         size_t aligned_size = ALIGN(size, VTD_PAGE_SIZE);
3824         struct dmar_domain *domain;
3825         struct intel_iommu *iommu;
3826         unsigned long iova_pfn;
3827         unsigned long nrpages;
3828         phys_addr_t tlb_addr;
3829         int prot = 0;
3830         int ret;
3831
3832         domain = deferred_attach_domain(dev);
3833         if (WARN_ON(dir == DMA_NONE || !domain))
3834                 return DMA_MAPPING_ERROR;
3835
3836         iommu = domain_get_iommu(domain);
3837         if (WARN_ON(!iommu))
3838                 return DMA_MAPPING_ERROR;
3839
3840         nrpages = aligned_nrpages(0, size);
3841         iova_pfn = intel_alloc_iova(dev, domain,
3842                                     dma_to_mm_pfn(nrpages), dma_mask);
3843         if (!iova_pfn)
3844                 return DMA_MAPPING_ERROR;
3845
3846         /*
3847          * Check if DMAR supports zero-length reads on write only
3848          * mappings..
3849          */
3850         if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL ||
3851                         !cap_zlr(iommu->cap))
3852                 prot |= DMA_PTE_READ;
3853         if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
3854                 prot |= DMA_PTE_WRITE;
3855
3856         /*
3857          * If both the physical buffer start address and size are
3858          * page aligned, we don't need to use a bounce page.
3859          */
3860         if (!IS_ALIGNED(paddr | size, VTD_PAGE_SIZE)) {
3861                 tlb_addr = swiotlb_tbl_map_single(dev,
3862                                 __phys_to_dma(dev, io_tlb_start),
3863                                 paddr, size, aligned_size, dir, attrs);
3864                 if (tlb_addr == DMA_MAPPING_ERROR) {
3865                         goto swiotlb_error;
3866                 } else {
3867                         /* Cleanup the padding area. */
3868                         void *padding_start = phys_to_virt(tlb_addr);
3869                         size_t padding_size = aligned_size;
3870
3871                         if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC) &&
3872                             (dir == DMA_TO_DEVICE ||
3873                              dir == DMA_BIDIRECTIONAL)) {
3874                                 padding_start += size;
3875                                 padding_size -= size;
3876                         }
3877
3878                         memset(padding_start, 0, padding_size);
3879                 }
3880         } else {
3881                 tlb_addr = paddr;
3882         }
3883
3884         ret = domain_pfn_mapping(domain, mm_to_dma_pfn(iova_pfn),
3885                                  tlb_addr >> VTD_PAGE_SHIFT, nrpages, prot);
3886         if (ret)
3887                 goto mapping_error;
3888
3889         trace_bounce_map_single(dev, iova_pfn << PAGE_SHIFT, paddr, size);
3890
3891         return (phys_addr_t)iova_pfn << PAGE_SHIFT;
3892
3893 mapping_error:
3894         if (is_swiotlb_buffer(tlb_addr))
3895                 swiotlb_tbl_unmap_single(dev, tlb_addr, size,
3896                                          aligned_size, dir, attrs);
3897 swiotlb_error:
3898         free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(nrpages));
3899         dev_err(dev, "Device bounce map: %zx@%llx dir %d --- failed\n",
3900                 size, (unsigned long long)paddr, dir);
3901
3902         return DMA_MAPPING_ERROR;
3903 }
3904
3905 static void
3906 bounce_unmap_single(struct device *dev, dma_addr_t dev_addr, size_t size,
3907                     enum dma_data_direction dir, unsigned long attrs)
3908 {
3909         size_t aligned_size = ALIGN(size, VTD_PAGE_SIZE);
3910         struct dmar_domain *domain;
3911         phys_addr_t tlb_addr;
3912
3913         domain = find_domain(dev);
3914         if (WARN_ON(!domain))
3915                 return;
3916
3917         tlb_addr = intel_iommu_iova_to_phys(&domain->domain, dev_addr);
3918         if (WARN_ON(!tlb_addr))
3919                 return;
3920
3921         intel_unmap(dev, dev_addr, size);
3922         if (is_swiotlb_buffer(tlb_addr))
3923                 swiotlb_tbl_unmap_single(dev, tlb_addr, size,
3924                                          aligned_size, dir, attrs);
3925
3926         trace_bounce_unmap_single(dev, dev_addr, size);
3927 }
3928
3929 static dma_addr_t
3930 bounce_map_page(struct device *dev, struct page *page, unsigned long offset,
3931                 size_t size, enum dma_data_direction dir, unsigned long attrs)
3932 {
3933         return bounce_map_single(dev, page_to_phys(page) + offset,
3934                                  size, dir, attrs, *dev->dma_mask);
3935 }
3936
3937 static dma_addr_t
3938 bounce_map_resource(struct device *dev, phys_addr_t phys_addr, size_t size,
3939                     enum dma_data_direction dir, unsigned long attrs)
3940 {
3941         return bounce_map_single(dev, phys_addr, size,
3942                                  dir, attrs, *dev->dma_mask);
3943 }
3944
3945 static void
3946 bounce_unmap_page(struct device *dev, dma_addr_t dev_addr, size_t size,
3947                   enum dma_data_direction dir, unsigned long attrs)
3948 {
3949         bounce_unmap_single(dev, dev_addr, size, dir, attrs);
3950 }
3951
3952 static void
3953 bounce_unmap_resource(struct device *dev, dma_addr_t dev_addr, size_t size,
3954                       enum dma_data_direction dir, unsigned long attrs)
3955 {
3956         bounce_unmap_single(dev, dev_addr, size, dir, attrs);
3957 }
3958
3959 static void
3960 bounce_unmap_sg(struct device *dev, struct scatterlist *sglist, int nelems,
3961                 enum dma_data_direction dir, unsigned long attrs)
3962 {
3963         struct scatterlist *sg;
3964         int i;
3965
3966         for_each_sg(sglist, sg, nelems, i)
3967                 bounce_unmap_page(dev, sg->dma_address,
3968                                   sg_dma_len(sg), dir, attrs);
3969 }
3970
3971 static int
3972 bounce_map_sg(struct device *dev, struct scatterlist *sglist, int nelems,
3973               enum dma_data_direction dir, unsigned long attrs)
3974 {
3975         int i;
3976         struct scatterlist *sg;
3977
3978         for_each_sg(sglist, sg, nelems, i) {
3979                 sg->dma_address = bounce_map_page(dev, sg_page(sg),
3980                                                   sg->offset, sg->length,
3981                                                   dir, attrs);
3982                 if (sg->dma_address == DMA_MAPPING_ERROR)
3983                         goto out_unmap;
3984                 sg_dma_len(sg) = sg->length;
3985         }
3986
3987         return nelems;
3988
3989 out_unmap:
3990         bounce_unmap_sg(dev, sglist, i, dir, attrs | DMA_ATTR_SKIP_CPU_SYNC);
3991         return 0;
3992 }
3993
3994 static void
3995 bounce_sync_single_for_cpu(struct device *dev, dma_addr_t addr,
3996                            size_t size, enum dma_data_direction dir)
3997 {
3998         bounce_sync_single(dev, addr, size, dir, SYNC_FOR_CPU);
3999 }
4000
4001 static void
4002 bounce_sync_single_for_device(struct device *dev, dma_addr_t addr,
4003                               size_t size, enum dma_data_direction dir)
4004 {
4005         bounce_sync_single(dev, addr, size, dir, SYNC_FOR_DEVICE);
4006 }
4007
4008 static void
4009 bounce_sync_sg_for_cpu(struct device *dev, struct scatterlist *sglist,
4010                        int nelems, enum dma_data_direction dir)
4011 {
4012         struct scatterlist *sg;
4013         int i;
4014
4015         for_each_sg(sglist, sg, nelems, i)
4016                 bounce_sync_single(dev, sg_dma_address(sg),
4017                                    sg_dma_len(sg), dir, SYNC_FOR_CPU);
4018 }
4019
4020 static void
4021 bounce_sync_sg_for_device(struct device *dev, struct scatterlist *sglist,
4022                           int nelems, enum dma_data_direction dir)
4023 {
4024         struct scatterlist *sg;
4025         int i;
4026
4027         for_each_sg(sglist, sg, nelems, i)
4028                 bounce_sync_single(dev, sg_dma_address(sg),
4029                                    sg_dma_len(sg), dir, SYNC_FOR_DEVICE);
4030 }
4031
4032 static const struct dma_map_ops bounce_dma_ops = {
4033         .alloc                  = intel_alloc_coherent,
4034         .free                   = intel_free_coherent,
4035         .map_sg                 = bounce_map_sg,
4036         .unmap_sg               = bounce_unmap_sg,
4037         .map_page               = bounce_map_page,
4038         .unmap_page             = bounce_unmap_page,
4039         .sync_single_for_cpu    = bounce_sync_single_for_cpu,
4040         .sync_single_for_device = bounce_sync_single_for_device,
4041         .sync_sg_for_cpu        = bounce_sync_sg_for_cpu,
4042         .sync_sg_for_device     = bounce_sync_sg_for_device,
4043         .map_resource           = bounce_map_resource,
4044         .unmap_resource         = bounce_unmap_resource,
4045         .dma_supported          = dma_direct_supported,
4046 };
4047
4048 static inline int iommu_domain_cache_init(void)
4049 {
4050         int ret = 0;
4051
4052         iommu_domain_cache = kmem_cache_create("iommu_domain",
4053                                          sizeof(struct dmar_domain),
4054                                          0,
4055                                          SLAB_HWCACHE_ALIGN,
4056
4057                                          NULL);
4058         if (!iommu_domain_cache) {
4059                 pr_err("Couldn't create iommu_domain cache\n");
4060                 ret = -ENOMEM;
4061         }
4062
4063         return ret;
4064 }
4065
4066 static inline int iommu_devinfo_cache_init(void)
4067 {
4068         int ret = 0;
4069
4070         iommu_devinfo_cache = kmem_cache_create("iommu_devinfo",
4071                                          sizeof(struct device_domain_info),
4072                                          0,
4073                                          SLAB_HWCACHE_ALIGN,
4074                                          NULL);
4075         if (!iommu_devinfo_cache) {
4076                 pr_err("Couldn't create devinfo cache\n");
4077                 ret = -ENOMEM;
4078         }
4079
4080         return ret;
4081 }
4082
4083 static int __init iommu_init_mempool(void)
4084 {
4085         int ret;
4086         ret = iova_cache_get();
4087         if (ret)
4088                 return ret;
4089
4090         ret = iommu_domain_cache_init();
4091         if (ret)
4092                 goto domain_error;
4093
4094         ret = iommu_devinfo_cache_init();
4095         if (!ret)
4096                 return ret;
4097
4098         kmem_cache_destroy(iommu_domain_cache);
4099 domain_error:
4100         iova_cache_put();
4101
4102         return -ENOMEM;
4103 }
4104
4105 static void __init iommu_exit_mempool(void)
4106 {
4107         kmem_cache_destroy(iommu_devinfo_cache);
4108         kmem_cache_destroy(iommu_domain_cache);
4109         iova_cache_put();
4110 }
4111
4112 static void quirk_ioat_snb_local_iommu(struct pci_dev *pdev)
4113 {
4114         struct dmar_drhd_unit *drhd;
4115         u32 vtbar;
4116         int rc;
4117
4118         /* We know that this device on this chipset has its own IOMMU.
4119          * If we find it under a different IOMMU, then the BIOS is lying
4120          * to us. Hope that the IOMMU for this device is actually
4121          * disabled, and it needs no translation...
4122          */
4123         rc = pci_bus_read_config_dword(pdev->bus, PCI_DEVFN(0, 0), 0xb0, &vtbar);
4124         if (rc) {
4125                 /* "can't" happen */
4126                 dev_info(&pdev->dev, "failed to run vt-d quirk\n");
4127                 return;
4128         }
4129         vtbar &= 0xffff0000;
4130
4131         /* we know that the this iommu should be at offset 0xa000 from vtbar */
4132         drhd = dmar_find_matched_drhd_unit(pdev);
4133         if (WARN_TAINT_ONCE(!drhd || drhd->reg_base_addr - vtbar != 0xa000,
4134                             TAINT_FIRMWARE_WORKAROUND,
4135                             "BIOS assigned incorrect VT-d unit for Intel(R) QuickData Technology device\n"))
4136                 pdev->dev.archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
4137 }
4138 DECLARE_PCI_FIXUP_ENABLE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_IOAT_SNB, quirk_ioat_snb_local_iommu);
4139
4140 static void __init init_no_remapping_devices(void)
4141 {
4142         struct dmar_drhd_unit *drhd;
4143         struct device *dev;
4144         int i;
4145
4146         for_each_drhd_unit(drhd) {
4147                 if (!drhd->include_all) {
4148                         for_each_active_dev_scope(drhd->devices,
4149                                                   drhd->devices_cnt, i, dev)
4150                                 break;
4151                         /* ignore DMAR unit if no devices exist */
4152                         if (i == drhd->devices_cnt)
4153                                 drhd->ignored = 1;
4154                 }
4155         }
4156
4157         for_each_active_drhd_unit(drhd) {
4158                 if (drhd->include_all)
4159                         continue;
4160
4161                 for_each_active_dev_scope(drhd->devices,
4162                                           drhd->devices_cnt, i, dev)
4163                         if (!dev_is_pci(dev) || !IS_GFX_DEVICE(to_pci_dev(dev)))
4164                                 break;
4165                 if (i < drhd->devices_cnt)
4166                         continue;
4167
4168                 /* This IOMMU has *only* gfx devices. Either bypass it or
4169                    set the gfx_mapped flag, as appropriate */
4170                 if (!dmar_map_gfx) {
4171                         drhd->ignored = 1;
4172                         for_each_active_dev_scope(drhd->devices,
4173                                                   drhd->devices_cnt, i, dev)
4174                                 dev->archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
4175                 }
4176         }
4177 }
4178
4179 #ifdef CONFIG_SUSPEND
4180 static int init_iommu_hw(void)
4181 {
4182         struct dmar_drhd_unit *drhd;
4183         struct intel_iommu *iommu = NULL;
4184
4185         for_each_active_iommu(iommu, drhd)
4186                 if (iommu->qi)
4187                         dmar_reenable_qi(iommu);
4188
4189         for_each_iommu(iommu, drhd) {
4190                 if (drhd->ignored) {
4191                         /*
4192                          * we always have to disable PMRs or DMA may fail on
4193                          * this device
4194                          */
4195                         if (force_on)
4196                                 iommu_disable_protect_mem_regions(iommu);
4197                         continue;
4198                 }
4199
4200                 iommu_flush_write_buffer(iommu);
4201
4202                 iommu_set_root_entry(iommu);
4203
4204                 iommu->flush.flush_context(iommu, 0, 0, 0,
4205                                            DMA_CCMD_GLOBAL_INVL);
4206                 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
4207                 iommu_enable_translation(iommu);
4208                 iommu_disable_protect_mem_regions(iommu);
4209         }
4210
4211         return 0;
4212 }
4213
4214 static void iommu_flush_all(void)
4215 {
4216         struct dmar_drhd_unit *drhd;
4217         struct intel_iommu *iommu;
4218
4219         for_each_active_iommu(iommu, drhd) {
4220                 iommu->flush.flush_context(iommu, 0, 0, 0,
4221                                            DMA_CCMD_GLOBAL_INVL);
4222                 iommu->flush.flush_iotlb(iommu, 0, 0, 0,
4223                                          DMA_TLB_GLOBAL_FLUSH);
4224         }
4225 }
4226
4227 static int iommu_suspend(void)
4228 {
4229         struct dmar_drhd_unit *drhd;
4230         struct intel_iommu *iommu = NULL;
4231         unsigned long flag;
4232
4233         for_each_active_iommu(iommu, drhd) {
4234                 iommu->iommu_state = kcalloc(MAX_SR_DMAR_REGS, sizeof(u32),
4235                                                  GFP_ATOMIC);
4236                 if (!iommu->iommu_state)
4237                         goto nomem;
4238         }
4239
4240         iommu_flush_all();
4241
4242         for_each_active_iommu(iommu, drhd) {
4243                 iommu_disable_translation(iommu);
4244
4245                 raw_spin_lock_irqsave(&iommu->register_lock, flag);
4246
4247                 iommu->iommu_state[SR_DMAR_FECTL_REG] =
4248                         readl(iommu->reg + DMAR_FECTL_REG);
4249                 iommu->iommu_state[SR_DMAR_FEDATA_REG] =
4250                         readl(iommu->reg + DMAR_FEDATA_REG);
4251                 iommu->iommu_state[SR_DMAR_FEADDR_REG] =
4252                         readl(iommu->reg + DMAR_FEADDR_REG);
4253                 iommu->iommu_state[SR_DMAR_FEUADDR_REG] =
4254                         readl(iommu->reg + DMAR_FEUADDR_REG);
4255
4256                 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
4257         }
4258         return 0;
4259
4260 nomem:
4261         for_each_active_iommu(iommu, drhd)
4262                 kfree(iommu->iommu_state);
4263
4264         return -ENOMEM;
4265 }
4266
4267 static void iommu_resume(void)
4268 {
4269         struct dmar_drhd_unit *drhd;
4270         struct intel_iommu *iommu = NULL;
4271         unsigned long flag;
4272
4273         if (init_iommu_hw()) {
4274                 if (force_on)
4275                         panic("tboot: IOMMU setup failed, DMAR can not resume!\n");
4276                 else
4277                         WARN(1, "IOMMU setup failed, DMAR can not resume!\n");
4278                 return;
4279         }
4280
4281         for_each_active_iommu(iommu, drhd) {
4282
4283                 raw_spin_lock_irqsave(&iommu->register_lock, flag);
4284
4285                 writel(iommu->iommu_state[SR_DMAR_FECTL_REG],
4286                         iommu->reg + DMAR_FECTL_REG);
4287                 writel(iommu->iommu_state[SR_DMAR_FEDATA_REG],
4288                         iommu->reg + DMAR_FEDATA_REG);
4289                 writel(iommu->iommu_state[SR_DMAR_FEADDR_REG],
4290                         iommu->reg + DMAR_FEADDR_REG);
4291                 writel(iommu->iommu_state[SR_DMAR_FEUADDR_REG],
4292                         iommu->reg + DMAR_FEUADDR_REG);
4293
4294                 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
4295         }
4296
4297         for_each_active_iommu(iommu, drhd)
4298                 kfree(iommu->iommu_state);
4299 }
4300
4301 static struct syscore_ops iommu_syscore_ops = {
4302         .resume         = iommu_resume,
4303         .suspend        = iommu_suspend,
4304 };
4305
4306 static void __init init_iommu_pm_ops(void)
4307 {
4308         register_syscore_ops(&iommu_syscore_ops);
4309 }
4310
4311 #else
4312 static inline void init_iommu_pm_ops(void) {}
4313 #endif  /* CONFIG_PM */
4314
4315 int __init dmar_parse_one_rmrr(struct acpi_dmar_header *header, void *arg)
4316 {
4317         struct acpi_dmar_reserved_memory *rmrr;
4318         struct dmar_rmrr_unit *rmrru;
4319         int ret;
4320
4321         rmrr = (struct acpi_dmar_reserved_memory *)header;
4322         ret = arch_rmrr_sanity_check(rmrr);
4323         if (ret)
4324                 return ret;
4325
4326         rmrru = kzalloc(sizeof(*rmrru), GFP_KERNEL);
4327         if (!rmrru)
4328                 goto out;
4329
4330         rmrru->hdr = header;
4331
4332         rmrru->base_address = rmrr->base_address;
4333         rmrru->end_address = rmrr->end_address;
4334
4335         rmrru->devices = dmar_alloc_dev_scope((void *)(rmrr + 1),
4336                                 ((void *)rmrr) + rmrr->header.length,
4337                                 &rmrru->devices_cnt);
4338         if (rmrru->devices_cnt && rmrru->devices == NULL)
4339                 goto free_rmrru;
4340
4341         list_add(&rmrru->list, &dmar_rmrr_units);
4342
4343         return 0;
4344 free_rmrru:
4345         kfree(rmrru);
4346 out:
4347         return -ENOMEM;
4348 }
4349
4350 static struct dmar_atsr_unit *dmar_find_atsr(struct acpi_dmar_atsr *atsr)
4351 {
4352         struct dmar_atsr_unit *atsru;
4353         struct acpi_dmar_atsr *tmp;
4354
4355         list_for_each_entry_rcu(atsru, &dmar_atsr_units, list) {
4356                 tmp = (struct acpi_dmar_atsr *)atsru->hdr;
4357                 if (atsr->segment != tmp->segment)
4358                         continue;
4359                 if (atsr->header.length != tmp->header.length)
4360                         continue;
4361                 if (memcmp(atsr, tmp, atsr->header.length) == 0)
4362                         return atsru;
4363         }
4364
4365         return NULL;
4366 }
4367
4368 int dmar_parse_one_atsr(struct acpi_dmar_header *hdr, void *arg)
4369 {
4370         struct acpi_dmar_atsr *atsr;
4371         struct dmar_atsr_unit *atsru;
4372
4373         if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled)
4374                 return 0;
4375
4376         atsr = container_of(hdr, struct acpi_dmar_atsr, header);
4377         atsru = dmar_find_atsr(atsr);
4378         if (atsru)
4379                 return 0;
4380
4381         atsru = kzalloc(sizeof(*atsru) + hdr->length, GFP_KERNEL);
4382         if (!atsru)
4383                 return -ENOMEM;
4384
4385         /*
4386          * If memory is allocated from slab by ACPI _DSM method, we need to
4387          * copy the memory content because the memory buffer will be freed
4388          * on return.
4389          */
4390         atsru->hdr = (void *)(atsru + 1);
4391         memcpy(atsru->hdr, hdr, hdr->length);
4392         atsru->include_all = atsr->flags & 0x1;
4393         if (!atsru->include_all) {
4394                 atsru->devices = dmar_alloc_dev_scope((void *)(atsr + 1),
4395                                 (void *)atsr + atsr->header.length,
4396                                 &atsru->devices_cnt);
4397                 if (atsru->devices_cnt && atsru->devices == NULL) {
4398                         kfree(atsru);
4399                         return -ENOMEM;
4400                 }
4401         }
4402
4403         list_add_rcu(&atsru->list, &dmar_atsr_units);
4404
4405         return 0;
4406 }
4407
4408 static void intel_iommu_free_atsr(struct dmar_atsr_unit *atsru)
4409 {
4410         dmar_free_dev_scope(&atsru->devices, &atsru->devices_cnt);
4411         kfree(atsru);
4412 }
4413
4414 int dmar_release_one_atsr(struct acpi_dmar_header *hdr, void *arg)
4415 {
4416         struct acpi_dmar_atsr *atsr;
4417         struct dmar_atsr_unit *atsru;
4418
4419         atsr = container_of(hdr, struct acpi_dmar_atsr, header);
4420         atsru = dmar_find_atsr(atsr);
4421         if (atsru) {
4422                 list_del_rcu(&atsru->list);
4423                 synchronize_rcu();
4424                 intel_iommu_free_atsr(atsru);
4425         }
4426
4427         return 0;
4428 }
4429
4430 int dmar_check_one_atsr(struct acpi_dmar_header *hdr, void *arg)
4431 {
4432         int i;
4433         struct device *dev;
4434         struct acpi_dmar_atsr *atsr;
4435         struct dmar_atsr_unit *atsru;
4436
4437         atsr = container_of(hdr, struct acpi_dmar_atsr, header);
4438         atsru = dmar_find_atsr(atsr);
4439         if (!atsru)
4440                 return 0;
4441
4442         if (!atsru->include_all && atsru->devices && atsru->devices_cnt) {
4443                 for_each_active_dev_scope(atsru->devices, atsru->devices_cnt,
4444                                           i, dev)
4445                         return -EBUSY;
4446         }
4447
4448         return 0;
4449 }
4450
4451 static int intel_iommu_add(struct dmar_drhd_unit *dmaru)
4452 {
4453         int sp, ret;
4454         struct intel_iommu *iommu = dmaru->iommu;
4455
4456         if (g_iommus[iommu->seq_id])
4457                 return 0;
4458
4459         if (hw_pass_through && !ecap_pass_through(iommu->ecap)) {
4460                 pr_warn("%s: Doesn't support hardware pass through.\n",
4461                         iommu->name);
4462                 return -ENXIO;
4463         }
4464         if (!ecap_sc_support(iommu->ecap) &&
4465             domain_update_iommu_snooping(iommu)) {
4466                 pr_warn("%s: Doesn't support snooping.\n",
4467                         iommu->name);
4468                 return -ENXIO;
4469         }
4470         sp = domain_update_iommu_superpage(iommu) - 1;
4471         if (sp >= 0 && !(cap_super_page_val(iommu->cap) & (1 << sp))) {
4472                 pr_warn("%s: Doesn't support large page.\n",
4473                         iommu->name);
4474                 return -ENXIO;
4475         }
4476
4477         /*
4478          * Disable translation if already enabled prior to OS handover.
4479          */
4480         if (iommu->gcmd & DMA_GCMD_TE)
4481                 iommu_disable_translation(iommu);
4482
4483         g_iommus[iommu->seq_id] = iommu;
4484         ret = iommu_init_domains(iommu);
4485         if (ret == 0)
4486                 ret = iommu_alloc_root_entry(iommu);
4487         if (ret)
4488                 goto out;
4489
4490 #ifdef CONFIG_INTEL_IOMMU_SVM
4491         if (pasid_supported(iommu))
4492                 intel_svm_init(iommu);
4493 #endif
4494
4495         if (dmaru->ignored) {
4496                 /*
4497                  * we always have to disable PMRs or DMA may fail on this device
4498                  */
4499                 if (force_on)
4500                         iommu_disable_protect_mem_regions(iommu);
4501                 return 0;
4502         }
4503
4504         intel_iommu_init_qi(iommu);
4505         iommu_flush_write_buffer(iommu);
4506
4507 #ifdef CONFIG_INTEL_IOMMU_SVM
4508         if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) {
4509                 ret = intel_svm_enable_prq(iommu);
4510                 if (ret)
4511                         goto disable_iommu;
4512         }
4513 #endif
4514         ret = dmar_set_interrupt(iommu);
4515         if (ret)
4516                 goto disable_iommu;
4517
4518         iommu_set_root_entry(iommu);
4519         iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
4520         iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
4521         iommu_enable_translation(iommu);
4522
4523         iommu_disable_protect_mem_regions(iommu);
4524         return 0;
4525
4526 disable_iommu:
4527         disable_dmar_iommu(iommu);
4528 out:
4529         free_dmar_iommu(iommu);
4530         return ret;
4531 }
4532
4533 int dmar_iommu_hotplug(struct dmar_drhd_unit *dmaru, bool insert)
4534 {
4535         int ret = 0;
4536         struct intel_iommu *iommu = dmaru->iommu;
4537
4538         if (!intel_iommu_enabled)
4539                 return 0;
4540         if (iommu == NULL)
4541                 return -EINVAL;
4542
4543         if (insert) {
4544                 ret = intel_iommu_add(dmaru);
4545         } else {
4546                 disable_dmar_iommu(iommu);
4547                 free_dmar_iommu(iommu);
4548         }
4549
4550         return ret;
4551 }
4552
4553 static void intel_iommu_free_dmars(void)
4554 {
4555         struct dmar_rmrr_unit *rmrru, *rmrr_n;
4556         struct dmar_atsr_unit *atsru, *atsr_n;
4557
4558         list_for_each_entry_safe(rmrru, rmrr_n, &dmar_rmrr_units, list) {
4559                 list_del(&rmrru->list);
4560                 dmar_free_dev_scope(&rmrru->devices, &rmrru->devices_cnt);
4561                 kfree(rmrru);
4562         }
4563
4564         list_for_each_entry_safe(atsru, atsr_n, &dmar_atsr_units, list) {
4565                 list_del(&atsru->list);
4566                 intel_iommu_free_atsr(atsru);
4567         }
4568 }
4569
4570 int dmar_find_matched_atsr_unit(struct pci_dev *dev)
4571 {
4572         int i, ret = 1;
4573         struct pci_bus *bus;
4574         struct pci_dev *bridge = NULL;
4575         struct device *tmp;
4576         struct acpi_dmar_atsr *atsr;
4577         struct dmar_atsr_unit *atsru;
4578
4579         dev = pci_physfn(dev);
4580         for (bus = dev->bus; bus; bus = bus->parent) {
4581                 bridge = bus->self;
4582                 /* If it's an integrated device, allow ATS */
4583                 if (!bridge)
4584                         return 1;
4585                 /* Connected via non-PCIe: no ATS */
4586                 if (!pci_is_pcie(bridge) ||
4587                     pci_pcie_type(bridge) == PCI_EXP_TYPE_PCI_BRIDGE)
4588                         return 0;
4589                 /* If we found the root port, look it up in the ATSR */
4590                 if (pci_pcie_type(bridge) == PCI_EXP_TYPE_ROOT_PORT)
4591                         break;
4592         }
4593
4594         rcu_read_lock();
4595         list_for_each_entry_rcu(atsru, &dmar_atsr_units, list) {
4596                 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
4597                 if (atsr->segment != pci_domain_nr(dev->bus))
4598                         continue;
4599
4600                 for_each_dev_scope(atsru->devices, atsru->devices_cnt, i, tmp)
4601                         if (tmp == &bridge->dev)
4602                                 goto out;
4603
4604                 if (atsru->include_all)
4605                         goto out;
4606         }
4607         ret = 0;
4608 out:
4609         rcu_read_unlock();
4610
4611         return ret;
4612 }
4613
4614 int dmar_iommu_notify_scope_dev(struct dmar_pci_notify_info *info)
4615 {
4616         int ret;
4617         struct dmar_rmrr_unit *rmrru;
4618         struct dmar_atsr_unit *atsru;
4619         struct acpi_dmar_atsr *atsr;
4620         struct acpi_dmar_reserved_memory *rmrr;
4621
4622         if (!intel_iommu_enabled && system_state >= SYSTEM_RUNNING)
4623                 return 0;
4624
4625         list_for_each_entry(rmrru, &dmar_rmrr_units, list) {
4626                 rmrr = container_of(rmrru->hdr,
4627                                     struct acpi_dmar_reserved_memory, header);
4628                 if (info->event == BUS_NOTIFY_ADD_DEVICE) {
4629                         ret = dmar_insert_dev_scope(info, (void *)(rmrr + 1),
4630                                 ((void *)rmrr) + rmrr->header.length,
4631                                 rmrr->segment, rmrru->devices,
4632                                 rmrru->devices_cnt);
4633                         if (ret < 0)
4634                                 return ret;
4635                 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
4636                         dmar_remove_dev_scope(info, rmrr->segment,
4637                                 rmrru->devices, rmrru->devices_cnt);
4638                 }
4639         }
4640
4641         list_for_each_entry(atsru, &dmar_atsr_units, list) {
4642                 if (atsru->include_all)
4643                         continue;
4644
4645                 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
4646                 if (info->event == BUS_NOTIFY_ADD_DEVICE) {
4647                         ret = dmar_insert_dev_scope(info, (void *)(atsr + 1),
4648                                         (void *)atsr + atsr->header.length,
4649                                         atsr->segment, atsru->devices,
4650                                         atsru->devices_cnt);
4651                         if (ret > 0)
4652                                 break;
4653                         else if (ret < 0)
4654                                 return ret;
4655                 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
4656                         if (dmar_remove_dev_scope(info, atsr->segment,
4657                                         atsru->devices, atsru->devices_cnt))
4658                                 break;
4659                 }
4660         }
4661
4662         return 0;
4663 }
4664
4665 static int intel_iommu_memory_notifier(struct notifier_block *nb,
4666                                        unsigned long val, void *v)
4667 {
4668         struct memory_notify *mhp = v;
4669         unsigned long long start, end;
4670         unsigned long start_vpfn, last_vpfn;
4671
4672         switch (val) {
4673         case MEM_GOING_ONLINE:
4674                 start = mhp->start_pfn << PAGE_SHIFT;
4675                 end = ((mhp->start_pfn + mhp->nr_pages) << PAGE_SHIFT) - 1;
4676                 if (iommu_domain_identity_map(si_domain, start, end)) {
4677                         pr_warn("Failed to build identity map for [%llx-%llx]\n",
4678                                 start, end);
4679                         return NOTIFY_BAD;
4680                 }
4681                 break;
4682
4683         case MEM_OFFLINE:
4684         case MEM_CANCEL_ONLINE:
4685                 start_vpfn = mm_to_dma_pfn(mhp->start_pfn);
4686                 last_vpfn = mm_to_dma_pfn(mhp->start_pfn + mhp->nr_pages - 1);
4687                 while (start_vpfn <= last_vpfn) {
4688                         struct iova *iova;
4689                         struct dmar_drhd_unit *drhd;
4690                         struct intel_iommu *iommu;
4691                         struct page *freelist;
4692
4693                         iova = find_iova(&si_domain->iovad, start_vpfn);
4694                         if (iova == NULL) {
4695                                 pr_debug("Failed get IOVA for PFN %lx\n",
4696                                          start_vpfn);
4697                                 break;
4698                         }
4699
4700                         iova = split_and_remove_iova(&si_domain->iovad, iova,
4701                                                      start_vpfn, last_vpfn);
4702                         if (iova == NULL) {
4703                                 pr_warn("Failed to split IOVA PFN [%lx-%lx]\n",
4704                                         start_vpfn, last_vpfn);
4705                                 return NOTIFY_BAD;
4706                         }
4707
4708                         freelist = domain_unmap(si_domain, iova->pfn_lo,
4709                                                iova->pfn_hi);
4710
4711                         rcu_read_lock();
4712                         for_each_active_iommu(iommu, drhd)
4713                                 iommu_flush_iotlb_psi(iommu, si_domain,
4714                                         iova->pfn_lo, iova_size(iova),
4715                                         !freelist, 0);
4716                         rcu_read_unlock();
4717                         dma_free_pagelist(freelist);
4718
4719                         start_vpfn = iova->pfn_hi + 1;
4720                         free_iova_mem(iova);
4721                 }
4722                 break;
4723         }
4724
4725         return NOTIFY_OK;
4726 }
4727
4728 static struct notifier_block intel_iommu_memory_nb = {
4729         .notifier_call = intel_iommu_memory_notifier,
4730         .priority = 0
4731 };
4732
4733 static void free_all_cpu_cached_iovas(unsigned int cpu)
4734 {
4735         int i;
4736
4737         for (i = 0; i < g_num_of_iommus; i++) {
4738                 struct intel_iommu *iommu = g_iommus[i];
4739                 struct dmar_domain *domain;
4740                 int did;
4741
4742                 if (!iommu)
4743                         continue;
4744
4745                 for (did = 0; did < cap_ndoms(iommu->cap); did++) {
4746                         domain = get_iommu_domain(iommu, (u16)did);
4747
4748                         if (!domain)
4749                                 continue;
4750                         free_cpu_cached_iovas(cpu, &domain->iovad);
4751                 }
4752         }
4753 }
4754
4755 static int intel_iommu_cpu_dead(unsigned int cpu)
4756 {
4757         free_all_cpu_cached_iovas(cpu);
4758         return 0;
4759 }
4760
4761 static void intel_disable_iommus(void)
4762 {
4763         struct intel_iommu *iommu = NULL;
4764         struct dmar_drhd_unit *drhd;
4765
4766         for_each_iommu(iommu, drhd)
4767                 iommu_disable_translation(iommu);
4768 }
4769
4770 void intel_iommu_shutdown(void)
4771 {
4772         struct dmar_drhd_unit *drhd;
4773         struct intel_iommu *iommu = NULL;
4774
4775         if (no_iommu || dmar_disabled)
4776                 return;
4777
4778         down_write(&dmar_global_lock);
4779
4780         /* Disable PMRs explicitly here. */
4781         for_each_iommu(iommu, drhd)
4782                 iommu_disable_protect_mem_regions(iommu);
4783
4784         /* Make sure the IOMMUs are switched off */
4785         intel_disable_iommus();
4786
4787         up_write(&dmar_global_lock);
4788 }
4789
4790 static inline struct intel_iommu *dev_to_intel_iommu(struct device *dev)
4791 {
4792         struct iommu_device *iommu_dev = dev_to_iommu_device(dev);
4793
4794         return container_of(iommu_dev, struct intel_iommu, iommu);
4795 }
4796
4797 static ssize_t intel_iommu_show_version(struct device *dev,
4798                                         struct device_attribute *attr,
4799                                         char *buf)
4800 {
4801         struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4802         u32 ver = readl(iommu->reg + DMAR_VER_REG);
4803         return sprintf(buf, "%d:%d\n",
4804                        DMAR_VER_MAJOR(ver), DMAR_VER_MINOR(ver));
4805 }
4806 static DEVICE_ATTR(version, S_IRUGO, intel_iommu_show_version, NULL);
4807
4808 static ssize_t intel_iommu_show_address(struct device *dev,
4809                                         struct device_attribute *attr,
4810                                         char *buf)
4811 {
4812         struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4813         return sprintf(buf, "%llx\n", iommu->reg_phys);
4814 }
4815 static DEVICE_ATTR(address, S_IRUGO, intel_iommu_show_address, NULL);
4816
4817 static ssize_t intel_iommu_show_cap(struct device *dev,
4818                                     struct device_attribute *attr,
4819                                     char *buf)
4820 {
4821         struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4822         return sprintf(buf, "%llx\n", iommu->cap);
4823 }
4824 static DEVICE_ATTR(cap, S_IRUGO, intel_iommu_show_cap, NULL);
4825
4826 static ssize_t intel_iommu_show_ecap(struct device *dev,
4827                                     struct device_attribute *attr,
4828                                     char *buf)
4829 {
4830         struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4831         return sprintf(buf, "%llx\n", iommu->ecap);
4832 }
4833 static DEVICE_ATTR(ecap, S_IRUGO, intel_iommu_show_ecap, NULL);
4834
4835 static ssize_t intel_iommu_show_ndoms(struct device *dev,
4836                                       struct device_attribute *attr,
4837                                       char *buf)
4838 {
4839         struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4840         return sprintf(buf, "%ld\n", cap_ndoms(iommu->cap));
4841 }
4842 static DEVICE_ATTR(domains_supported, S_IRUGO, intel_iommu_show_ndoms, NULL);
4843
4844 static ssize_t intel_iommu_show_ndoms_used(struct device *dev,
4845                                            struct device_attribute *attr,
4846                                            char *buf)
4847 {
4848         struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4849         return sprintf(buf, "%d\n", bitmap_weight(iommu->domain_ids,
4850                                                   cap_ndoms(iommu->cap)));
4851 }
4852 static DEVICE_ATTR(domains_used, S_IRUGO, intel_iommu_show_ndoms_used, NULL);
4853
4854 static struct attribute *intel_iommu_attrs[] = {
4855         &dev_attr_version.attr,
4856         &dev_attr_address.attr,
4857         &dev_attr_cap.attr,
4858         &dev_attr_ecap.attr,
4859         &dev_attr_domains_supported.attr,
4860         &dev_attr_domains_used.attr,
4861         NULL,
4862 };
4863
4864 static struct attribute_group intel_iommu_group = {
4865         .name = "intel-iommu",
4866         .attrs = intel_iommu_attrs,
4867 };
4868
4869 const struct attribute_group *intel_iommu_groups[] = {
4870         &intel_iommu_group,
4871         NULL,
4872 };
4873
4874 static inline bool has_untrusted_dev(void)
4875 {
4876         struct pci_dev *pdev = NULL;
4877
4878         for_each_pci_dev(pdev)
4879                 if (pdev->untrusted)
4880                         return true;
4881
4882         return false;
4883 }
4884
4885 static int __init platform_optin_force_iommu(void)
4886 {
4887         if (!dmar_platform_optin() || no_platform_optin || !has_untrusted_dev())
4888                 return 0;
4889
4890         if (no_iommu || dmar_disabled)
4891                 pr_info("Intel-IOMMU force enabled due to platform opt in\n");
4892
4893         /*
4894          * If Intel-IOMMU is disabled by default, we will apply identity
4895          * map for all devices except those marked as being untrusted.
4896          */
4897         if (dmar_disabled)
4898                 iommu_identity_mapping |= IDENTMAP_ALL;
4899
4900         dmar_disabled = 0;
4901         no_iommu = 0;
4902
4903         return 1;
4904 }
4905
4906 static int __init probe_acpi_namespace_devices(void)
4907 {
4908         struct dmar_drhd_unit *drhd;
4909         /* To avoid a -Wunused-but-set-variable warning. */
4910         struct intel_iommu *iommu __maybe_unused;
4911         struct device *dev;
4912         int i, ret = 0;
4913
4914         for_each_active_iommu(iommu, drhd) {
4915                 for_each_active_dev_scope(drhd->devices,
4916                                           drhd->devices_cnt, i, dev) {
4917                         struct acpi_device_physical_node *pn;
4918                         struct iommu_group *group;
4919                         struct acpi_device *adev;
4920
4921                         if (dev->bus != &acpi_bus_type)
4922                                 continue;
4923
4924                         adev = to_acpi_device(dev);
4925                         mutex_lock(&adev->physical_node_lock);
4926                         list_for_each_entry(pn,
4927                                             &adev->physical_node_list, node) {
4928                                 group = iommu_group_get(pn->dev);
4929                                 if (group) {
4930                                         iommu_group_put(group);
4931                                         continue;
4932                                 }
4933
4934                                 pn->dev->bus->iommu_ops = &intel_iommu_ops;
4935                                 ret = iommu_probe_device(pn->dev);
4936                                 if (ret)
4937                                         break;
4938                         }
4939                         mutex_unlock(&adev->physical_node_lock);
4940
4941                         if (ret)
4942                                 return ret;
4943                 }
4944         }
4945
4946         return 0;
4947 }
4948
4949 int __init intel_iommu_init(void)
4950 {
4951         int ret = -ENODEV;
4952         struct dmar_drhd_unit *drhd;
4953         struct intel_iommu *iommu;
4954
4955         /*
4956          * Intel IOMMU is required for a TXT/tboot launch or platform
4957          * opt in, so enforce that.
4958          */
4959         force_on = tboot_force_iommu() || platform_optin_force_iommu();
4960
4961         if (iommu_init_mempool()) {
4962                 if (force_on)
4963                         panic("tboot: Failed to initialize iommu memory\n");
4964                 return -ENOMEM;
4965         }
4966
4967         down_write(&dmar_global_lock);
4968         if (dmar_table_init()) {
4969                 if (force_on)
4970                         panic("tboot: Failed to initialize DMAR table\n");
4971                 goto out_free_dmar;
4972         }
4973
4974         if (dmar_dev_scope_init() < 0) {
4975                 if (force_on)
4976                         panic("tboot: Failed to initialize DMAR device scope\n");
4977                 goto out_free_dmar;
4978         }
4979
4980         up_write(&dmar_global_lock);
4981
4982         /*
4983          * The bus notifier takes the dmar_global_lock, so lockdep will
4984          * complain later when we register it under the lock.
4985          */
4986         dmar_register_bus_notifier();
4987
4988         down_write(&dmar_global_lock);
4989
4990         if (no_iommu || dmar_disabled) {
4991                 /*
4992                  * We exit the function here to ensure IOMMU's remapping and
4993                  * mempool aren't setup, which means that the IOMMU's PMRs
4994                  * won't be disabled via the call to init_dmars(). So disable
4995                  * it explicitly here. The PMRs were setup by tboot prior to
4996                  * calling SENTER, but the kernel is expected to reset/tear
4997                  * down the PMRs.
4998                  */
4999                 if (intel_iommu_tboot_noforce) {
5000                         for_each_iommu(iommu, drhd)
5001                                 iommu_disable_protect_mem_regions(iommu);
5002                 }
5003
5004                 /*
5005                  * Make sure the IOMMUs are switched off, even when we
5006                  * boot into a kexec kernel and the previous kernel left
5007                  * them enabled
5008                  */
5009                 intel_disable_iommus();
5010                 goto out_free_dmar;
5011         }
5012
5013         if (list_empty(&dmar_rmrr_units))
5014                 pr_info("No RMRR found\n");
5015
5016         if (list_empty(&dmar_atsr_units))
5017                 pr_info("No ATSR found\n");
5018
5019         if (dmar_init_reserved_ranges()) {
5020                 if (force_on)
5021                         panic("tboot: Failed to reserve iommu ranges\n");
5022                 goto out_free_reserved_range;
5023         }
5024
5025         if (dmar_map_gfx)
5026                 intel_iommu_gfx_mapped = 1;
5027
5028         init_no_remapping_devices();
5029
5030         ret = init_dmars();
5031         if (ret) {
5032                 if (force_on)
5033                         panic("tboot: Failed to initialize DMARs\n");
5034                 pr_err("Initialization failed\n");
5035                 goto out_free_reserved_range;
5036         }
5037         up_write(&dmar_global_lock);
5038
5039 #if defined(CONFIG_X86) && defined(CONFIG_SWIOTLB)
5040         /*
5041          * If the system has no untrusted device or the user has decided
5042          * to disable the bounce page mechanisms, we don't need swiotlb.
5043          * Mark this and the pre-allocated bounce pages will be released
5044          * later.
5045          */
5046         if (!has_untrusted_dev() || intel_no_bounce)
5047                 swiotlb = 0;
5048 #endif
5049         dma_ops = &intel_dma_ops;
5050
5051         init_iommu_pm_ops();
5052
5053         for_each_active_iommu(iommu, drhd) {
5054                 iommu_device_sysfs_add(&iommu->iommu, NULL,
5055                                        intel_iommu_groups,
5056                                        "%s", iommu->name);
5057                 iommu_device_set_ops(&iommu->iommu, &intel_iommu_ops);
5058                 iommu_device_register(&iommu->iommu);
5059         }
5060
5061         bus_set_iommu(&pci_bus_type, &intel_iommu_ops);
5062         if (si_domain && !hw_pass_through)
5063                 register_memory_notifier(&intel_iommu_memory_nb);
5064         cpuhp_setup_state(CPUHP_IOMMU_INTEL_DEAD, "iommu/intel:dead", NULL,
5065                           intel_iommu_cpu_dead);
5066
5067         down_read(&dmar_global_lock);
5068         if (probe_acpi_namespace_devices())
5069                 pr_warn("ACPI name space devices didn't probe correctly\n");
5070         up_read(&dmar_global_lock);
5071
5072         /* Finally, we enable the DMA remapping hardware. */
5073         for_each_iommu(iommu, drhd) {
5074                 if (!drhd->ignored && !translation_pre_enabled(iommu))
5075                         iommu_enable_translation(iommu);
5076
5077                 iommu_disable_protect_mem_regions(iommu);
5078         }
5079         pr_info("Intel(R) Virtualization Technology for Directed I/O\n");
5080
5081         intel_iommu_enabled = 1;
5082         intel_iommu_debugfs_init();
5083
5084         return 0;
5085
5086 out_free_reserved_range:
5087         put_iova_domain(&reserved_iova_list);
5088 out_free_dmar:
5089         intel_iommu_free_dmars();
5090         up_write(&dmar_global_lock);
5091         iommu_exit_mempool();
5092         return ret;
5093 }
5094
5095 static int domain_context_clear_one_cb(struct pci_dev *pdev, u16 alias, void *opaque)
5096 {
5097         struct intel_iommu *iommu = opaque;
5098
5099         domain_context_clear_one(iommu, PCI_BUS_NUM(alias), alias & 0xff);
5100         return 0;
5101 }
5102
5103 /*
5104  * NB - intel-iommu lacks any sort of reference counting for the users of
5105  * dependent devices.  If multiple endpoints have intersecting dependent
5106  * devices, unbinding the driver from any one of them will possibly leave
5107  * the others unable to operate.
5108  */
5109 static void domain_context_clear(struct intel_iommu *iommu, struct device *dev)
5110 {
5111         if (!iommu || !dev || !dev_is_pci(dev))
5112                 return;
5113
5114         pci_for_each_dma_alias(to_pci_dev(dev), &domain_context_clear_one_cb, iommu);
5115 }
5116
5117 static void __dmar_remove_one_dev_info(struct device_domain_info *info)
5118 {
5119         struct dmar_domain *domain;
5120         struct intel_iommu *iommu;
5121         unsigned long flags;
5122
5123         assert_spin_locked(&device_domain_lock);
5124
5125         if (WARN_ON(!info))
5126                 return;
5127
5128         iommu = info->iommu;
5129         domain = info->domain;
5130
5131         if (info->dev) {
5132                 if (dev_is_pci(info->dev) && sm_supported(iommu))
5133                         intel_pasid_tear_down_entry(iommu, info->dev,
5134                                         PASID_RID2PASID);
5135
5136                 iommu_disable_dev_iotlb(info);
5137                 domain_context_clear(iommu, info->dev);
5138                 intel_pasid_free_table(info->dev);
5139         }
5140
5141         unlink_domain_info(info);
5142
5143         spin_lock_irqsave(&iommu->lock, flags);
5144         domain_detach_iommu(domain, iommu);
5145         spin_unlock_irqrestore(&iommu->lock, flags);
5146
5147         /* free the private domain */
5148         if (domain->flags & DOMAIN_FLAG_LOSE_CHILDREN &&
5149             !(domain->flags & DOMAIN_FLAG_STATIC_IDENTITY) &&
5150             list_empty(&domain->devices))
5151                 domain_exit(info->domain);
5152
5153         free_devinfo_mem(info);
5154 }
5155
5156 static void dmar_remove_one_dev_info(struct device *dev)
5157 {
5158         struct device_domain_info *info;
5159         unsigned long flags;
5160
5161         spin_lock_irqsave(&device_domain_lock, flags);
5162         info = dev->archdata.iommu;
5163         if (info && info != DEFER_DEVICE_DOMAIN_INFO
5164             && info != DUMMY_DEVICE_DOMAIN_INFO)
5165                 __dmar_remove_one_dev_info(info);
5166         spin_unlock_irqrestore(&device_domain_lock, flags);
5167 }
5168
5169 static int md_domain_init(struct dmar_domain *domain, int guest_width)
5170 {
5171         int adjust_width;
5172
5173         init_iova_domain(&domain->iovad, VTD_PAGE_SIZE, IOVA_START_PFN);
5174         domain_reserve_special_ranges(domain);
5175
5176         /* calculate AGAW */
5177         domain->gaw = guest_width;
5178         adjust_width = guestwidth_to_adjustwidth(guest_width);
5179         domain->agaw = width_to_agaw(adjust_width);
5180
5181         domain->iommu_coherency = 0;
5182         domain->iommu_snooping = 0;
5183         domain->iommu_superpage = 0;
5184         domain->max_addr = 0;
5185
5186         /* always allocate the top pgd */
5187         domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
5188         if (!domain->pgd)
5189                 return -ENOMEM;
5190         domain_flush_cache(domain, domain->pgd, PAGE_SIZE);
5191         return 0;
5192 }
5193
5194 static struct iommu_domain *intel_iommu_domain_alloc(unsigned type)
5195 {
5196         struct dmar_domain *dmar_domain;
5197         struct iommu_domain *domain;
5198
5199         switch (type) {
5200         case IOMMU_DOMAIN_DMA:
5201         /* fallthrough */
5202         case IOMMU_DOMAIN_UNMANAGED:
5203                 dmar_domain = alloc_domain(0);
5204                 if (!dmar_domain) {
5205                         pr_err("Can't allocate dmar_domain\n");
5206                         return NULL;
5207                 }
5208                 if (md_domain_init(dmar_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
5209                         pr_err("Domain initialization failed\n");
5210                         domain_exit(dmar_domain);
5211                         return NULL;
5212                 }
5213
5214                 if (type == IOMMU_DOMAIN_DMA &&
5215                     init_iova_flush_queue(&dmar_domain->iovad,
5216                                           iommu_flush_iova, iova_entry_free)) {
5217                         pr_warn("iova flush queue initialization failed\n");
5218                         intel_iommu_strict = 1;
5219                 }
5220
5221                 domain_update_iommu_cap(dmar_domain);
5222
5223                 domain = &dmar_domain->domain;
5224                 domain->geometry.aperture_start = 0;
5225                 domain->geometry.aperture_end   =
5226                                 __DOMAIN_MAX_ADDR(dmar_domain->gaw);
5227                 domain->geometry.force_aperture = true;
5228
5229                 return domain;
5230         case IOMMU_DOMAIN_IDENTITY:
5231                 return &si_domain->domain;
5232         default:
5233                 return NULL;
5234         }
5235
5236         return NULL;
5237 }
5238
5239 static void intel_iommu_domain_free(struct iommu_domain *domain)
5240 {
5241         if (domain != &si_domain->domain)
5242                 domain_exit(to_dmar_domain(domain));
5243 }
5244
5245 /*
5246  * Check whether a @domain could be attached to the @dev through the
5247  * aux-domain attach/detach APIs.
5248  */
5249 static inline bool
5250 is_aux_domain(struct device *dev, struct iommu_domain *domain)
5251 {
5252         struct device_domain_info *info = dev->archdata.iommu;
5253
5254         return info && info->auxd_enabled &&
5255                         domain->type == IOMMU_DOMAIN_UNMANAGED;
5256 }
5257
5258 static void auxiliary_link_device(struct dmar_domain *domain,
5259                                   struct device *dev)
5260 {
5261         struct device_domain_info *info = dev->archdata.iommu;
5262
5263         assert_spin_locked(&device_domain_lock);
5264         if (WARN_ON(!info))
5265                 return;
5266
5267         domain->auxd_refcnt++;
5268         list_add(&domain->auxd, &info->auxiliary_domains);
5269 }
5270
5271 static void auxiliary_unlink_device(struct dmar_domain *domain,
5272                                     struct device *dev)
5273 {
5274         struct device_domain_info *info = dev->archdata.iommu;
5275
5276         assert_spin_locked(&device_domain_lock);
5277         if (WARN_ON(!info))
5278                 return;
5279
5280         list_del(&domain->auxd);
5281         domain->auxd_refcnt--;
5282
5283         if (!domain->auxd_refcnt && domain->default_pasid > 0)
5284                 intel_pasid_free_id(domain->default_pasid);
5285 }
5286
5287 static int aux_domain_add_dev(struct dmar_domain *domain,
5288                               struct device *dev)
5289 {
5290         int ret;
5291         u8 bus, devfn;
5292         unsigned long flags;
5293         struct intel_iommu *iommu;
5294
5295         iommu = device_to_iommu(dev, &bus, &devfn);
5296         if (!iommu)
5297                 return -ENODEV;
5298
5299         if (domain->default_pasid <= 0) {
5300                 int pasid;
5301
5302                 pasid = intel_pasid_alloc_id(domain, PASID_MIN,
5303                                              pci_max_pasids(to_pci_dev(dev)),
5304                                              GFP_KERNEL);
5305                 if (pasid <= 0) {
5306                         pr_err("Can't allocate default pasid\n");
5307                         return -ENODEV;
5308                 }
5309                 domain->default_pasid = pasid;
5310         }
5311
5312         spin_lock_irqsave(&device_domain_lock, flags);
5313         /*
5314          * iommu->lock must be held to attach domain to iommu and setup the
5315          * pasid entry for second level translation.
5316          */
5317         spin_lock(&iommu->lock);
5318         ret = domain_attach_iommu(domain, iommu);
5319         if (ret)
5320                 goto attach_failed;
5321
5322         /* Setup the PASID entry for mediated devices: */
5323         ret = intel_pasid_setup_second_level(iommu, domain, dev,
5324                                              domain->default_pasid);
5325         if (ret)
5326                 goto table_failed;
5327         spin_unlock(&iommu->lock);
5328
5329         auxiliary_link_device(domain, dev);
5330
5331         spin_unlock_irqrestore(&device_domain_lock, flags);
5332
5333         return 0;
5334
5335 table_failed:
5336         domain_detach_iommu(domain, iommu);
5337 attach_failed:
5338         spin_unlock(&iommu->lock);
5339         spin_unlock_irqrestore(&device_domain_lock, flags);
5340         if (!domain->auxd_refcnt && domain->default_pasid > 0)
5341                 intel_pasid_free_id(domain->default_pasid);
5342
5343         return ret;
5344 }
5345
5346 static void aux_domain_remove_dev(struct dmar_domain *domain,
5347                                   struct device *dev)
5348 {
5349         struct device_domain_info *info;
5350         struct intel_iommu *iommu;
5351         unsigned long flags;
5352
5353         if (!is_aux_domain(dev, &domain->domain))
5354                 return;
5355
5356         spin_lock_irqsave(&device_domain_lock, flags);
5357         info = dev->archdata.iommu;
5358         iommu = info->iommu;
5359
5360         auxiliary_unlink_device(domain, dev);
5361
5362         spin_lock(&iommu->lock);
5363         intel_pasid_tear_down_entry(iommu, dev, domain->default_pasid);
5364         domain_detach_iommu(domain, iommu);
5365         spin_unlock(&iommu->lock);
5366
5367         spin_unlock_irqrestore(&device_domain_lock, flags);
5368 }
5369
5370 static int prepare_domain_attach_device(struct iommu_domain *domain,
5371                                         struct device *dev)
5372 {
5373         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5374         struct intel_iommu *iommu;
5375         int addr_width;
5376         u8 bus, devfn;
5377
5378         iommu = device_to_iommu(dev, &bus, &devfn);
5379         if (!iommu)
5380                 return -ENODEV;
5381
5382         /* check if this iommu agaw is sufficient for max mapped address */
5383         addr_width = agaw_to_width(iommu->agaw);
5384         if (addr_width > cap_mgaw(iommu->cap))
5385                 addr_width = cap_mgaw(iommu->cap);
5386
5387         if (dmar_domain->max_addr > (1LL << addr_width)) {
5388                 dev_err(dev, "%s: iommu width (%d) is not "
5389                         "sufficient for the mapped address (%llx)\n",
5390                         __func__, addr_width, dmar_domain->max_addr);
5391                 return -EFAULT;
5392         }
5393         dmar_domain->gaw = addr_width;
5394
5395         /*
5396          * Knock out extra levels of page tables if necessary
5397          */
5398         while (iommu->agaw < dmar_domain->agaw) {
5399                 struct dma_pte *pte;
5400
5401                 pte = dmar_domain->pgd;
5402                 if (dma_pte_present(pte)) {
5403                         dmar_domain->pgd = (struct dma_pte *)
5404                                 phys_to_virt(dma_pte_addr(pte));
5405                         free_pgtable_page(pte);
5406                 }
5407                 dmar_domain->agaw--;
5408         }
5409
5410         return 0;
5411 }
5412
5413 static int intel_iommu_attach_device(struct iommu_domain *domain,
5414                                      struct device *dev)
5415 {
5416         int ret;
5417
5418         if (domain->type == IOMMU_DOMAIN_UNMANAGED &&
5419             device_is_rmrr_locked(dev)) {
5420                 dev_warn(dev, "Device is ineligible for IOMMU domain attach due to platform RMRR requirement.  Contact your platform vendor.\n");
5421                 return -EPERM;
5422         }
5423
5424         if (is_aux_domain(dev, domain))
5425                 return -EPERM;
5426
5427         /* normally dev is not mapped */
5428         if (unlikely(domain_context_mapped(dev))) {
5429                 struct dmar_domain *old_domain;
5430
5431                 old_domain = find_domain(dev);
5432                 if (old_domain)
5433                         dmar_remove_one_dev_info(dev);
5434         }
5435
5436         ret = prepare_domain_attach_device(domain, dev);
5437         if (ret)
5438                 return ret;
5439
5440         return domain_add_dev_info(to_dmar_domain(domain), dev);
5441 }
5442
5443 static int intel_iommu_aux_attach_device(struct iommu_domain *domain,
5444                                          struct device *dev)
5445 {
5446         int ret;
5447
5448         if (!is_aux_domain(dev, domain))
5449                 return -EPERM;
5450
5451         ret = prepare_domain_attach_device(domain, dev);
5452         if (ret)
5453                 return ret;
5454
5455         return aux_domain_add_dev(to_dmar_domain(domain), dev);
5456 }
5457
5458 static void intel_iommu_detach_device(struct iommu_domain *domain,
5459                                       struct device *dev)
5460 {
5461         dmar_remove_one_dev_info(dev);
5462 }
5463
5464 static void intel_iommu_aux_detach_device(struct iommu_domain *domain,
5465                                           struct device *dev)
5466 {
5467         aux_domain_remove_dev(to_dmar_domain(domain), dev);
5468 }
5469
5470 static int intel_iommu_map(struct iommu_domain *domain,
5471                            unsigned long iova, phys_addr_t hpa,
5472                            size_t size, int iommu_prot, gfp_t gfp)
5473 {
5474         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5475         u64 max_addr;
5476         int prot = 0;
5477         int ret;
5478
5479         if (iommu_prot & IOMMU_READ)
5480                 prot |= DMA_PTE_READ;
5481         if (iommu_prot & IOMMU_WRITE)
5482                 prot |= DMA_PTE_WRITE;
5483         if ((iommu_prot & IOMMU_CACHE) && dmar_domain->iommu_snooping)
5484                 prot |= DMA_PTE_SNP;
5485
5486         max_addr = iova + size;
5487         if (dmar_domain->max_addr < max_addr) {
5488                 u64 end;
5489
5490                 /* check if minimum agaw is sufficient for mapped address */
5491                 end = __DOMAIN_MAX_ADDR(dmar_domain->gaw) + 1;
5492                 if (end < max_addr) {
5493                         pr_err("%s: iommu width (%d) is not "
5494                                "sufficient for the mapped address (%llx)\n",
5495                                __func__, dmar_domain->gaw, max_addr);
5496                         return -EFAULT;
5497                 }
5498                 dmar_domain->max_addr = max_addr;
5499         }
5500         /* Round up size to next multiple of PAGE_SIZE, if it and
5501            the low bits of hpa would take us onto the next page */
5502         size = aligned_nrpages(hpa, size);
5503         ret = domain_pfn_mapping(dmar_domain, iova >> VTD_PAGE_SHIFT,
5504                                  hpa >> VTD_PAGE_SHIFT, size, prot);
5505         return ret;
5506 }
5507
5508 static size_t intel_iommu_unmap(struct iommu_domain *domain,
5509                                 unsigned long iova, size_t size,
5510                                 struct iommu_iotlb_gather *gather)
5511 {
5512         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5513         struct page *freelist = NULL;
5514         unsigned long start_pfn, last_pfn;
5515         unsigned int npages;
5516         int iommu_id, level = 0;
5517
5518         /* Cope with horrid API which requires us to unmap more than the
5519            size argument if it happens to be a large-page mapping. */
5520         BUG_ON(!pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level));
5521
5522         if (size < VTD_PAGE_SIZE << level_to_offset_bits(level))
5523                 size = VTD_PAGE_SIZE << level_to_offset_bits(level);
5524
5525         start_pfn = iova >> VTD_PAGE_SHIFT;
5526         last_pfn = (iova + size - 1) >> VTD_PAGE_SHIFT;
5527
5528         freelist = domain_unmap(dmar_domain, start_pfn, last_pfn);
5529
5530         npages = last_pfn - start_pfn + 1;
5531
5532         for_each_domain_iommu(iommu_id, dmar_domain)
5533                 iommu_flush_iotlb_psi(g_iommus[iommu_id], dmar_domain,
5534                                       start_pfn, npages, !freelist, 0);
5535
5536         dma_free_pagelist(freelist);
5537
5538         if (dmar_domain->max_addr == iova + size)
5539                 dmar_domain->max_addr = iova;
5540
5541         return size;
5542 }
5543
5544 static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
5545                                             dma_addr_t iova)
5546 {
5547         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5548         struct dma_pte *pte;
5549         int level = 0;
5550         u64 phys = 0;
5551
5552         pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level);
5553         if (pte)
5554                 phys = dma_pte_addr(pte);
5555
5556         return phys;
5557 }
5558
5559 static inline bool scalable_mode_support(void)
5560 {
5561         struct dmar_drhd_unit *drhd;
5562         struct intel_iommu *iommu;
5563         bool ret = true;
5564
5565         rcu_read_lock();
5566         for_each_active_iommu(iommu, drhd) {
5567                 if (!sm_supported(iommu)) {
5568                         ret = false;
5569                         break;
5570                 }
5571         }
5572         rcu_read_unlock();
5573
5574         return ret;
5575 }
5576
5577 static inline bool iommu_pasid_support(void)
5578 {
5579         struct dmar_drhd_unit *drhd;
5580         struct intel_iommu *iommu;
5581         bool ret = true;
5582
5583         rcu_read_lock();
5584         for_each_active_iommu(iommu, drhd) {
5585                 if (!pasid_supported(iommu)) {
5586                         ret = false;
5587                         break;
5588                 }
5589         }
5590         rcu_read_unlock();
5591
5592         return ret;
5593 }
5594
5595 static bool intel_iommu_capable(enum iommu_cap cap)
5596 {
5597         if (cap == IOMMU_CAP_CACHE_COHERENCY)
5598                 return domain_update_iommu_snooping(NULL) == 1;
5599         if (cap == IOMMU_CAP_INTR_REMAP)
5600                 return irq_remapping_enabled == 1;
5601
5602         return false;
5603 }
5604
5605 static int intel_iommu_add_device(struct device *dev)
5606 {
5607         struct dmar_domain *dmar_domain;
5608         struct iommu_domain *domain;
5609         struct intel_iommu *iommu;
5610         struct iommu_group *group;
5611         u8 bus, devfn;
5612         int ret;
5613
5614         iommu = device_to_iommu(dev, &bus, &devfn);
5615         if (!iommu)
5616                 return -ENODEV;
5617
5618         iommu_device_link(&iommu->iommu, dev);
5619
5620         if (translation_pre_enabled(iommu))
5621                 dev->archdata.iommu = DEFER_DEVICE_DOMAIN_INFO;
5622
5623         group = iommu_group_get_for_dev(dev);
5624
5625         if (IS_ERR(group)) {
5626                 ret = PTR_ERR(group);
5627                 goto unlink;
5628         }
5629
5630         iommu_group_put(group);
5631
5632         domain = iommu_get_domain_for_dev(dev);
5633         dmar_domain = to_dmar_domain(domain);
5634         if (domain->type == IOMMU_DOMAIN_DMA) {
5635                 if (device_def_domain_type(dev) == IOMMU_DOMAIN_IDENTITY) {
5636                         ret = iommu_request_dm_for_dev(dev);
5637                         if (ret) {
5638                                 dmar_remove_one_dev_info(dev);
5639                                 dmar_domain->flags |= DOMAIN_FLAG_LOSE_CHILDREN;
5640                                 domain_add_dev_info(si_domain, dev);
5641                                 dev_info(dev,
5642                                          "Device uses a private identity domain.\n");
5643                         }
5644                 }
5645         } else {
5646                 if (device_def_domain_type(dev) == IOMMU_DOMAIN_DMA) {
5647                         ret = iommu_request_dma_domain_for_dev(dev);
5648                         if (ret) {
5649                                 dmar_remove_one_dev_info(dev);
5650                                 dmar_domain->flags |= DOMAIN_FLAG_LOSE_CHILDREN;
5651                                 if (!get_private_domain_for_dev(dev)) {
5652                                         dev_warn(dev,
5653                                                  "Failed to get a private domain.\n");
5654                                         ret = -ENOMEM;
5655                                         goto unlink;
5656                                 }
5657
5658                                 dev_info(dev,
5659                                          "Device uses a private dma domain.\n");
5660                         }
5661                 }
5662         }
5663
5664         if (device_needs_bounce(dev)) {
5665                 dev_info(dev, "Use Intel IOMMU bounce page dma_ops\n");
5666                 set_dma_ops(dev, &bounce_dma_ops);
5667         }
5668
5669         return 0;
5670
5671 unlink:
5672         iommu_device_unlink(&iommu->iommu, dev);
5673         return ret;
5674 }
5675
5676 static void intel_iommu_remove_device(struct device *dev)
5677 {
5678         struct intel_iommu *iommu;
5679         u8 bus, devfn;
5680
5681         iommu = device_to_iommu(dev, &bus, &devfn);
5682         if (!iommu)
5683                 return;
5684
5685         dmar_remove_one_dev_info(dev);
5686
5687         iommu_group_remove_device(dev);
5688
5689         iommu_device_unlink(&iommu->iommu, dev);
5690
5691         if (device_needs_bounce(dev))
5692                 set_dma_ops(dev, NULL);
5693 }
5694
5695 static void intel_iommu_get_resv_regions(struct device *device,
5696                                          struct list_head *head)
5697 {
5698         int prot = DMA_PTE_READ | DMA_PTE_WRITE;
5699         struct iommu_resv_region *reg;
5700         struct dmar_rmrr_unit *rmrr;
5701         struct device *i_dev;
5702         int i;
5703
5704         down_read(&dmar_global_lock);
5705         for_each_rmrr_units(rmrr) {
5706                 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
5707                                           i, i_dev) {
5708                         struct iommu_resv_region *resv;
5709                         enum iommu_resv_type type;
5710                         size_t length;
5711
5712                         if (i_dev != device &&
5713                             !is_downstream_to_pci_bridge(device, i_dev))
5714                                 continue;
5715
5716                         length = rmrr->end_address - rmrr->base_address + 1;
5717
5718                         type = device_rmrr_is_relaxable(device) ?
5719                                 IOMMU_RESV_DIRECT_RELAXABLE : IOMMU_RESV_DIRECT;
5720
5721                         resv = iommu_alloc_resv_region(rmrr->base_address,
5722                                                        length, prot, type);
5723                         if (!resv)
5724                                 break;
5725
5726                         list_add_tail(&resv->list, head);
5727                 }
5728         }
5729         up_read(&dmar_global_lock);
5730
5731 #ifdef CONFIG_INTEL_IOMMU_FLOPPY_WA
5732         if (dev_is_pci(device)) {
5733                 struct pci_dev *pdev = to_pci_dev(device);
5734
5735                 if ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA) {
5736                         reg = iommu_alloc_resv_region(0, 1UL << 24, prot,
5737                                                    IOMMU_RESV_DIRECT_RELAXABLE);
5738                         if (reg)
5739                                 list_add_tail(&reg->list, head);
5740                 }
5741         }
5742 #endif /* CONFIG_INTEL_IOMMU_FLOPPY_WA */
5743
5744         reg = iommu_alloc_resv_region(IOAPIC_RANGE_START,
5745                                       IOAPIC_RANGE_END - IOAPIC_RANGE_START + 1,
5746                                       0, IOMMU_RESV_MSI);
5747         if (!reg)
5748                 return;
5749         list_add_tail(&reg->list, head);
5750 }
5751
5752 static void intel_iommu_put_resv_regions(struct device *dev,
5753                                          struct list_head *head)
5754 {
5755         struct iommu_resv_region *entry, *next;
5756
5757         list_for_each_entry_safe(entry, next, head, list)
5758                 kfree(entry);
5759 }
5760
5761 int intel_iommu_enable_pasid(struct intel_iommu *iommu, struct device *dev)
5762 {
5763         struct device_domain_info *info;
5764         struct context_entry *context;
5765         struct dmar_domain *domain;
5766         unsigned long flags;
5767         u64 ctx_lo;
5768         int ret;
5769
5770         domain = find_domain(dev);
5771         if (!domain)
5772                 return -EINVAL;
5773
5774         spin_lock_irqsave(&device_domain_lock, flags);
5775         spin_lock(&iommu->lock);
5776
5777         ret = -EINVAL;
5778         info = dev->archdata.iommu;
5779         if (!info || !info->pasid_supported)
5780                 goto out;
5781
5782         context = iommu_context_addr(iommu, info->bus, info->devfn, 0);
5783         if (WARN_ON(!context))
5784                 goto out;
5785
5786         ctx_lo = context[0].lo;
5787
5788         if (!(ctx_lo & CONTEXT_PASIDE)) {
5789                 ctx_lo |= CONTEXT_PASIDE;
5790                 context[0].lo = ctx_lo;
5791                 wmb();
5792                 iommu->flush.flush_context(iommu,
5793                                            domain->iommu_did[iommu->seq_id],
5794                                            PCI_DEVID(info->bus, info->devfn),
5795                                            DMA_CCMD_MASK_NOBIT,
5796                                            DMA_CCMD_DEVICE_INVL);
5797         }
5798
5799         /* Enable PASID support in the device, if it wasn't already */
5800         if (!info->pasid_enabled)
5801                 iommu_enable_dev_iotlb(info);
5802
5803         ret = 0;
5804
5805  out:
5806         spin_unlock(&iommu->lock);
5807         spin_unlock_irqrestore(&device_domain_lock, flags);
5808
5809         return ret;
5810 }
5811
5812 static void intel_iommu_apply_resv_region(struct device *dev,
5813                                           struct iommu_domain *domain,
5814                                           struct iommu_resv_region *region)
5815 {
5816         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5817         unsigned long start, end;
5818
5819         start = IOVA_PFN(region->start);
5820         end   = IOVA_PFN(region->start + region->length - 1);
5821
5822         WARN_ON_ONCE(!reserve_iova(&dmar_domain->iovad, start, end));
5823 }
5824
5825 static struct iommu_group *intel_iommu_device_group(struct device *dev)
5826 {
5827         if (dev_is_pci(dev))
5828                 return pci_device_group(dev);
5829         return generic_device_group(dev);
5830 }
5831
5832 #ifdef CONFIG_INTEL_IOMMU_SVM
5833 struct intel_iommu *intel_svm_device_to_iommu(struct device *dev)
5834 {
5835         struct intel_iommu *iommu;
5836         u8 bus, devfn;
5837
5838         if (iommu_dummy(dev)) {
5839                 dev_warn(dev,
5840                          "No IOMMU translation for device; cannot enable SVM\n");
5841                 return NULL;
5842         }
5843
5844         iommu = device_to_iommu(dev, &bus, &devfn);
5845         if ((!iommu)) {
5846                 dev_err(dev, "No IOMMU for device; cannot enable SVM\n");
5847                 return NULL;
5848         }
5849
5850         return iommu;
5851 }
5852 #endif /* CONFIG_INTEL_IOMMU_SVM */
5853
5854 static int intel_iommu_enable_auxd(struct device *dev)
5855 {
5856         struct device_domain_info *info;
5857         struct intel_iommu *iommu;
5858         unsigned long flags;
5859         u8 bus, devfn;
5860         int ret;
5861
5862         iommu = device_to_iommu(dev, &bus, &devfn);
5863         if (!iommu || dmar_disabled)
5864                 return -EINVAL;
5865
5866         if (!sm_supported(iommu) || !pasid_supported(iommu))
5867                 return -EINVAL;
5868
5869         ret = intel_iommu_enable_pasid(iommu, dev);
5870         if (ret)
5871                 return -ENODEV;
5872
5873         spin_lock_irqsave(&device_domain_lock, flags);
5874         info = dev->archdata.iommu;
5875         info->auxd_enabled = 1;
5876         spin_unlock_irqrestore(&device_domain_lock, flags);
5877
5878         return 0;
5879 }
5880
5881 static int intel_iommu_disable_auxd(struct device *dev)
5882 {
5883         struct device_domain_info *info;
5884         unsigned long flags;
5885
5886         spin_lock_irqsave(&device_domain_lock, flags);
5887         info = dev->archdata.iommu;
5888         if (!WARN_ON(!info))
5889                 info->auxd_enabled = 0;
5890         spin_unlock_irqrestore(&device_domain_lock, flags);
5891
5892         return 0;
5893 }
5894
5895 /*
5896  * A PCI express designated vendor specific extended capability is defined
5897  * in the section 3.7 of Intel scalable I/O virtualization technical spec
5898  * for system software and tools to detect endpoint devices supporting the
5899  * Intel scalable IO virtualization without host driver dependency.
5900  *
5901  * Returns the address of the matching extended capability structure within
5902  * the device's PCI configuration space or 0 if the device does not support
5903  * it.
5904  */
5905 static int siov_find_pci_dvsec(struct pci_dev *pdev)
5906 {
5907         int pos;
5908         u16 vendor, id;
5909
5910         pos = pci_find_next_ext_capability(pdev, 0, 0x23);
5911         while (pos) {
5912                 pci_read_config_word(pdev, pos + 4, &vendor);
5913                 pci_read_config_word(pdev, pos + 8, &id);
5914                 if (vendor == PCI_VENDOR_ID_INTEL && id == 5)
5915                         return pos;
5916
5917                 pos = pci_find_next_ext_capability(pdev, pos, 0x23);
5918         }
5919
5920         return 0;
5921 }
5922
5923 static bool
5924 intel_iommu_dev_has_feat(struct device *dev, enum iommu_dev_features feat)
5925 {
5926         if (feat == IOMMU_DEV_FEAT_AUX) {
5927                 int ret;
5928
5929                 if (!dev_is_pci(dev) || dmar_disabled ||
5930                     !scalable_mode_support() || !iommu_pasid_support())
5931                         return false;
5932
5933                 ret = pci_pasid_features(to_pci_dev(dev));
5934                 if (ret < 0)
5935                         return false;
5936
5937                 return !!siov_find_pci_dvsec(to_pci_dev(dev));
5938         }
5939
5940         return false;
5941 }
5942
5943 static int
5944 intel_iommu_dev_enable_feat(struct device *dev, enum iommu_dev_features feat)
5945 {
5946         if (feat == IOMMU_DEV_FEAT_AUX)
5947                 return intel_iommu_enable_auxd(dev);
5948
5949         return -ENODEV;
5950 }
5951
5952 static int
5953 intel_iommu_dev_disable_feat(struct device *dev, enum iommu_dev_features feat)
5954 {
5955         if (feat == IOMMU_DEV_FEAT_AUX)
5956                 return intel_iommu_disable_auxd(dev);
5957
5958         return -ENODEV;
5959 }
5960
5961 static bool
5962 intel_iommu_dev_feat_enabled(struct device *dev, enum iommu_dev_features feat)
5963 {
5964         struct device_domain_info *info = dev->archdata.iommu;
5965
5966         if (feat == IOMMU_DEV_FEAT_AUX)
5967                 return scalable_mode_support() && info && info->auxd_enabled;
5968
5969         return false;
5970 }
5971
5972 static int
5973 intel_iommu_aux_get_pasid(struct iommu_domain *domain, struct device *dev)
5974 {
5975         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5976
5977         return dmar_domain->default_pasid > 0 ?
5978                         dmar_domain->default_pasid : -EINVAL;
5979 }
5980
5981 static bool intel_iommu_is_attach_deferred(struct iommu_domain *domain,
5982                                            struct device *dev)
5983 {
5984         return dev->archdata.iommu == DEFER_DEVICE_DOMAIN_INFO;
5985 }
5986
5987 const struct iommu_ops intel_iommu_ops = {
5988         .capable                = intel_iommu_capable,
5989         .domain_alloc           = intel_iommu_domain_alloc,
5990         .domain_free            = intel_iommu_domain_free,
5991         .attach_dev             = intel_iommu_attach_device,
5992         .detach_dev             = intel_iommu_detach_device,
5993         .aux_attach_dev         = intel_iommu_aux_attach_device,
5994         .aux_detach_dev         = intel_iommu_aux_detach_device,
5995         .aux_get_pasid          = intel_iommu_aux_get_pasid,
5996         .map                    = intel_iommu_map,
5997         .unmap                  = intel_iommu_unmap,
5998         .iova_to_phys           = intel_iommu_iova_to_phys,
5999         .add_device             = intel_iommu_add_device,
6000         .remove_device          = intel_iommu_remove_device,
6001         .get_resv_regions       = intel_iommu_get_resv_regions,
6002         .put_resv_regions       = intel_iommu_put_resv_regions,
6003         .apply_resv_region      = intel_iommu_apply_resv_region,
6004         .device_group           = intel_iommu_device_group,
6005         .dev_has_feat           = intel_iommu_dev_has_feat,
6006         .dev_feat_enabled       = intel_iommu_dev_feat_enabled,
6007         .dev_enable_feat        = intel_iommu_dev_enable_feat,
6008         .dev_disable_feat       = intel_iommu_dev_disable_feat,
6009         .is_attach_deferred     = intel_iommu_is_attach_deferred,
6010         .pgsize_bitmap          = INTEL_IOMMU_PGSIZES,
6011 };
6012
6013 static void quirk_iommu_igfx(struct pci_dev *dev)
6014 {
6015         pci_info(dev, "Disabling IOMMU for graphics on this chipset\n");
6016         dmar_map_gfx = 0;
6017 }
6018
6019 /* G4x/GM45 integrated gfx dmar support is totally busted. */
6020 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_igfx);
6021 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_igfx);
6022 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_igfx);
6023 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_igfx);
6024 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_igfx);
6025 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_igfx);
6026 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_igfx);
6027
6028 /* Broadwell igfx malfunctions with dmar */
6029 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1606, quirk_iommu_igfx);
6030 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160B, quirk_iommu_igfx);
6031 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160E, quirk_iommu_igfx);
6032 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1602, quirk_iommu_igfx);
6033 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160A, quirk_iommu_igfx);
6034 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160D, quirk_iommu_igfx);
6035 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1616, quirk_iommu_igfx);
6036 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161B, quirk_iommu_igfx);
6037 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161E, quirk_iommu_igfx);
6038 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1612, quirk_iommu_igfx);
6039 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161A, quirk_iommu_igfx);
6040 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161D, quirk_iommu_igfx);
6041 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1626, quirk_iommu_igfx);
6042 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162B, quirk_iommu_igfx);
6043 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162E, quirk_iommu_igfx);
6044 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1622, quirk_iommu_igfx);
6045 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162A, quirk_iommu_igfx);
6046 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162D, quirk_iommu_igfx);
6047 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1636, quirk_iommu_igfx);
6048 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163B, quirk_iommu_igfx);
6049 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163E, quirk_iommu_igfx);
6050 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1632, quirk_iommu_igfx);
6051 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163A, quirk_iommu_igfx);
6052 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163D, quirk_iommu_igfx);
6053
6054 static void quirk_iommu_rwbf(struct pci_dev *dev)
6055 {
6056         /*
6057          * Mobile 4 Series Chipset neglects to set RWBF capability,
6058          * but needs it. Same seems to hold for the desktop versions.
6059          */
6060         pci_info(dev, "Forcing write-buffer flush capability\n");
6061         rwbf_quirk = 1;
6062 }
6063
6064 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_rwbf);
6065 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_rwbf);
6066 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_rwbf);
6067 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_rwbf);
6068 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_rwbf);
6069 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_rwbf);
6070 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_rwbf);
6071
6072 #define GGC 0x52
6073 #define GGC_MEMORY_SIZE_MASK    (0xf << 8)
6074 #define GGC_MEMORY_SIZE_NONE    (0x0 << 8)
6075 #define GGC_MEMORY_SIZE_1M      (0x1 << 8)
6076 #define GGC_MEMORY_SIZE_2M      (0x3 << 8)
6077 #define GGC_MEMORY_VT_ENABLED   (0x8 << 8)
6078 #define GGC_MEMORY_SIZE_2M_VT   (0x9 << 8)
6079 #define GGC_MEMORY_SIZE_3M_VT   (0xa << 8)
6080 #define GGC_MEMORY_SIZE_4M_VT   (0xb << 8)
6081
6082 static void quirk_calpella_no_shadow_gtt(struct pci_dev *dev)
6083 {
6084         unsigned short ggc;
6085
6086         if (pci_read_config_word(dev, GGC, &ggc))
6087                 return;
6088
6089         if (!(ggc & GGC_MEMORY_VT_ENABLED)) {
6090                 pci_info(dev, "BIOS has allocated no shadow GTT; disabling IOMMU for graphics\n");
6091                 dmar_map_gfx = 0;
6092         } else if (dmar_map_gfx) {
6093                 /* we have to ensure the gfx device is idle before we flush */
6094                 pci_info(dev, "Disabling batched IOTLB flush on Ironlake\n");
6095                 intel_iommu_strict = 1;
6096        }
6097 }
6098 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0040, quirk_calpella_no_shadow_gtt);
6099 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0044, quirk_calpella_no_shadow_gtt);
6100 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0062, quirk_calpella_no_shadow_gtt);
6101 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x006a, quirk_calpella_no_shadow_gtt);
6102
6103 /* On Tylersburg chipsets, some BIOSes have been known to enable the
6104    ISOCH DMAR unit for the Azalia sound device, but not give it any
6105    TLB entries, which causes it to deadlock. Check for that.  We do
6106    this in a function called from init_dmars(), instead of in a PCI
6107    quirk, because we don't want to print the obnoxious "BIOS broken"
6108    message if VT-d is actually disabled.
6109 */
6110 static void __init check_tylersburg_isoch(void)
6111 {
6112         struct pci_dev *pdev;
6113         uint32_t vtisochctrl;
6114
6115         /* If there's no Azalia in the system anyway, forget it. */
6116         pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x3a3e, NULL);
6117         if (!pdev)
6118                 return;
6119         pci_dev_put(pdev);
6120
6121         /* System Management Registers. Might be hidden, in which case
6122            we can't do the sanity check. But that's OK, because the
6123            known-broken BIOSes _don't_ actually hide it, so far. */
6124         pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x342e, NULL);
6125         if (!pdev)
6126                 return;
6127
6128         if (pci_read_config_dword(pdev, 0x188, &vtisochctrl)) {
6129                 pci_dev_put(pdev);
6130                 return;
6131         }
6132
6133         pci_dev_put(pdev);
6134
6135         /* If Azalia DMA is routed to the non-isoch DMAR unit, fine. */
6136         if (vtisochctrl & 1)
6137                 return;
6138
6139         /* Drop all bits other than the number of TLB entries */
6140         vtisochctrl &= 0x1c;
6141
6142         /* If we have the recommended number of TLB entries (16), fine. */
6143         if (vtisochctrl == 0x10)
6144                 return;
6145
6146         /* Zero TLB entries? You get to ride the short bus to school. */
6147         if (!vtisochctrl) {
6148                 WARN(1, "Your BIOS is broken; DMA routed to ISOCH DMAR unit but no TLB space.\n"
6149                      "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
6150                      dmi_get_system_info(DMI_BIOS_VENDOR),
6151                      dmi_get_system_info(DMI_BIOS_VERSION),
6152                      dmi_get_system_info(DMI_PRODUCT_VERSION));
6153                 iommu_identity_mapping |= IDENTMAP_AZALIA;
6154                 return;
6155         }
6156
6157         pr_warn("Recommended TLB entries for ISOCH unit is 16; your BIOS set %d\n",
6158                vtisochctrl);
6159 }