drivers/pci/intel-iommu.c
1 /*
2  * Copyright (c) 2006, Intel Corporation.
3  *
4  * This program is free software; you can redistribute it and/or modify it
5  * under the terms and conditions of the GNU General Public License,
6  * version 2, as published by the Free Software Foundation.
7  *
8  * This program is distributed in the hope it will be useful, but WITHOUT
9  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
10  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
11  * more details.
12  *
13  * You should have received a copy of the GNU General Public License along with
14  * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
15  * Place - Suite 330, Boston, MA 02111-1307 USA.
16  *
17  * Copyright (C) 2006-2008 Intel Corporation
18  * Author: Ashok Raj <ashok.raj@intel.com>
19  * Author: Shaohua Li <shaohua.li@intel.com>
20  * Author: Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>
21  * Author: Fenghua Yu <fenghua.yu@intel.com>
22  */
23
24 #include <linux/init.h>
25 #include <linux/bitmap.h>
26 #include <linux/debugfs.h>
27 #include <linux/slab.h>
28 #include <linux/irq.h>
29 #include <linux/interrupt.h>
30 #include <linux/spinlock.h>
31 #include <linux/pci.h>
32 #include <linux/dmar.h>
33 #include <linux/dma-mapping.h>
34 #include <linux/mempool.h>
35 #include <linux/timer.h>
36 #include <linux/iova.h>
37 #include <linux/iommu.h>
38 #include <linux/intel-iommu.h>
39 #include <linux/sysdev.h>
40 #include <linux/tboot.h>
41 #include <linux/dmi.h>
42 #include <asm/cacheflush.h>
43 #include <asm/iommu.h>
44 #include "pci.h"
45
46 #define ROOT_SIZE               VTD_PAGE_SIZE
47 #define CONTEXT_SIZE            VTD_PAGE_SIZE
48
49 #define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
50 #define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
51 #define IS_AZALIA(pdev) ((pdev)->vendor == 0x8086 && (pdev)->device == 0x3a3e)
52
53 #define IOAPIC_RANGE_START      (0xfee00000)
54 #define IOAPIC_RANGE_END        (0xfeefffff)
55 #define IOVA_START_ADDR         (0x1000)
56
57 #define DEFAULT_DOMAIN_ADDRESS_WIDTH 48
58
59 #define MAX_AGAW_WIDTH 64
60
61 #define __DOMAIN_MAX_PFN(gaw)  ((((uint64_t)1) << (gaw-VTD_PAGE_SHIFT)) - 1)
62 #define __DOMAIN_MAX_ADDR(gaw) ((((uint64_t)1) << gaw) - 1)
63
64 /* We limit DOMAIN_MAX_PFN to fit in an unsigned long, and DOMAIN_MAX_ADDR
65    to match. That way, we can use 'unsigned long' for PFNs with impunity. */
66 #define DOMAIN_MAX_PFN(gaw)     ((unsigned long) min_t(uint64_t, \
67                                 __DOMAIN_MAX_PFN(gaw), (unsigned long)-1))
68 #define DOMAIN_MAX_ADDR(gaw)    (((uint64_t)__DOMAIN_MAX_PFN(gaw)) << VTD_PAGE_SHIFT)
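/*
 * Worked example: with the default 48-bit GAW, __DOMAIN_MAX_PFN(48) is
 * 2^36 - 1.  On a 32-bit kernel DOMAIN_MAX_PFN() clamps that to ULONG_MAX
 * so that PFN arithmetic stays within 'unsigned long'.
 */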
69
70 #define IOVA_PFN(addr)          ((addr) >> PAGE_SHIFT)
71 #define DMA_32BIT_PFN           IOVA_PFN(DMA_BIT_MASK(32))
72 #define DMA_64BIT_PFN           IOVA_PFN(DMA_BIT_MASK(64))
73
74
75 /* VT-d pages must always be _smaller_ than MM pages. Otherwise things
76    are never going to work. */
77 static inline unsigned long dma_to_mm_pfn(unsigned long dma_pfn)
78 {
79         return dma_pfn >> (PAGE_SHIFT - VTD_PAGE_SHIFT);
80 }
81
82 static inline unsigned long mm_to_dma_pfn(unsigned long mm_pfn)
83 {
84         return mm_pfn << (PAGE_SHIFT - VTD_PAGE_SHIFT);
85 }
86 static inline unsigned long page_to_dma_pfn(struct page *pg)
87 {
88         return mm_to_dma_pfn(page_to_pfn(pg));
89 }
90 static inline unsigned long virt_to_dma_pfn(void *p)
91 {
92         return page_to_dma_pfn(virt_to_page(p));
93 }
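/*
 * Example: with 4KiB pages on both sides (PAGE_SHIFT == VTD_PAGE_SHIFT == 12)
 * the conversions above are the identity.  If PAGE_SHIFT were 16 (64KiB MM
 * pages), one MM pfn would correspond to 16 consecutive DMA pfns, i.e. a
 * shift by 4 in each direction.
 */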
94
95 /* global iommu list, set NULL for ignored DMAR units */
96 static struct intel_iommu **g_iommus;
97
98 static void __init check_tylersburg_isoch(void);
99 static int rwbf_quirk;
100
101 /*
102  * 0: Present
103  * 1-11: Reserved
104  * 12-63: Context Ptr (12 - (haw-1))
105  * 64-127: Reserved
106  */
107 struct root_entry {
108         u64     val;
109         u64     rsvd1;
110 };
111 #define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))
112 static inline bool root_present(struct root_entry *root)
113 {
114         return (root->val & 1);
115 }
116 static inline void set_root_present(struct root_entry *root)
117 {
118         root->val |= 1;
119 }
120 static inline void set_root_value(struct root_entry *root, unsigned long value)
121 {
122         root->val |= value & VTD_PAGE_MASK;
123 }
124
125 static inline struct context_entry *
126 get_context_addr_from_root(struct root_entry *root)
127 {
128         return (struct context_entry *)
129                 (root_present(root)?phys_to_virt(
130                 root->val & VTD_PAGE_MASK) :
131                 NULL);
132 }
133
134 /*
135  * low 64 bits:
136  * 0: present
137  * 1: fault processing disable
138  * 2-3: translation type
139  * 12-63: address space root
140  * high 64 bits:
141  * 0-2: address width
142  * 3-6: aval
143  * 8-23: domain id
144  */
145 struct context_entry {
146         u64 lo;
147         u64 hi;
148 };
149
150 static inline bool context_present(struct context_entry *context)
151 {
152         return (context->lo & 1);
153 }
154 static inline void context_set_present(struct context_entry *context)
155 {
156         context->lo |= 1;
157 }
158
159 static inline void context_set_fault_enable(struct context_entry *context)
160 {
161         context->lo &= (((u64)-1) << 2) | 1;
162 }
163
164 static inline void context_set_translation_type(struct context_entry *context,
165                                                 unsigned long value)
166 {
167         context->lo &= (((u64)-1) << 4) | 3;
168         context->lo |= (value & 3) << 2;
169 }
170
171 static inline void context_set_address_root(struct context_entry *context,
172                                             unsigned long value)
173 {
174         context->lo |= value & VTD_PAGE_MASK;
175 }
176
177 static inline void context_set_address_width(struct context_entry *context,
178                                              unsigned long value)
179 {
180         context->hi |= value & 7;
181 }
182
183 static inline void context_set_domain_id(struct context_entry *context,
184                                          unsigned long value)
185 {
186         context->hi |= (value & ((1 << 16) - 1)) << 8;
187 }
188
189 static inline void context_clear_entry(struct context_entry *context)
190 {
191         context->lo = 0;
192         context->hi = 0;
193 }
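/*
 * Minimal sketch (not compiled into the driver) of how the helpers above
 * compose a context entry, in roughly the order domain_context_mapping_one()
 * uses them later in this file.  'pgd_phys', 'did' and 'agaw' are
 * hypothetical example values.
 */
#if 0
static void context_entry_example(struct context_entry *ce,
                                  unsigned long pgd_phys, u16 did, int agaw)
{
        context_clear_entry(ce);
        context_set_domain_id(ce, did);
        context_set_address_root(ce, pgd_phys);
        context_set_address_width(ce, agaw);
        context_set_translation_type(ce, CONTEXT_TT_MULTI_LEVEL);
        context_set_fault_enable(ce);
        context_set_present(ce);        /* marking it present must come last */
}
#endif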
194
195 /*
196  * 0: readable
197  * 1: writable
198  * 2-6: reserved
199  * 7: super page
200  * 8-10: available
201  * 11: snoop behavior
202  * 12-63: Host physical address
203  */
204 struct dma_pte {
205         u64 val;
206 };
207
208 static inline void dma_clear_pte(struct dma_pte *pte)
209 {
210         pte->val = 0;
211 }
212
213 static inline void dma_set_pte_readable(struct dma_pte *pte)
214 {
215         pte->val |= DMA_PTE_READ;
216 }
217
218 static inline void dma_set_pte_writable(struct dma_pte *pte)
219 {
220         pte->val |= DMA_PTE_WRITE;
221 }
222
223 static inline void dma_set_pte_snp(struct dma_pte *pte)
224 {
225         pte->val |= DMA_PTE_SNP;
226 }
227
228 static inline void dma_set_pte_prot(struct dma_pte *pte, unsigned long prot)
229 {
230         pte->val = (pte->val & ~3) | (prot & 3);
231 }
232
233 static inline u64 dma_pte_addr(struct dma_pte *pte)
234 {
235 #ifdef CONFIG_64BIT
236         return pte->val & VTD_PAGE_MASK;
237 #else
238         /* Must have a full atomic 64-bit read */
239         return  __cmpxchg64(pte, 0ULL, 0ULL) & VTD_PAGE_MASK;
240 #endif
241 }
242
243 static inline void dma_set_pte_pfn(struct dma_pte *pte, unsigned long pfn)
244 {
245         pte->val |= (uint64_t)pfn << VTD_PAGE_SHIFT;
246 }
247
248 static inline bool dma_pte_present(struct dma_pte *pte)
249 {
250         return (pte->val & 3) != 0;
251 }
252
253 static inline int first_pte_in_page(struct dma_pte *pte)
254 {
255         return !((unsigned long)pte & ~VTD_PAGE_MASK);
256 }
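/*
 * Minimal sketch (not compiled into the driver): building a leaf PTE with
 * the helpers above.  'pfn' and 'prot' are hypothetical inputs; the page
 * table code below builds pte->val directly where it needs to batch.
 */
#if 0
static void dma_pte_example(struct dma_pte *pte, unsigned long pfn, int prot)
{
        dma_clear_pte(pte);
        dma_set_pte_pfn(pte, pfn);      /* host PFN into bits 12-63 */
        if (prot & DMA_PTE_READ)
                dma_set_pte_readable(pte);
        if (prot & DMA_PTE_WRITE)
                dma_set_pte_writable(pte);
        /* the entry is considered present once either R or W is set */
}
#endif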
257
258 /*
259  * This domain is a static identity mapping domain.
260  *      1. This domain creates a static 1:1 mapping to all usable memory.
261  *      2. It maps to each iommu if successful.
262  *      3. Each iommu maps to this domain if successful.
263  */
264 static struct dmar_domain *si_domain;
265 static int hw_pass_through = 1;
266
267 /* devices under the same p2p bridge are owned in one domain */
268 #define DOMAIN_FLAG_P2P_MULTIPLE_DEVICES (1 << 0)
269
270 /* domain represents a virtual machine; more than one device
271  * across iommus may be owned in one domain, e.g. a kvm guest.
272  */
273 #define DOMAIN_FLAG_VIRTUAL_MACHINE     (1 << 1)
274
275 /* si_domain contains multiple devices */
276 #define DOMAIN_FLAG_STATIC_IDENTITY     (1 << 2)
277
278 struct dmar_domain {
279         int     id;                     /* domain id */
280         int     nid;                    /* node id */
281         unsigned long iommu_bmp;        /* bitmap of iommus this domain uses*/
282
283         struct list_head devices;       /* all devices' list */
284         struct iova_domain iovad;       /* iova's that belong to this domain */
285
286         struct dma_pte  *pgd;           /* virtual address */
287         int             gaw;            /* max guest address width */
288
289         /* adjusted guest address width, 0 is level 2 30-bit */
290         int             agaw;
291
292         int             flags;          /* flags to find out type of domain */
293
294         int             iommu_coherency;/* indicate coherency of iommu access */
295         int             iommu_snooping; /* indicate snooping control feature*/
296         int             iommu_count;    /* reference count of iommu */
297         spinlock_t      iommu_lock;     /* protect iommu set in domain */
298         u64             max_addr;       /* maximum mapped address */
299 };
300
301 /* PCI domain-device relationship */
302 struct device_domain_info {
303         struct list_head link;  /* link to domain siblings */
304         struct list_head global; /* link to global list */
305         int segment;            /* PCI domain */
306         u8 bus;                 /* PCI bus number */
307         u8 devfn;               /* PCI devfn number */
308         struct pci_dev *dev; /* it's NULL for PCIE-to-PCI bridge */
309         struct intel_iommu *iommu; /* IOMMU used by this device */
310         struct dmar_domain *domain; /* pointer to domain */
311 };
312
313 static void flush_unmaps_timeout(unsigned long data);
314
315 DEFINE_TIMER(unmap_timer,  flush_unmaps_timeout, 0, 0);
316
317 #define HIGH_WATER_MARK 250
318 struct deferred_flush_tables {
319         int next;
320         struct iova *iova[HIGH_WATER_MARK];
321         struct dmar_domain *domain[HIGH_WATER_MARK];
322 };
323
324 static struct deferred_flush_tables *deferred_flush;
325
326 /* bitmap for indexing intel_iommus */
327 static int g_num_of_iommus;
328
329 static DEFINE_SPINLOCK(async_umap_flush_lock);
330 static LIST_HEAD(unmaps_to_do);
331
332 static int timer_on;
333 static long list_size;
334
335 static void domain_remove_dev_info(struct dmar_domain *domain);
336
337 #ifdef CONFIG_DMAR_DEFAULT_ON
338 int dmar_disabled = 0;
339 #else
340 int dmar_disabled = 1;
341 #endif /*CONFIG_DMAR_DEFAULT_ON*/
342
343 static int __initdata dmar_map_gfx = 1;
344 static int dmar_forcedac;
345 static int intel_iommu_strict;
346
347 #define DUMMY_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-1))
348 static DEFINE_SPINLOCK(device_domain_lock);
349 static LIST_HEAD(device_domain_list);
350
351 static struct iommu_ops intel_iommu_ops;
352
353 static int __init intel_iommu_setup(char *str)
354 {
355         if (!str)
356                 return -EINVAL;
357         while (*str) {
358                 if (!strncmp(str, "on", 2)) {
359                         dmar_disabled = 0;
360                         printk(KERN_INFO "Intel-IOMMU: enabled\n");
361                 } else if (!strncmp(str, "off", 3)) {
362                         dmar_disabled = 1;
363                         printk(KERN_INFO "Intel-IOMMU: disabled\n");
364                 } else if (!strncmp(str, "igfx_off", 8)) {
365                         dmar_map_gfx = 0;
366                         printk(KERN_INFO
367                                 "Intel-IOMMU: disable GFX device mapping\n");
368                 } else if (!strncmp(str, "forcedac", 8)) {
369                         printk(KERN_INFO
370                                 "Intel-IOMMU: Forcing DAC for PCI devices\n");
371                         dmar_forcedac = 1;
372                 } else if (!strncmp(str, "strict", 6)) {
373                         printk(KERN_INFO
374                                 "Intel-IOMMU: disable batched IOTLB flush\n");
375                         intel_iommu_strict = 1;
376                 }
377
378                 str += strcspn(str, ",");
379                 while (*str == ',')
380                         str++;
381         }
382         return 0;
383 }
384 __setup("intel_iommu=", intel_iommu_setup);
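/*
 * Example (based on the parser above): options are comma-separated on the
 * kernel command line, e.g.
 *
 *      intel_iommu=on,igfx_off,strict
 *
 * enables the IOMMU, skips the graphics device mapping and disables the
 * batched IOTLB flush.
 */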
385
386 static struct kmem_cache *iommu_domain_cache;
387 static struct kmem_cache *iommu_devinfo_cache;
388 static struct kmem_cache *iommu_iova_cache;
389
390 static inline void *iommu_kmem_cache_alloc(struct kmem_cache *cachep)
391 {
392         unsigned int flags;
393         void *vaddr;
394
395         /* trying to avoid low memory issues */
396         flags = current->flags & PF_MEMALLOC;
397         current->flags |= PF_MEMALLOC;
398         vaddr = kmem_cache_alloc(cachep, GFP_ATOMIC);
399         current->flags &= (~PF_MEMALLOC | flags);
400         return vaddr;
401 }
402
403
404 static inline void *alloc_pgtable_page(int node)
405 {
406         unsigned int flags;
407         struct page *page;
408         void *vaddr = NULL;
409
410         /* trying to avoid low memory issues */
411         flags = current->flags & PF_MEMALLOC;
412         current->flags |= PF_MEMALLOC;
413         page = alloc_pages_node(node, GFP_ATOMIC | __GFP_ZERO, 0);
414         if (page)
415                 vaddr = page_address(page);
416         current->flags &= (~PF_MEMALLOC | flags);
417         return vaddr;
418 }
419
420 static inline void free_pgtable_page(void *vaddr)
421 {
422         free_page((unsigned long)vaddr);
423 }
424
425 static inline void *alloc_domain_mem(void)
426 {
427         return iommu_kmem_cache_alloc(iommu_domain_cache);
428 }
429
430 static void free_domain_mem(void *vaddr)
431 {
432         kmem_cache_free(iommu_domain_cache, vaddr);
433 }
434
435 static inline void * alloc_devinfo_mem(void)
436 {
437         return iommu_kmem_cache_alloc(iommu_devinfo_cache);
438 }
439
440 static inline void free_devinfo_mem(void *vaddr)
441 {
442         kmem_cache_free(iommu_devinfo_cache, vaddr);
443 }
444
445 struct iova *alloc_iova_mem(void)
446 {
447         return iommu_kmem_cache_alloc(iommu_iova_cache);
448 }
449
450 void free_iova_mem(struct iova *iova)
451 {
452         kmem_cache_free(iommu_iova_cache, iova);
453 }
454
455
456 static inline int width_to_agaw(int width);
457
458 static int __iommu_calculate_agaw(struct intel_iommu *iommu, int max_gaw)
459 {
460         unsigned long sagaw;
461         int agaw = -1;
462
463         sagaw = cap_sagaw(iommu->cap);
464         for (agaw = width_to_agaw(max_gaw);
465              agaw >= 0; agaw--) {
466                 if (test_bit(agaw, &sagaw))
467                         break;
468         }
469
470         return agaw;
471 }
472
473 /*
474  * Calculate max SAGAW for each iommu.
475  */
476 int iommu_calculate_max_sagaw(struct intel_iommu *iommu)
477 {
478         return __iommu_calculate_agaw(iommu, MAX_AGAW_WIDTH);
479 }
480
481 /*
482  * Calculate agaw for each iommu.
483  * "SAGAW" may be different across iommus; use a default agaw and fall
484  * back to a smaller supported agaw for iommus that don't support it.
485  */
486 int iommu_calculate_agaw(struct intel_iommu *iommu)
487 {
488         return __iommu_calculate_agaw(iommu, DEFAULT_DOMAIN_ADDRESS_WIDTH);
489 }
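/*
 * Worked example: DEFAULT_DOMAIN_ADDRESS_WIDTH of 48 starts the search at
 * agaw 2 (4-level tables, since width_to_agaw(48) == (48 - 30) / 9 == 2).
 * If SAGAW only advertises 3-level support, the loop above settles on
 * agaw 1 (39-bit); if nothing at or below the starting agaw is set, -1 is
 * returned.
 */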
490
491 /* This function only returns a single iommu in a domain */
492 static struct intel_iommu *domain_get_iommu(struct dmar_domain *domain)
493 {
494         int iommu_id;
495
496         /* si_domain and vm domain should not get here. */
497         BUG_ON(domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE);
498         BUG_ON(domain->flags & DOMAIN_FLAG_STATIC_IDENTITY);
499
500         iommu_id = find_first_bit(&domain->iommu_bmp, g_num_of_iommus);
501         if (iommu_id < 0 || iommu_id >= g_num_of_iommus)
502                 return NULL;
503
504         return g_iommus[iommu_id];
505 }
506
507 static void domain_update_iommu_coherency(struct dmar_domain *domain)
508 {
509         int i;
510
511         domain->iommu_coherency = 1;
512
513         i = find_first_bit(&domain->iommu_bmp, g_num_of_iommus);
514         for (; i < g_num_of_iommus; ) {
515                 if (!ecap_coherent(g_iommus[i]->ecap)) {
516                         domain->iommu_coherency = 0;
517                         break;
518                 }
519                 i = find_next_bit(&domain->iommu_bmp, g_num_of_iommus, i+1);
520         }
521 }
522
523 static void domain_update_iommu_snooping(struct dmar_domain *domain)
524 {
525         int i;
526
527         domain->iommu_snooping = 1;
528
529         i = find_first_bit(&domain->iommu_bmp, g_num_of_iommus);
530         for (; i < g_num_of_iommus; ) {
531                 if (!ecap_sc_support(g_iommus[i]->ecap)) {
532                         domain->iommu_snooping = 0;
533                         break;
534                 }
535                 i = find_next_bit(&domain->iommu_bmp, g_num_of_iommus, i+1);
536         }
537 }
538
539 /* Some capabilities may be different across iommus */
540 static void domain_update_iommu_cap(struct dmar_domain *domain)
541 {
542         domain_update_iommu_coherency(domain);
543         domain_update_iommu_snooping(domain);
544 }
545
546 static struct intel_iommu *device_to_iommu(int segment, u8 bus, u8 devfn)
547 {
548         struct dmar_drhd_unit *drhd = NULL;
549         int i;
550
551         for_each_drhd_unit(drhd) {
552                 if (drhd->ignored)
553                         continue;
554                 if (segment != drhd->segment)
555                         continue;
556
557                 for (i = 0; i < drhd->devices_cnt; i++) {
558                         if (drhd->devices[i] &&
559                             drhd->devices[i]->bus->number == bus &&
560                             drhd->devices[i]->devfn == devfn)
561                                 return drhd->iommu;
562                         if (drhd->devices[i] &&
563                             drhd->devices[i]->subordinate &&
564                             drhd->devices[i]->subordinate->number <= bus &&
565                             drhd->devices[i]->subordinate->subordinate >= bus)
566                                 return drhd->iommu;
567                 }
568
569                 if (drhd->include_all)
570                         return drhd->iommu;
571         }
572
573         return NULL;
574 }
575
576 static void domain_flush_cache(struct dmar_domain *domain,
577                                void *addr, int size)
578 {
579         if (!domain->iommu_coherency)
580                 clflush_cache_range(addr, size);
581 }
582
583 /* Gets context entry for a given bus and devfn */
584 static struct context_entry * device_to_context_entry(struct intel_iommu *iommu,
585                 u8 bus, u8 devfn)
586 {
587         struct root_entry *root;
588         struct context_entry *context;
589         unsigned long phy_addr;
590         unsigned long flags;
591
592         spin_lock_irqsave(&iommu->lock, flags);
593         root = &iommu->root_entry[bus];
594         context = get_context_addr_from_root(root);
595         if (!context) {
596                 context = (struct context_entry *)
597                                 alloc_pgtable_page(iommu->node);
598                 if (!context) {
599                         spin_unlock_irqrestore(&iommu->lock, flags);
600                         return NULL;
601                 }
602                 __iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE);
603                 phy_addr = virt_to_phys((void *)context);
604                 set_root_value(root, phy_addr);
605                 set_root_present(root);
606                 __iommu_flush_cache(iommu, root, sizeof(*root));
607         }
608         spin_unlock_irqrestore(&iommu->lock, flags);
609         return &context[devfn];
610 }
611
612 static int device_context_mapped(struct intel_iommu *iommu, u8 bus, u8 devfn)
613 {
614         struct root_entry *root;
615         struct context_entry *context;
616         int ret;
617         unsigned long flags;
618
619         spin_lock_irqsave(&iommu->lock, flags);
620         root = &iommu->root_entry[bus];
621         context = get_context_addr_from_root(root);
622         if (!context) {
623                 ret = 0;
624                 goto out;
625         }
626         ret = context_present(&context[devfn]);
627 out:
628         spin_unlock_irqrestore(&iommu->lock, flags);
629         return ret;
630 }
631
632 static void clear_context_table(struct intel_iommu *iommu, u8 bus, u8 devfn)
633 {
634         struct root_entry *root;
635         struct context_entry *context;
636         unsigned long flags;
637
638         spin_lock_irqsave(&iommu->lock, flags);
639         root = &iommu->root_entry[bus];
640         context = get_context_addr_from_root(root);
641         if (context) {
642                 context_clear_entry(&context[devfn]);
643                 __iommu_flush_cache(iommu, &context[devfn], \
644                         sizeof(*context));
645         }
646         spin_unlock_irqrestore(&iommu->lock, flags);
647 }
648
649 static void free_context_table(struct intel_iommu *iommu)
650 {
651         struct root_entry *root;
652         int i;
653         unsigned long flags;
654         struct context_entry *context;
655
656         spin_lock_irqsave(&iommu->lock, flags);
657         if (!iommu->root_entry) {
658                 goto out;
659         }
660         for (i = 0; i < ROOT_ENTRY_NR; i++) {
661                 root = &iommu->root_entry[i];
662                 context = get_context_addr_from_root(root);
663                 if (context)
664                         free_pgtable_page(context);
665         }
666         free_pgtable_page(iommu->root_entry);
667         iommu->root_entry = NULL;
668 out:
669         spin_unlock_irqrestore(&iommu->lock, flags);
670 }
671
672 /* page table handling */
673 #define LEVEL_STRIDE            (9)
674 #define LEVEL_MASK              (((u64)1 << LEVEL_STRIDE) - 1)
675
676 static inline int agaw_to_level(int agaw)
677 {
678         return agaw + 2;
679 }
680
681 static inline int agaw_to_width(int agaw)
682 {
683         return 30 + agaw * LEVEL_STRIDE;
684
685 }
686
687 static inline int width_to_agaw(int width)
688 {
689         return (width - 30) / LEVEL_STRIDE;
690 }
691
692 static inline unsigned int level_to_offset_bits(int level)
693 {
694         return (level - 1) * LEVEL_STRIDE;
695 }
696
697 static inline int pfn_level_offset(unsigned long pfn, int level)
698 {
699         return (pfn >> level_to_offset_bits(level)) & LEVEL_MASK;
700 }
701
702 static inline unsigned long level_mask(int level)
703 {
704         return -1UL << level_to_offset_bits(level);
705 }
706
707 static inline unsigned long level_size(int level)
708 {
709         return 1UL << level_to_offset_bits(level);
710 }
711
712 static inline unsigned long align_to_level(unsigned long pfn, int level)
713 {
714         return (pfn + level_size(level) - 1) & level_mask(level);
715 }
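/*
 * Worked example for the helpers above (LEVEL_STRIDE == 9,
 * VTD_PAGE_SHIFT == 12):
 *
 *      agaw 0 -> 30-bit width, agaw_to_level() == 2
 *      agaw 1 -> 39-bit width, 3 levels
 *      agaw 2 -> 48-bit width, 4 levels
 *
 * For a 4-level table, pfn_level_offset(pfn, 4) selects bits 27-35 of the
 * DMA pfn (address bits 39-47), i.e. the top-level index.
 */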
716
717 static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain,
718                                       unsigned long pfn)
719 {
720         int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
721         struct dma_pte *parent, *pte = NULL;
722         int level = agaw_to_level(domain->agaw);
723         int offset;
724
725         BUG_ON(!domain->pgd);
726         BUG_ON(addr_width < BITS_PER_LONG && pfn >> addr_width);
727         parent = domain->pgd;
728
729         while (level > 0) {
730                 void *tmp_page;
731
732                 offset = pfn_level_offset(pfn, level);
733                 pte = &parent[offset];
734                 if (level == 1)
735                         break;
736
737                 if (!dma_pte_present(pte)) {
738                         uint64_t pteval;
739
740                         tmp_page = alloc_pgtable_page(domain->nid);
741
742                         if (!tmp_page)
743                                 return NULL;
744
745                         domain_flush_cache(domain, tmp_page, VTD_PAGE_SIZE);
746                         pteval = ((uint64_t)virt_to_dma_pfn(tmp_page) << VTD_PAGE_SHIFT) | DMA_PTE_READ | DMA_PTE_WRITE;
747                         if (cmpxchg64(&pte->val, 0ULL, pteval)) {
748                                 /* Someone else set it while we were thinking; use theirs. */
749                                 free_pgtable_page(tmp_page);
750                         } else {
751                                 dma_pte_addr(pte);
752                                 domain_flush_cache(domain, pte, sizeof(*pte));
753                         }
754                 }
755                 parent = phys_to_virt(dma_pte_addr(pte));
756                 level--;
757         }
758
759         return pte;
760 }
761
762 /* return address's pte at specific level */
763 static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain,
764                                          unsigned long pfn,
765                                          int level)
766 {
767         struct dma_pte *parent, *pte = NULL;
768         int total = agaw_to_level(domain->agaw);
769         int offset;
770
771         parent = domain->pgd;
772         while (level <= total) {
773                 offset = pfn_level_offset(pfn, total);
774                 pte = &parent[offset];
775                 if (level == total)
776                         return pte;
777
778                 if (!dma_pte_present(pte))
779                         break;
780                 parent = phys_to_virt(dma_pte_addr(pte));
781                 total--;
782         }
783         return NULL;
784 }
785
786 /* clear last level pte; a tlb flush should follow */
787 static void dma_pte_clear_range(struct dmar_domain *domain,
788                                 unsigned long start_pfn,
789                                 unsigned long last_pfn)
790 {
791         int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
792         struct dma_pte *first_pte, *pte;
793
794         BUG_ON(addr_width < BITS_PER_LONG && start_pfn >> addr_width);
795         BUG_ON(addr_width < BITS_PER_LONG && last_pfn >> addr_width);
796         BUG_ON(start_pfn > last_pfn);
797
798         /* we don't need lock here; nobody else touches the iova range */
799         do {
800                 first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1);
801                 if (!pte) {
802                         start_pfn = align_to_level(start_pfn + 1, 2);
803                         continue;
804                 }
805                 do {
806                         dma_clear_pte(pte);
807                         start_pfn++;
808                         pte++;
809                 } while (start_pfn <= last_pfn && !first_pte_in_page(pte));
810
811                 domain_flush_cache(domain, first_pte,
812                                    (void *)pte - (void *)first_pte);
813
814         } while (start_pfn && start_pfn <= last_pfn);
815 }
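/*
 * Minimal sketch (not compiled into the driver): as the comment above says,
 * a TLB flush must follow the PTE clear.  The unmap paths pair the two
 * roughly like this; 'iommu' is the unit serving the domain.
 */
#if 0
static void clear_and_flush_example(struct dmar_domain *domain,
                                    struct intel_iommu *iommu,
                                    unsigned long start_pfn,
                                    unsigned long last_pfn)
{
        dma_pte_clear_range(domain, start_pfn, last_pfn);
        iommu_flush_iotlb_psi(iommu, domain->id, start_pfn,
                              last_pfn - start_pfn + 1);
}
#endif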
816
817 /* free page table pages. last level pte should already be cleared */
818 static void dma_pte_free_pagetable(struct dmar_domain *domain,
819                                    unsigned long start_pfn,
820                                    unsigned long last_pfn)
821 {
822         int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
823         struct dma_pte *first_pte, *pte;
824         int total = agaw_to_level(domain->agaw);
825         int level;
826         unsigned long tmp;
827
828         BUG_ON(addr_width < BITS_PER_LONG && start_pfn >> addr_width);
829         BUG_ON(addr_width < BITS_PER_LONG && last_pfn >> addr_width);
830         BUG_ON(start_pfn > last_pfn);
831
832         /* We don't need lock here; nobody else touches the iova range */
833         level = 2;
834         while (level <= total) {
835                 tmp = align_to_level(start_pfn, level);
836
837                 /* If we can't even clear one PTE at this level, we're done */
838                 if (tmp + level_size(level) - 1 > last_pfn)
839                         return;
840
841                 do {
842                         first_pte = pte = dma_pfn_level_pte(domain, tmp, level);
843                         if (!pte) {
844                                 tmp = align_to_level(tmp + 1, level + 1);
845                                 continue;
846                         }
847                         do {
848                                 if (dma_pte_present(pte)) {
849                                         free_pgtable_page(phys_to_virt(dma_pte_addr(pte)));
850                                         dma_clear_pte(pte);
851                                 }
852                                 pte++;
853                                 tmp += level_size(level);
854                         } while (!first_pte_in_page(pte) &&
855                                  tmp + level_size(level) - 1 <= last_pfn);
856
857                         domain_flush_cache(domain, first_pte,
858                                            (void *)pte - (void *)first_pte);
859
860                 } while (tmp && tmp + level_size(level) - 1 <= last_pfn);
861                 level++;
862         }
863         /* free pgd */
864         if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
865                 free_pgtable_page(domain->pgd);
866                 domain->pgd = NULL;
867         }
868 }
869
870 /* iommu handling */
871 static int iommu_alloc_root_entry(struct intel_iommu *iommu)
872 {
873         struct root_entry *root;
874         unsigned long flags;
875
876         root = (struct root_entry *)alloc_pgtable_page(iommu->node);
877         if (!root)
878                 return -ENOMEM;
879
880         __iommu_flush_cache(iommu, root, ROOT_SIZE);
881
882         spin_lock_irqsave(&iommu->lock, flags);
883         iommu->root_entry = root;
884         spin_unlock_irqrestore(&iommu->lock, flags);
885
886         return 0;
887 }
888
889 static void iommu_set_root_entry(struct intel_iommu *iommu)
890 {
891         void *addr;
892         u32 sts;
893         unsigned long flag;
894
895         addr = iommu->root_entry;
896
897         spin_lock_irqsave(&iommu->register_lock, flag);
898         dmar_writeq(iommu->reg + DMAR_RTADDR_REG, virt_to_phys(addr));
899
900         writel(iommu->gcmd | DMA_GCMD_SRTP, iommu->reg + DMAR_GCMD_REG);
901
902         /* Make sure hardware complete it */
903         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
904                       readl, (sts & DMA_GSTS_RTPS), sts);
905
906         spin_unlock_irqrestore(&iommu->register_lock, flag);
907 }
908
909 static void iommu_flush_write_buffer(struct intel_iommu *iommu)
910 {
911         u32 val;
912         unsigned long flag;
913
914         if (!rwbf_quirk && !cap_rwbf(iommu->cap))
915                 return;
916
917         spin_lock_irqsave(&iommu->register_lock, flag);
918         writel(iommu->gcmd | DMA_GCMD_WBF, iommu->reg + DMAR_GCMD_REG);
919
920         /* Make sure hardware complete it */
921         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
922                       readl, (!(val & DMA_GSTS_WBFS)), val);
923
924         spin_unlock_irqrestore(&iommu->register_lock, flag);
925 }
926
927 /* return value determines if we need a write buffer flush */
928 static void __iommu_flush_context(struct intel_iommu *iommu,
929                                   u16 did, u16 source_id, u8 function_mask,
930                                   u64 type)
931 {
932         u64 val = 0;
933         unsigned long flag;
934
935         switch (type) {
936         case DMA_CCMD_GLOBAL_INVL:
937                 val = DMA_CCMD_GLOBAL_INVL;
938                 break;
939         case DMA_CCMD_DOMAIN_INVL:
940                 val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
941                 break;
942         case DMA_CCMD_DEVICE_INVL:
943                 val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
944                         | DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
945                 break;
946         default:
947                 BUG();
948         }
949         val |= DMA_CCMD_ICC;
950
951         spin_lock_irqsave(&iommu->register_lock, flag);
952         dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);
953
954         /* Make sure hardware complete it */
955         IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
956                 dmar_readq, (!(val & DMA_CCMD_ICC)), val);
957
958         spin_unlock_irqrestore(&iommu->register_lock, flag);
959 }
960
961 /* return value determines if we need a write buffer flush */
962 static void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
963                                 u64 addr, unsigned int size_order, u64 type)
964 {
965         int tlb_offset = ecap_iotlb_offset(iommu->ecap);
966         u64 val = 0, val_iva = 0;
967         unsigned long flag;
968
969         switch (type) {
970         case DMA_TLB_GLOBAL_FLUSH:
971                 /* global flush doesn't need to set IVA_REG */
972                 val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
973                 break;
974         case DMA_TLB_DSI_FLUSH:
975                 val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
976                 break;
977         case DMA_TLB_PSI_FLUSH:
978                 val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
979                 /* Note: always flush non-leaf currently */
980                 val_iva = size_order | addr;
981                 break;
982         default:
983                 BUG();
984         }
985         /* Note: set drain read/write */
986 #if 0
987         /*
988          * This is probably just being extra safe; it looks like we can
989          * ignore it without any impact.
990          */
991         if (cap_read_drain(iommu->cap))
992                 val |= DMA_TLB_READ_DRAIN;
993 #endif
994         if (cap_write_drain(iommu->cap))
995                 val |= DMA_TLB_WRITE_DRAIN;
996
997         spin_lock_irqsave(&iommu->register_lock, flag);
998         /* Note: Only uses first TLB reg currently */
999         if (val_iva)
1000                 dmar_writeq(iommu->reg + tlb_offset, val_iva);
1001         dmar_writeq(iommu->reg + tlb_offset + 8, val);
1002
1003         /* Make sure hardware complete it */
1004         IOMMU_WAIT_OP(iommu, tlb_offset + 8,
1005                 dmar_readq, (!(val & DMA_TLB_IVT)), val);
1006
1007         spin_unlock_irqrestore(&iommu->register_lock, flag);
1008
1009         /* check IOTLB invalidation granularity */
1010         if (DMA_TLB_IAIG(val) == 0)
1011                 printk(KERN_ERR "IOMMU: flush IOTLB failed\n");
1012         if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
1013                 pr_debug("IOMMU: tlb flush request %Lx, actual %Lx\n",
1014                         (unsigned long long)DMA_TLB_IIRG(type),
1015                         (unsigned long long)DMA_TLB_IAIG(val));
1016 }
1017
1018 static struct device_domain_info *iommu_support_dev_iotlb(
1019         struct dmar_domain *domain, int segment, u8 bus, u8 devfn)
1020 {
1021         int found = 0;
1022         unsigned long flags;
1023         struct device_domain_info *info;
1024         struct intel_iommu *iommu = device_to_iommu(segment, bus, devfn);
1025
1026         if (!ecap_dev_iotlb_support(iommu->ecap))
1027                 return NULL;
1028
1029         if (!iommu->qi)
1030                 return NULL;
1031
1032         spin_lock_irqsave(&device_domain_lock, flags);
1033         list_for_each_entry(info, &domain->devices, link)
1034                 if (info->bus == bus && info->devfn == devfn) {
1035                         found = 1;
1036                         break;
1037                 }
1038         spin_unlock_irqrestore(&device_domain_lock, flags);
1039
1040         if (!found || !info->dev)
1041                 return NULL;
1042
1043         if (!pci_find_ext_capability(info->dev, PCI_EXT_CAP_ID_ATS))
1044                 return NULL;
1045
1046         if (!dmar_find_matched_atsr_unit(info->dev))
1047                 return NULL;
1048
1049         info->iommu = iommu;
1050
1051         return info;
1052 }
1053
1054 static void iommu_enable_dev_iotlb(struct device_domain_info *info)
1055 {
1056         if (!info)
1057                 return;
1058
1059         pci_enable_ats(info->dev, VTD_PAGE_SHIFT);
1060 }
1061
1062 static void iommu_disable_dev_iotlb(struct device_domain_info *info)
1063 {
1064         if (!info->dev || !pci_ats_enabled(info->dev))
1065                 return;
1066
1067         pci_disable_ats(info->dev);
1068 }
1069
1070 static void iommu_flush_dev_iotlb(struct dmar_domain *domain,
1071                                   u64 addr, unsigned mask)
1072 {
1073         u16 sid, qdep;
1074         unsigned long flags;
1075         struct device_domain_info *info;
1076
1077         spin_lock_irqsave(&device_domain_lock, flags);
1078         list_for_each_entry(info, &domain->devices, link) {
1079                 if (!info->dev || !pci_ats_enabled(info->dev))
1080                         continue;
1081
1082                 sid = info->bus << 8 | info->devfn;
1083                 qdep = pci_ats_queue_depth(info->dev);
1084                 qi_flush_dev_iotlb(info->iommu, sid, qdep, addr, mask);
1085         }
1086         spin_unlock_irqrestore(&device_domain_lock, flags);
1087 }
1088
1089 static void iommu_flush_iotlb_psi(struct intel_iommu *iommu, u16 did,
1090                                   unsigned long pfn, unsigned int pages)
1091 {
1092         unsigned int mask = ilog2(__roundup_pow_of_two(pages));
1093         uint64_t addr = (uint64_t)pfn << VTD_PAGE_SHIFT;
1094
1095         BUG_ON(pages == 0);
1096
1097         /*
1098          * Fall back to domain-selective flush if there is no PSI support
1099          * or the size is too big.
1100          * PSI requires the page size to be a power of two, with the base
1101          * address naturally aligned to that size.
1102          */
1103         if (!cap_pgsel_inv(iommu->cap) || mask > cap_max_amask_val(iommu->cap))
1104                 iommu->flush.flush_iotlb(iommu, did, 0, 0,
1105                                                 DMA_TLB_DSI_FLUSH);
1106         else
1107                 iommu->flush.flush_iotlb(iommu, did, addr, mask,
1108                                                 DMA_TLB_PSI_FLUSH);
1109
1110         /*
1111          * In caching mode, domain ID 0 is reserved for non-present to present
1112          * mapping flush. Device IOTLB doesn't need to be flushed in this case.
1113          */
1114         if (!cap_caching_mode(iommu->cap) || did)
1115                 iommu_flush_dev_iotlb(iommu->domains[did], addr, mask);
1116 }
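/*
 * Worked example for the mask computed above: flushing 5 pages rounds up
 * to 8, so mask == ilog2(8) == 3 and the hardware invalidates the naturally
 * aligned 8-page region containing the base address.  If the mask exceeds
 * cap_max_amask_val(), the code falls back to a domain-selective flush.
 */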
1117
1118 static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
1119 {
1120         u32 pmen;
1121         unsigned long flags;
1122
1123         spin_lock_irqsave(&iommu->register_lock, flags);
1124         pmen = readl(iommu->reg + DMAR_PMEN_REG);
1125         pmen &= ~DMA_PMEN_EPM;
1126         writel(pmen, iommu->reg + DMAR_PMEN_REG);
1127
1128         /* wait for the protected region status bit to clear */
1129         IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
1130                 readl, !(pmen & DMA_PMEN_PRS), pmen);
1131
1132         spin_unlock_irqrestore(&iommu->register_lock, flags);
1133 }
1134
1135 static int iommu_enable_translation(struct intel_iommu *iommu)
1136 {
1137         u32 sts;
1138         unsigned long flags;
1139
1140         spin_lock_irqsave(&iommu->register_lock, flags);
1141         iommu->gcmd |= DMA_GCMD_TE;
1142         writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1143
1144         /* Make sure hardware complete it */
1145         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1146                       readl, (sts & DMA_GSTS_TES), sts);
1147
1148         spin_unlock_irqrestore(&iommu->register_lock, flags);
1149         return 0;
1150 }
1151
1152 static int iommu_disable_translation(struct intel_iommu *iommu)
1153 {
1154         u32 sts;
1155         unsigned long flag;
1156
1157         spin_lock_irqsave(&iommu->register_lock, flag);
1158         iommu->gcmd &= ~DMA_GCMD_TE;
1159         writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1160
1161         /* Make sure hardware complete it */
1162         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1163                       readl, (!(sts & DMA_GSTS_TES)), sts);
1164
1165         spin_unlock_irqrestore(&iommu->register_lock, flag);
1166         return 0;
1167 }
1168
1169
1170 static int iommu_init_domains(struct intel_iommu *iommu)
1171 {
1172         unsigned long ndomains;
1173         unsigned long nlongs;
1174
1175         ndomains = cap_ndoms(iommu->cap);
1176         pr_debug("Number of Domains supported <%ld>\n", ndomains);
1177         nlongs = BITS_TO_LONGS(ndomains);
1178
1179         spin_lock_init(&iommu->lock);
1180
1181         /* TBD: there might be 64K domains,
1182          * consider another allocation scheme for future chips
1183          */
1184         iommu->domain_ids = kcalloc(nlongs, sizeof(unsigned long), GFP_KERNEL);
1185         if (!iommu->domain_ids) {
1186                 printk(KERN_ERR "Allocating domain id array failed\n");
1187                 return -ENOMEM;
1188         }
1189         iommu->domains = kcalloc(ndomains, sizeof(struct dmar_domain *),
1190                         GFP_KERNEL);
1191         if (!iommu->domains) {
1192                 printk(KERN_ERR "Allocating domain array failed\n");
1193                 return -ENOMEM;
1194         }
1195
1196         /*
1197          * If Caching mode is set, then invalid translations are tagged
1198          * with domain id 0. Hence we need to pre-allocate it.
1199          */
1200         if (cap_caching_mode(iommu->cap))
1201                 set_bit(0, iommu->domain_ids);
1202         return 0;
1203 }
1204
1205
1206 static void domain_exit(struct dmar_domain *domain);
1207 static void vm_domain_exit(struct dmar_domain *domain);
1208
1209 void free_dmar_iommu(struct intel_iommu *iommu)
1210 {
1211         struct dmar_domain *domain;
1212         int i;
1213         unsigned long flags;
1214
1215         if ((iommu->domains) && (iommu->domain_ids)) {
1216                 i = find_first_bit(iommu->domain_ids, cap_ndoms(iommu->cap));
1217                 for (; i < cap_ndoms(iommu->cap); ) {
1218                         domain = iommu->domains[i];
1219                         clear_bit(i, iommu->domain_ids);
1220
1221                         spin_lock_irqsave(&domain->iommu_lock, flags);
1222                         if (--domain->iommu_count == 0) {
1223                                 if (domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE)
1224                                         vm_domain_exit(domain);
1225                                 else
1226                                         domain_exit(domain);
1227                         }
1228                         spin_unlock_irqrestore(&domain->iommu_lock, flags);
1229
1230                         i = find_next_bit(iommu->domain_ids,
1231                                 cap_ndoms(iommu->cap), i+1);
1232                 }
1233         }
1234
1235         if (iommu->gcmd & DMA_GCMD_TE)
1236                 iommu_disable_translation(iommu);
1237
1238         if (iommu->irq) {
1239                 set_irq_data(iommu->irq, NULL);
1240                 /* This will mask the irq */
1241                 free_irq(iommu->irq, iommu);
1242                 destroy_irq(iommu->irq);
1243         }
1244
1245         kfree(iommu->domains);
1246         kfree(iommu->domain_ids);
1247
1248         g_iommus[iommu->seq_id] = NULL;
1249
1250         /* if all iommus are freed, free g_iommus */
1251         for (i = 0; i < g_num_of_iommus; i++) {
1252                 if (g_iommus[i])
1253                         break;
1254         }
1255
1256         if (i == g_num_of_iommus)
1257                 kfree(g_iommus);
1258
1259         /* free context mapping */
1260         free_context_table(iommu);
1261 }
1262
1263 static struct dmar_domain *alloc_domain(void)
1264 {
1265         struct dmar_domain *domain;
1266
1267         domain = alloc_domain_mem();
1268         if (!domain)
1269                 return NULL;
1270
1271         domain->nid = -1;
1272         memset(&domain->iommu_bmp, 0, sizeof(unsigned long));
1273         domain->flags = 0;
1274
1275         return domain;
1276 }
1277
1278 static int iommu_attach_domain(struct dmar_domain *domain,
1279                                struct intel_iommu *iommu)
1280 {
1281         int num;
1282         unsigned long ndomains;
1283         unsigned long flags;
1284
1285         ndomains = cap_ndoms(iommu->cap);
1286
1287         spin_lock_irqsave(&iommu->lock, flags);
1288
1289         num = find_first_zero_bit(iommu->domain_ids, ndomains);
1290         if (num >= ndomains) {
1291                 spin_unlock_irqrestore(&iommu->lock, flags);
1292                 printk(KERN_ERR "IOMMU: no free domain ids\n");
1293                 return -ENOMEM;
1294         }
1295
1296         domain->id = num;
1297         set_bit(num, iommu->domain_ids);
1298         set_bit(iommu->seq_id, &domain->iommu_bmp);
1299         iommu->domains[num] = domain;
1300         spin_unlock_irqrestore(&iommu->lock, flags);
1301
1302         return 0;
1303 }
1304
1305 static void iommu_detach_domain(struct dmar_domain *domain,
1306                                 struct intel_iommu *iommu)
1307 {
1308         unsigned long flags;
1309         int num, ndomains;
1310         int found = 0;
1311
1312         spin_lock_irqsave(&iommu->lock, flags);
1313         ndomains = cap_ndoms(iommu->cap);
1314         num = find_first_bit(iommu->domain_ids, ndomains);
1315         for (; num < ndomains; ) {
1316                 if (iommu->domains[num] == domain) {
1317                         found = 1;
1318                         break;
1319                 }
1320                 num = find_next_bit(iommu->domain_ids,
1321                                     cap_ndoms(iommu->cap), num+1);
1322         }
1323
1324         if (found) {
1325                 clear_bit(num, iommu->domain_ids);
1326                 clear_bit(iommu->seq_id, &domain->iommu_bmp);
1327                 iommu->domains[num] = NULL;
1328         }
1329         spin_unlock_irqrestore(&iommu->lock, flags);
1330 }
1331
1332 static struct iova_domain reserved_iova_list;
1333 static struct lock_class_key reserved_rbtree_key;
1334
1335 static void dmar_init_reserved_ranges(void)
1336 {
1337         struct pci_dev *pdev = NULL;
1338         struct iova *iova;
1339         int i;
1340
1341         init_iova_domain(&reserved_iova_list, DMA_32BIT_PFN);
1342
1343         lockdep_set_class(&reserved_iova_list.iova_rbtree_lock,
1344                 &reserved_rbtree_key);
1345
1346         /* IOAPIC ranges shouldn't be accessed by DMA */
1347         iova = reserve_iova(&reserved_iova_list, IOVA_PFN(IOAPIC_RANGE_START),
1348                 IOVA_PFN(IOAPIC_RANGE_END));
1349         if (!iova)
1350                 printk(KERN_ERR "Reserve IOAPIC range failed\n");
1351
1352         /* Reserve all PCI MMIO to avoid peer-to-peer access */
1353         for_each_pci_dev(pdev) {
1354                 struct resource *r;
1355
1356                 for (i = 0; i < PCI_NUM_RESOURCES; i++) {
1357                         r = &pdev->resource[i];
1358                         if (!r->flags || !(r->flags & IORESOURCE_MEM))
1359                                 continue;
1360                         iova = reserve_iova(&reserved_iova_list,
1361                                             IOVA_PFN(r->start),
1362                                             IOVA_PFN(r->end));
1363                         if (!iova)
1364                                 printk(KERN_ERR "Reserve iova failed\n");
1365                 }
1366         }
1367
1368 }
1369
1370 static void domain_reserve_special_ranges(struct dmar_domain *domain)
1371 {
1372         copy_reserved_iova(&reserved_iova_list, &domain->iovad);
1373 }
1374
1375 static inline int guestwidth_to_adjustwidth(int gaw)
1376 {
1377         int agaw;
1378         int r = (gaw - 12) % 9;
1379
1380         if (r == 0)
1381                 agaw = gaw;
1382         else
1383                 agaw = gaw + 9 - r;
1384         if (agaw > 64)
1385                 agaw = 64;
1386         return agaw;
1387 }
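/*
 * Worked example: the adjusted width is gaw rounded up to 12 plus a whole
 * number of 9-bit levels, capped at 64.  So gaw 48 stays 48, gaw 40 becomes
 * 48, and gaw 36 becomes 39.
 */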
1388
1389 static int domain_init(struct dmar_domain *domain, int guest_width)
1390 {
1391         struct intel_iommu *iommu;
1392         int adjust_width, agaw;
1393         unsigned long sagaw;
1394
1395         init_iova_domain(&domain->iovad, DMA_32BIT_PFN);
1396         spin_lock_init(&domain->iommu_lock);
1397
1398         domain_reserve_special_ranges(domain);
1399
1400         /* calculate AGAW */
1401         iommu = domain_get_iommu(domain);
1402         if (guest_width > cap_mgaw(iommu->cap))
1403                 guest_width = cap_mgaw(iommu->cap);
1404         domain->gaw = guest_width;
1405         adjust_width = guestwidth_to_adjustwidth(guest_width);
1406         agaw = width_to_agaw(adjust_width);
1407         sagaw = cap_sagaw(iommu->cap);
1408         if (!test_bit(agaw, &sagaw)) {
1409                 /* hardware doesn't support it, choose a bigger one */
1410                 pr_debug("IOMMU: hardware doesn't support agaw %d\n", agaw);
1411                 agaw = find_next_bit(&sagaw, 5, agaw);
1412                 if (agaw >= 5)
1413                         return -ENODEV;
1414         }
1415         domain->agaw = agaw;
1416         INIT_LIST_HEAD(&domain->devices);
1417
1418         if (ecap_coherent(iommu->ecap))
1419                 domain->iommu_coherency = 1;
1420         else
1421                 domain->iommu_coherency = 0;
1422
1423         if (ecap_sc_support(iommu->ecap))
1424                 domain->iommu_snooping = 1;
1425         else
1426                 domain->iommu_snooping = 0;
1427
1428         domain->iommu_count = 1;
1429         domain->nid = iommu->node;
1430
1431         /* always allocate the top pgd */
1432         domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
1433         if (!domain->pgd)
1434                 return -ENOMEM;
1435         __iommu_flush_cache(iommu, domain->pgd, PAGE_SIZE);
1436         return 0;
1437 }
1438
1439 static void domain_exit(struct dmar_domain *domain)
1440 {
1441         struct dmar_drhd_unit *drhd;
1442         struct intel_iommu *iommu;
1443
1444         /* Domain 0 is reserved, so don't process it */
1445         if (!domain)
1446                 return;
1447
1448         domain_remove_dev_info(domain);
1449         /* destroy iovas */
1450         put_iova_domain(&domain->iovad);
1451
1452         /* clear ptes */
1453         dma_pte_clear_range(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
1454
1455         /* free page tables */
1456         dma_pte_free_pagetable(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
1457
1458         for_each_active_iommu(iommu, drhd)
1459                 if (test_bit(iommu->seq_id, &domain->iommu_bmp))
1460                         iommu_detach_domain(domain, iommu);
1461
1462         free_domain_mem(domain);
1463 }
1464
1465 static int domain_context_mapping_one(struct dmar_domain *domain, int segment,
1466                                  u8 bus, u8 devfn, int translation)
1467 {
1468         struct context_entry *context;
1469         unsigned long flags;
1470         struct intel_iommu *iommu;
1471         struct dma_pte *pgd;
1472         unsigned long num;
1473         unsigned long ndomains;
1474         int id;
1475         int agaw;
1476         struct device_domain_info *info = NULL;
1477
1478         pr_debug("Set context mapping for %02x:%02x.%d\n",
1479                 bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
1480
1481         BUG_ON(!domain->pgd);
1482         BUG_ON(translation != CONTEXT_TT_PASS_THROUGH &&
1483                translation != CONTEXT_TT_MULTI_LEVEL);
1484
1485         iommu = device_to_iommu(segment, bus, devfn);
1486         if (!iommu)
1487                 return -ENODEV;
1488
1489         context = device_to_context_entry(iommu, bus, devfn);
1490         if (!context)
1491                 return -ENOMEM;
1492         spin_lock_irqsave(&iommu->lock, flags);
1493         if (context_present(context)) {
1494                 spin_unlock_irqrestore(&iommu->lock, flags);
1495                 return 0;
1496         }
1497
1498         id = domain->id;
1499         pgd = domain->pgd;
1500
1501         if (domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE ||
1502             domain->flags & DOMAIN_FLAG_STATIC_IDENTITY) {
1503                 int found = 0;
1504
1505                 /* find an available domain id for this device in iommu */
1506                 ndomains = cap_ndoms(iommu->cap);
1507                 num = find_first_bit(iommu->domain_ids, ndomains);
1508                 for (; num < ndomains; ) {
1509                         if (iommu->domains[num] == domain) {
1510                                 id = num;
1511                                 found = 1;
1512                                 break;
1513                         }
1514                         num = find_next_bit(iommu->domain_ids,
1515                                             cap_ndoms(iommu->cap), num+1);
1516                 }
1517
1518                 if (found == 0) {
1519                         num = find_first_zero_bit(iommu->domain_ids, ndomains);
1520                         if (num >= ndomains) {
1521                                 spin_unlock_irqrestore(&iommu->lock, flags);
1522                                 printk(KERN_ERR "IOMMU: no free domain ids\n");
1523                                 return -EFAULT;
1524                         }
1525
1526                         set_bit(num, iommu->domain_ids);
1527                         iommu->domains[num] = domain;
1528                         id = num;
1529                 }
1530
1531                 /* Skip top levels of page tables for
1532                  * iommus which have less agaw than the default.
1533                  */
1534                 for (agaw = domain->agaw; agaw != iommu->agaw; agaw--) {
1535                         pgd = phys_to_virt(dma_pte_addr(pgd));
1536                         if (!dma_pte_present(pgd)) {
1537                                 spin_unlock_irqrestore(&iommu->lock, flags);
1538                                 return -ENOMEM;
1539                         }
1540                 }
1541         }
1542
1543         context_set_domain_id(context, id);
1544
1545         if (translation != CONTEXT_TT_PASS_THROUGH) {
1546                 info = iommu_support_dev_iotlb(domain, segment, bus, devfn);
1547                 translation = info ? CONTEXT_TT_DEV_IOTLB :
1548                                      CONTEXT_TT_MULTI_LEVEL;
1549         }
1550         /*
1551          * In pass through mode, AW must be programmed to indicate the largest
1552          * AGAW value supported by hardware. And ASR is ignored by hardware.
1553          */
1554         if (unlikely(translation == CONTEXT_TT_PASS_THROUGH))
1555                 context_set_address_width(context, iommu->msagaw);
1556         else {
1557                 context_set_address_root(context, virt_to_phys(pgd));
1558                 context_set_address_width(context, iommu->agaw);
1559         }
1560
1561         context_set_translation_type(context, translation);
1562         context_set_fault_enable(context);
1563         context_set_present(context);
1564         domain_flush_cache(domain, context, sizeof(*context));
1565
1566         /*
1567          * It's a non-present to present mapping. If hardware doesn't cache
1568          * non-present entries we only need to flush the write-buffer. If it
1569          * _does_ cache non-present entries, then it does so in the special
1570          * domain #0, which we have to flush:
1571          */
1572         if (cap_caching_mode(iommu->cap)) {
1573                 iommu->flush.flush_context(iommu, 0,
1574                                            (((u16)bus) << 8) | devfn,
1575                                            DMA_CCMD_MASK_NOBIT,
1576                                            DMA_CCMD_DEVICE_INVL);
1577                 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_DSI_FLUSH);
1578         } else {
1579                 iommu_flush_write_buffer(iommu);
1580         }
1581         iommu_enable_dev_iotlb(info);
1582         spin_unlock_irqrestore(&iommu->lock, flags);
1583
1584         spin_lock_irqsave(&domain->iommu_lock, flags);
1585         if (!test_and_set_bit(iommu->seq_id, &domain->iommu_bmp)) {
1586                 domain->iommu_count++;
1587                 if (domain->iommu_count == 1)
1588                         domain->nid = iommu->node;
1589                 domain_update_iommu_cap(domain);
1590         }
1591         spin_unlock_irqrestore(&domain->iommu_lock, flags);
1592         return 0;
1593 }
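
/*
 * Illustrative sketch, not part of the original driver: the device-selective
 * context-cache flush above identifies the target device by its PCI source
 * id, the 16-bit bus/devfn tuple.  The helper below is hypothetical and only
 * restates that packing for clarity.
 */
static inline u16 example_source_id(u8 bus, u8 devfn)
{
	/* bus number in bits 15:8, device/function number in bits 7:0 */
	return ((u16)bus << 8) | devfn;
}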
1594
1595 static int
1596 domain_context_mapping(struct dmar_domain *domain, struct pci_dev *pdev,
1597                         int translation)
1598 {
1599         int ret;
1600         struct pci_dev *tmp, *parent;
1601
1602         ret = domain_context_mapping_one(domain, pci_domain_nr(pdev->bus),
1603                                          pdev->bus->number, pdev->devfn,
1604                                          translation);
1605         if (ret)
1606                 return ret;
1607
1608         /* dependent device mapping */
1609         tmp = pci_find_upstream_pcie_bridge(pdev);
1610         if (!tmp)
1611                 return 0;
1612         /* Secondary interface's bus number and devfn 0 */
1613         parent = pdev->bus->self;
1614         while (parent != tmp) {
1615                 ret = domain_context_mapping_one(domain,
1616                                                  pci_domain_nr(parent->bus),
1617                                                  parent->bus->number,
1618                                                  parent->devfn, translation);
1619                 if (ret)
1620                         return ret;
1621                 parent = parent->bus->self;
1622         }
1623         if (tmp->is_pcie) /* this is a PCIE-to-PCI bridge */
1624                 return domain_context_mapping_one(domain,
1625                                         pci_domain_nr(tmp->subordinate),
1626                                         tmp->subordinate->number, 0,
1627                                         translation);
1628         else /* this is a legacy PCI bridge */
1629                 return domain_context_mapping_one(domain,
1630                                                   pci_domain_nr(tmp->bus),
1631                                                   tmp->bus->number,
1632                                                   tmp->devfn,
1633                                                   translation);
1634 }
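
/*
 * Illustrative sketch, not part of the original driver: requests from devices
 * behind a bridge arrive tagged with the bridge's source id, which is why the
 * walk above also sets up context entries for the upstream bridges.  Assuming
 * the same pci_dev fields used above, a hypothetical helper would pick the
 * alias like this:
 */
static inline void example_bridge_alias(struct pci_dev *bridge,
					u8 *bus, u8 *devfn)
{
	if (bridge->is_pcie) {
		/* PCIe-to-PCI bridge: devices show up on its secondary bus
		 * with devfn 0 */
		*bus = bridge->subordinate->number;
		*devfn = 0;
	} else {
		/* legacy PCI bridge: requests carry the bridge's own id */
		*bus = bridge->bus->number;
		*devfn = bridge->devfn;
	}
}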
1635
1636 static int domain_context_mapped(struct pci_dev *pdev)
1637 {
1638         int ret;
1639         struct pci_dev *tmp, *parent;
1640         struct intel_iommu *iommu;
1641
1642         iommu = device_to_iommu(pci_domain_nr(pdev->bus), pdev->bus->number,
1643                                 pdev->devfn);
1644         if (!iommu)
1645                 return -ENODEV;
1646
1647         ret = device_context_mapped(iommu, pdev->bus->number, pdev->devfn);
1648         if (!ret)
1649                 return ret;
1650         /* dependent device mapping */
1651         tmp = pci_find_upstream_pcie_bridge(pdev);
1652         if (!tmp)
1653                 return ret;
1654         /* Secondary interface's bus number and devfn 0 */
1655         parent = pdev->bus->self;
1656         while (parent != tmp) {
1657                 ret = device_context_mapped(iommu, parent->bus->number,
1658                                             parent->devfn);
1659                 if (!ret)
1660                         return ret;
1661                 parent = parent->bus->self;
1662         }
1663         if (tmp->is_pcie)
1664                 return device_context_mapped(iommu, tmp->subordinate->number,
1665                                              0);
1666         else
1667                 return device_context_mapped(iommu, tmp->bus->number,
1668                                              tmp->devfn);
1669 }
1670
1671 /* Returns the number of VT-d pages needed, but aligned to the MM page size */
1672 static inline unsigned long aligned_nrpages(unsigned long host_addr,
1673                                             size_t size)
1674 {
1675         host_addr &= ~PAGE_MASK;
1676         return PAGE_ALIGN(host_addr + size) >> VTD_PAGE_SHIFT;
1677 }
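
/*
 * Illustrative sketch, not part of the original driver: a worked example of
 * what aligned_nrpages() returns, assuming 4KiB MM pages and 4KiB VT-d pages
 * (PAGE_SHIFT == VTD_PAGE_SHIFT == 12).  The helper name and numbers are
 * hypothetical.
 */
static inline unsigned long example_aligned_nrpages(void)
{
	/* a 5000-byte buffer starting 100 bytes into a page covers bytes
	 * 100..5099 of the mapping, i.e. two 4KiB pages:
	 * PAGE_ALIGN(100 + 5000) >> VTD_PAGE_SHIFT == 8192 >> 12 == 2 */
	return aligned_nrpages(0x1064, 5000);
}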
1678
1679 static int __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
1680                             struct scatterlist *sg, unsigned long phys_pfn,
1681                             unsigned long nr_pages, int prot)
1682 {
1683         struct dma_pte *first_pte = NULL, *pte = NULL;
1684         phys_addr_t uninitialized_var(pteval);
1685         int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
1686         unsigned long sg_res;
1687
1688         BUG_ON(addr_width < BITS_PER_LONG && (iov_pfn + nr_pages - 1) >> addr_width);
1689
1690         if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
1691                 return -EINVAL;
1692
1693         prot &= DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP;
1694
1695         if (sg)
1696                 sg_res = 0;
1697         else {
1698                 sg_res = nr_pages + 1;
1699                 pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | prot;
1700         }
1701
1702         while (nr_pages--) {
1703                 uint64_t tmp;
1704
1705                 if (!sg_res) {
1706                         sg_res = aligned_nrpages(sg->offset, sg->length);
1707                         sg->dma_address = ((dma_addr_t)iov_pfn << VTD_PAGE_SHIFT) + sg->offset;
1708                         sg->dma_length = sg->length;
1709                         pteval = page_to_phys(sg_page(sg)) | prot;
1710                 }
1711                 if (!pte) {
1712                         first_pte = pte = pfn_to_dma_pte(domain, iov_pfn);
1713                         if (!pte)
1714                                 return -ENOMEM;
1715                 }
1716                 /* We don't need lock here, nobody else
1717                  * touches the iova range
1718                  */
1719                 tmp = cmpxchg64_local(&pte->val, 0ULL, pteval);
1720                 if (tmp) {
1721                         static int dumps = 5;
1722                         printk(KERN_CRIT "ERROR: DMA PTE for vPFN 0x%lx already set (to %llx not %llx)\n",
1723                                iov_pfn, tmp, (unsigned long long)pteval);
1724                         if (dumps) {
1725                                 dumps--;
1726                                 debug_dma_dump_mappings(NULL);
1727                         }
1728                         WARN_ON(1);
1729                 }
1730                 pte++;
1731                 if (!nr_pages || first_pte_in_page(pte)) {
1732                         domain_flush_cache(domain, first_pte,
1733                                            (void *)pte - (void *)first_pte);
1734                         pte = NULL;
1735                 }
1736                 iov_pfn++;
1737                 pteval += VTD_PAGE_SIZE;
1738                 sg_res--;
1739                 if (!sg_res)
1740                         sg = sg_next(sg);
1741         }
1742         return 0;
1743 }
1744
1745 static inline int domain_sg_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
1746                                     struct scatterlist *sg, unsigned long nr_pages,
1747                                     int prot)
1748 {
1749         return __domain_mapping(domain, iov_pfn, sg, 0, nr_pages, prot);
1750 }
1751
1752 static inline int domain_pfn_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
1753                                      unsigned long phys_pfn, unsigned long nr_pages,
1754                                      int prot)
1755 {
1756         return __domain_mapping(domain, iov_pfn, NULL, phys_pfn, nr_pages, prot);
1757 }
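
/*
 * Illustrative sketch, not part of the original driver: a minimal use of the
 * wrapper above, mapping a single read/write VT-d page at iov_pfn onto
 * phys_pfn.  The function name is hypothetical.
 */
static inline int example_map_one_page(struct dmar_domain *domain,
				       unsigned long iov_pfn,
				       unsigned long phys_pfn)
{
	return domain_pfn_mapping(domain, iov_pfn, phys_pfn, 1,
				  DMA_PTE_READ | DMA_PTE_WRITE);
}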
1758
1759 static void iommu_detach_dev(struct intel_iommu *iommu, u8 bus, u8 devfn)
1760 {
1761         if (!iommu)
1762                 return;
1763
1764         clear_context_table(iommu, bus, devfn);
1765         iommu->flush.flush_context(iommu, 0, 0, 0,
1766                                            DMA_CCMD_GLOBAL_INVL);
1767         iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
1768 }
1769
1770 static void domain_remove_dev_info(struct dmar_domain *domain)
1771 {
1772         struct device_domain_info *info;
1773         unsigned long flags;
1774         struct intel_iommu *iommu;
1775
1776         spin_lock_irqsave(&device_domain_lock, flags);
1777         while (!list_empty(&domain->devices)) {
1778                 info = list_entry(domain->devices.next,
1779                         struct device_domain_info, link);
1780                 list_del(&info->link);
1781                 list_del(&info->global);
1782                 if (info->dev)
1783                         info->dev->dev.archdata.iommu = NULL;
1784                 spin_unlock_irqrestore(&device_domain_lock, flags);
1785
1786                 iommu_disable_dev_iotlb(info);
1787                 iommu = device_to_iommu(info->segment, info->bus, info->devfn);
1788                 iommu_detach_dev(iommu, info->bus, info->devfn);
1789                 free_devinfo_mem(info);
1790
1791                 spin_lock_irqsave(&device_domain_lock, flags);
1792         }
1793         spin_unlock_irqrestore(&device_domain_lock, flags);
1794 }
1795
1796 /*
1797  * find_domain
1798  * Note: we use struct pci_dev->dev.archdata.iommu to store the info
1799  */
1800 static struct dmar_domain *
1801 find_domain(struct pci_dev *pdev)
1802 {
1803         struct device_domain_info *info;
1804
1805         /* No lock here, assumes no domain exit in normal case */
1806         info = pdev->dev.archdata.iommu;
1807         if (info)
1808                 return info->domain;
1809         return NULL;
1810 }
1811
1812 /* domain is initialized */
1813 static struct dmar_domain *get_domain_for_dev(struct pci_dev *pdev, int gaw)
1814 {
1815         struct dmar_domain *domain, *found = NULL;
1816         struct intel_iommu *iommu;
1817         struct dmar_drhd_unit *drhd;
1818         struct device_domain_info *info, *tmp;
1819         struct pci_dev *dev_tmp;
1820         unsigned long flags;
1821         int bus = 0, devfn = 0;
1822         int segment;
1823         int ret;
1824
1825         domain = find_domain(pdev);
1826         if (domain)
1827                 return domain;
1828
1829         segment = pci_domain_nr(pdev->bus);
1830
1831         dev_tmp = pci_find_upstream_pcie_bridge(pdev);
1832         if (dev_tmp) {
1833                 if (dev_tmp->is_pcie) {
1834                         bus = dev_tmp->subordinate->number;
1835                         devfn = 0;
1836                 } else {
1837                         bus = dev_tmp->bus->number;
1838                         devfn = dev_tmp->devfn;
1839                 }
1840                 spin_lock_irqsave(&device_domain_lock, flags);
1841                 list_for_each_entry(info, &device_domain_list, global) {
1842                         if (info->segment == segment &&
1843                             info->bus == bus && info->devfn == devfn) {
1844                                 found = info->domain;
1845                                 break;
1846                         }
1847                 }
1848                 spin_unlock_irqrestore(&device_domain_lock, flags);
1849                 /* pcie-pci bridge already has a domain, use it */
1850                 if (found) {
1851                         domain = found;
1852                         goto found_domain;
1853                 }
1854         }
1855
1856         domain = alloc_domain();
1857         if (!domain)
1858                 goto error;
1859
1860         /* Allocate new domain for the device */
1861         drhd = dmar_find_matched_drhd_unit(pdev);
1862         if (!drhd) {
1863                 printk(KERN_ERR "IOMMU: can't find DMAR for device %s\n",
1864                         pci_name(pdev));
1865                 return NULL;
1866         }
1867         iommu = drhd->iommu;
1868
1869         ret = iommu_attach_domain(domain, iommu);
1870         if (ret) {
1871                 domain_exit(domain);
1872                 goto error;
1873         }
1874
1875         if (domain_init(domain, gaw)) {
1876                 domain_exit(domain);
1877                 goto error;
1878         }
1879
1880         /* register pcie-to-pci device */
1881         if (dev_tmp) {
1882                 info = alloc_devinfo_mem();
1883                 if (!info) {
1884                         domain_exit(domain);
1885                         goto error;
1886                 }
1887                 info->segment = segment;
1888                 info->bus = bus;
1889                 info->devfn = devfn;
1890                 info->dev = NULL;
1891                 info->domain = domain;
1892                 /* This domain is shared by devices under p2p bridge */
1893                 domain->flags |= DOMAIN_FLAG_P2P_MULTIPLE_DEVICES;
1894
1895                 /* pcie-to-pci bridge already has a domain, use it */
1896                 found = NULL;
1897                 spin_lock_irqsave(&device_domain_lock, flags);
1898                 list_for_each_entry(tmp, &device_domain_list, global) {
1899                         if (tmp->segment == segment &&
1900                             tmp->bus == bus && tmp->devfn == devfn) {
1901                                 found = tmp->domain;
1902                                 break;
1903                         }
1904                 }
1905                 if (found) {
1906                         free_devinfo_mem(info);
1907                         domain_exit(domain);
1908                         domain = found;
1909                 } else {
1910                         list_add(&info->link, &domain->devices);
1911                         list_add(&info->global, &device_domain_list);
1912                 }
1913                 spin_unlock_irqrestore(&device_domain_lock, flags);
1914         }
1915
1916 found_domain:
1917         info = alloc_devinfo_mem();
1918         if (!info)
1919                 goto error;
1920         info->segment = segment;
1921         info->bus = pdev->bus->number;
1922         info->devfn = pdev->devfn;
1923         info->dev = pdev;
1924         info->domain = domain;
1925         spin_lock_irqsave(&device_domain_lock, flags);
1926         /* somebody else was faster and may have set it already */
1927         found = find_domain(pdev);
1928         if (found != NULL) {
1929                 spin_unlock_irqrestore(&device_domain_lock, flags);
1930                 if (found != domain) {
1931                         domain_exit(domain);
1932                         domain = found;
1933                 }
1934                 free_devinfo_mem(info);
1935                 return domain;
1936         }
1937         list_add(&info->link, &domain->devices);
1938         list_add(&info->global, &device_domain_list);
1939         pdev->dev.archdata.iommu = info;
1940         spin_unlock_irqrestore(&device_domain_lock, flags);
1941         return domain;
1942 error:
1943         /* recheck it here, maybe others set it */
1944         return find_domain(pdev);
1945 }
1946
1947 static int iommu_identity_mapping;
1948 #define IDENTMAP_ALL            1
1949 #define IDENTMAP_GFX            2
1950 #define IDENTMAP_AZALIA         4
1951
1952 static int iommu_domain_identity_map(struct dmar_domain *domain,
1953                                      unsigned long long start,
1954                                      unsigned long long end)
1955 {
1956         unsigned long first_vpfn = start >> VTD_PAGE_SHIFT;
1957         unsigned long last_vpfn = end >> VTD_PAGE_SHIFT;
1958
1959         if (!reserve_iova(&domain->iovad, dma_to_mm_pfn(first_vpfn),
1960                           dma_to_mm_pfn(last_vpfn))) {
1961                 printk(KERN_ERR "IOMMU: reserve iova failed\n");
1962                 return -ENOMEM;
1963         }
1964
1965         pr_debug("Mapping reserved region %llx-%llx for domain %d\n",
1966                  start, end, domain->id);
1967         /*
1968          * RMRR range might have overlap with physical memory range,
1969          * clear it first
1970          */
1971         dma_pte_clear_range(domain, first_vpfn, last_vpfn);
1972
1973         return domain_pfn_mapping(domain, first_vpfn, first_vpfn,
1974                                   last_vpfn - first_vpfn + 1,
1975                                   DMA_PTE_READ|DMA_PTE_WRITE);
1976 }
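
/*
 * Illustrative sketch, not part of the original driver: the pfn arithmetic
 * used above, assuming 4KiB VT-d pages and hypothetical start/end values.
 */
static inline unsigned long example_identity_pfn_count(void)
{
	unsigned long long start = 0x100000;		/* 1MiB */
	unsigned long long end = 0x1ff000;		/* 2MiB - 4KiB */
	unsigned long first_vpfn = start >> VTD_PAGE_SHIFT;	/* 0x100 */
	unsigned long last_vpfn = end >> VTD_PAGE_SHIFT;	/* 0x1ff */

	/* 256 pages mapped with iova equal to the physical address */
	return last_vpfn - first_vpfn + 1;
}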
1977
1978 static int iommu_prepare_identity_map(struct pci_dev *pdev,
1979                                       unsigned long long start,
1980                                       unsigned long long end)
1981 {
1982         struct dmar_domain *domain;
1983         int ret;
1984
1985         domain = get_domain_for_dev(pdev, DEFAULT_DOMAIN_ADDRESS_WIDTH);
1986         if (!domain)
1987                 return -ENOMEM;
1988
1989         /* For _hardware_ passthrough, don't bother. But for software
1990            passthrough, we do it anyway -- it may indicate a memory
1991            range which is reserved in E820, and so didn't get set
1992            up in si_domain to start with */
1993         if (domain == si_domain && hw_pass_through) {
1994                 printk("Ignoring identity map for HW passthrough device %s [0x%Lx - 0x%Lx]\n",
1995                        pci_name(pdev), start, end);
1996                 return 0;
1997         }
1998
1999         printk(KERN_INFO
2000                "IOMMU: Setting identity map for device %s [0x%Lx - 0x%Lx]\n",
2001                pci_name(pdev), start, end);
2002
2003         if (end >> agaw_to_width(domain->agaw)) {
2004                 WARN(1, "Your BIOS is broken; RMRR exceeds permitted address width (%d bits)\n"
2005                      "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
2006                      agaw_to_width(domain->agaw),
2007                      dmi_get_system_info(DMI_BIOS_VENDOR),
2008                      dmi_get_system_info(DMI_BIOS_VERSION),
2009                      dmi_get_system_info(DMI_PRODUCT_VERSION));
2010                 ret = -EIO;
2011                 goto error;
2012         }
2013
2014         ret = iommu_domain_identity_map(domain, start, end);
2015         if (ret)
2016                 goto error;
2017
2018         /* context entry init */
2019         ret = domain_context_mapping(domain, pdev, CONTEXT_TT_MULTI_LEVEL);
2020         if (ret)
2021                 goto error;
2022
2023         return 0;
2024
2025  error:
2026         domain_exit(domain);
2027         return ret;
2028 }
2029
2030 static inline int iommu_prepare_rmrr_dev(struct dmar_rmrr_unit *rmrr,
2031         struct pci_dev *pdev)
2032 {
2033         if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
2034                 return 0;
2035         return iommu_prepare_identity_map(pdev, rmrr->base_address,
2036                 rmrr->end_address + 1);
2037 }
2038
2039 #ifdef CONFIG_DMAR_FLOPPY_WA
2040 static inline void iommu_prepare_isa(void)
2041 {
2042         struct pci_dev *pdev;
2043         int ret;
2044
2045         pdev = pci_get_class(PCI_CLASS_BRIDGE_ISA << 8, NULL);
2046         if (!pdev)
2047                 return;
2048
2049         printk(KERN_INFO "IOMMU: Prepare 0-16MiB unity mapping for LPC\n");
2050         ret = iommu_prepare_identity_map(pdev, 0, 16*1024*1024);
2051
2052         if (ret)
2053                 printk(KERN_ERR "IOMMU: Failed to create 0-16MiB identity map; "
2054                        "floppy might not work\n");
2055
2056 }
2057 #else
2058 static inline void iommu_prepare_isa(void)
2059 {
2060         return;
2061 }
2062 #endif /* !CONFIG_DMAR_FLOPPY_WA */
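
/*
 * Illustrative sketch, not part of the original driver: the floppy workaround
 * above asks for a 0-16MiB identity map for the LPC bridge; with 4KiB VT-d
 * pages that range is 16*1024*1024 >> 12 == 4096 pages.  The helper is
 * hypothetical.
 */
static inline unsigned long example_isa_unity_pages(void)
{
	return (16UL * 1024 * 1024) >> VTD_PAGE_SHIFT;	/* 4096 */
}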
2063
2064 static int md_domain_init(struct dmar_domain *domain, int guest_width);
2065
2066 static int __init si_domain_work_fn(unsigned long start_pfn,
2067                                     unsigned long end_pfn, void *datax)
2068 {
2069         int *ret = datax;
2070
2071         *ret = iommu_domain_identity_map(si_domain,
2072                                          (uint64_t)start_pfn << PAGE_SHIFT,
2073                                          (uint64_t)end_pfn << PAGE_SHIFT);
2074         return *ret;
2075
2076 }
2077
2078 static int __init si_domain_init(int hw)
2079 {
2080         struct dmar_drhd_unit *drhd;
2081         struct intel_iommu *iommu;
2082         int nid, ret = 0;
2083
2084         si_domain = alloc_domain();
2085         if (!si_domain)
2086                 return -EFAULT;
2087
2088         pr_debug("Identity mapping domain is domain %d\n", si_domain->id);
2089
2090         for_each_active_iommu(iommu, drhd) {
2091                 ret = iommu_attach_domain(si_domain, iommu);
2092                 if (ret) {
2093                         domain_exit(si_domain);
2094                         return -EFAULT;
2095                 }
2096         }
2097
2098         if (md_domain_init(si_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
2099                 domain_exit(si_domain);
2100                 return -EFAULT;
2101         }
2102
2103         si_domain->flags = DOMAIN_FLAG_STATIC_IDENTITY;
2104
2105         if (hw)
2106                 return 0;
2107
2108         for_each_online_node(nid) {
2109                 work_with_active_regions(nid, si_domain_work_fn, &ret);
2110                 if (ret)
2111                         return ret;
2112         }
2113
2114         return 0;
2115 }
2116
2117 static void domain_remove_one_dev_info(struct dmar_domain *domain,
2118                                           struct pci_dev *pdev);
2119 static int identity_mapping(struct pci_dev *pdev)
2120 {
2121         struct device_domain_info *info;
2122
2123         if (likely(!iommu_identity_mapping))
2124                 return 0;
2125
2126
2127         list_for_each_entry(info, &si_domain->devices, link)
2128                 if (info->dev == pdev)
2129                         return 1;
2130         return 0;
2131 }
2132
2133 static int domain_add_dev_info(struct dmar_domain *domain,
2134                                struct pci_dev *pdev,
2135                                int translation)
2136 {
2137         struct device_domain_info *info;
2138         unsigned long flags;
2139         int ret;
2140
2141         info = alloc_devinfo_mem();
2142         if (!info)
2143                 return -ENOMEM;
2144
2145         ret = domain_context_mapping(domain, pdev, translation);
2146         if (ret) {
2147                 free_devinfo_mem(info);
2148                 return ret;
2149         }
2150
2151         info->segment = pci_domain_nr(pdev->bus);
2152         info->bus = pdev->bus->number;
2153         info->devfn = pdev->devfn;
2154         info->dev = pdev;
2155         info->domain = domain;
2156
2157         spin_lock_irqsave(&device_domain_lock, flags);
2158         list_add(&info->link, &domain->devices);
2159         list_add(&info->global, &device_domain_list);
2160         pdev->dev.archdata.iommu = info;
2161         spin_unlock_irqrestore(&device_domain_lock, flags);
2162
2163         return 0;
2164 }
2165
2166 static int iommu_should_identity_map(struct pci_dev *pdev, int startup)
2167 {
2168         if ((iommu_identity_mapping & IDENTMAP_AZALIA) && IS_AZALIA(pdev))
2169                 return 1;
2170
2171         if ((iommu_identity_mapping & IDENTMAP_GFX) && IS_GFX_DEVICE(pdev))
2172                 return 1;
2173
2174         if (!(iommu_identity_mapping & IDENTMAP_ALL))
2175                 return 0;
2176
2177         /*
2178          * We want to start off with all devices in the 1:1 domain, and
2179          * take them out later if we find they can't access all of memory.
2180          *
2181          * However, we can't do this for PCI devices behind bridges,
2182          * because all PCI devices behind the same bridge will end up
2183          * with the same source-id on their transactions.
2184          *
2185          * Practically speaking, we can't change things around for these
2186          * devices at run-time, because we can't be sure there'll be no
2187          * DMA transactions in flight for any of their siblings.
2188          * 
2189          * So PCI devices (unless they're on the root bus) as well as
2190          * their parent PCI-PCI or PCIe-PCI bridges must be left _out_ of
2191          * the 1:1 domain, just in _case_ one of their siblings turns out
2192          * not to be able to map all of memory.
2193          */
2194         if (!pdev->is_pcie) {
2195                 if (!pci_is_root_bus(pdev->bus))
2196                         return 0;
2197                 if (pdev->class >> 8 == PCI_CLASS_BRIDGE_PCI)
2198                         return 0;
2199         } else if (pdev->pcie_type == PCI_EXP_TYPE_PCI_BRIDGE)
2200                 return 0;
2201
2202         /* 
2203          * At boot time, we don't yet know if devices will be 64-bit capable.
2204          * Assume that they will -- if they turn out not to be, then we can 
2205          * take them out of the 1:1 domain later.
2206          */
2207         if (!startup)
2208                 return pdev->dma_mask > DMA_BIT_MASK(32);
2209
2210         return 1;
2211 }
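
/*
 * Illustrative sketch, not part of the original driver: the run-time half of
 * the policy above.  After boot, a hypothetical device keeps its place in the
 * 1:1 domain only if its DMA mask shows it can address more than 32 bits.
 */
static inline int example_keeps_identity_map(struct pci_dev *pdev)
{
	return pdev->dma_mask > DMA_BIT_MASK(32);
}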
2212
2213 static int __init iommu_prepare_static_identity_mapping(int hw)
2214 {
2215         struct pci_dev *pdev = NULL;
2216         int ret;
2217
2218         ret = si_domain_init(hw);
2219         if (ret)
2220                 return -EFAULT;
2221
2222         for_each_pci_dev(pdev) {
2223                 if (iommu_should_identity_map(pdev, 1)) {
2224                         printk(KERN_INFO "IOMMU: %s identity mapping for device %s\n",
2225                                hw ? "hardware" : "software", pci_name(pdev));
2226
2227                         ret = domain_add_dev_info(si_domain, pdev,
2228                                                      hw ? CONTEXT_TT_PASS_THROUGH :
2229                                                      CONTEXT_TT_MULTI_LEVEL);
2230                         if (ret)
2231                                 return ret;
2232                 }
2233         }
2234
2235         return 0;
2236 }
2237
2238 int __init init_dmars(void)
2239 {
2240         struct dmar_drhd_unit *drhd;
2241         struct dmar_rmrr_unit *rmrr;
2242         struct pci_dev *pdev;
2243         struct intel_iommu *iommu;
2244         int i, ret;
2245
2246         /*
2247          * for each drhd
2248          *    allocate root
2249          *    initialize and program root entry to not present
2250          * endfor
2251          */
2252         for_each_drhd_unit(drhd) {
2253                 g_num_of_iommus++;
2254                 /*
2255                  * lock not needed as this is only incremented in the single
2256                  * threaded kernel __init code path; all other accesses are
2257                  * read-only
2258                  */
2259         }
2260
2261         g_iommus = kcalloc(g_num_of_iommus, sizeof(struct intel_iommu *),
2262                         GFP_KERNEL);
2263         if (!g_iommus) {
2264                 printk(KERN_ERR "Allocating global iommu array failed\n");
2265                 ret = -ENOMEM;
2266                 goto error;
2267         }
2268
2269         deferred_flush = kzalloc(g_num_of_iommus *
2270                 sizeof(struct deferred_flush_tables), GFP_KERNEL);
2271         if (!deferred_flush) {
2272                 ret = -ENOMEM;
2273                 goto error;
2274         }
2275
2276         for_each_drhd_unit(drhd) {
2277                 if (drhd->ignored)
2278                         continue;
2279
2280                 iommu = drhd->iommu;
2281                 g_iommus[iommu->seq_id] = iommu;
2282
2283                 ret = iommu_init_domains(iommu);
2284                 if (ret)
2285                         goto error;
2286
2287                 /*
2288                  * TBD:
2289                  * we could share the same root & context tables
2290                  * among all IOMMUs. Need to split it later.
2291                  */
2292                 ret = iommu_alloc_root_entry(iommu);
2293                 if (ret) {
2294                         printk(KERN_ERR "IOMMU: allocate root entry failed\n");
2295                         goto error;
2296                 }
2297                 if (!ecap_pass_through(iommu->ecap))
2298                         hw_pass_through = 0;
2299         }
2300
2301         /*
2302          * Start from a sane iommu hardware state.
2303          */
2304         for_each_drhd_unit(drhd) {
2305                 if (drhd->ignored)
2306                         continue;
2307
2308                 iommu = drhd->iommu;
2309
2310                 /*
2311                  * If the queued invalidation is already initialized by us
2312                  * (for example, while enabling interrupt-remapping) then
2313                  * we already have things rolling from a sane state.
2314                  */
2315                 if (iommu->qi)
2316                         continue;
2317
2318                 /*
2319                  * Clear any previous faults.
2320                  */
2321                 dmar_fault(-1, iommu);
2322                 /*
2323                  * Disable queued invalidation if supported and already enabled
2324                  * before OS handover.
2325                  */
2326                 dmar_disable_qi(iommu);
2327         }
2328
2329         for_each_drhd_unit(drhd) {
2330                 if (drhd->ignored)
2331                         continue;
2332
2333                 iommu = drhd->iommu;
2334
2335                 if (dmar_enable_qi(iommu)) {
2336                         /*
2337                          * Queued invalidation is not enabled; use register-based
2338                          * invalidation
2339                          */
2340                         iommu->flush.flush_context = __iommu_flush_context;
2341                         iommu->flush.flush_iotlb = __iommu_flush_iotlb;
2342                         printk(KERN_INFO "IOMMU 0x%Lx: using Register based "
2343                                "invalidation\n",
2344                                (unsigned long long)drhd->reg_base_addr);
2345                 } else {
2346                         iommu->flush.flush_context = qi_flush_context;
2347                         iommu->flush.flush_iotlb = qi_flush_iotlb;
2348                         printk(KERN_INFO "IOMMU 0x%Lx: using Queued "
2349                                "invalidation\n",
2350                                (unsigned long long)drhd->reg_base_addr);
2351                 }
2352         }
2353
2354         if (iommu_pass_through)
2355                 iommu_identity_mapping |= IDENTMAP_ALL;
2356
2357 #ifdef CONFIG_DMAR_BROKEN_GFX_WA
2358         iommu_identity_mapping |= IDENTMAP_GFX;
2359 #endif
2360
2361         check_tylersburg_isoch();
2362
2363         /*
2364          * If identity mapping is requested (for all devices, gfx and/or
2365          * azalia), set up the static identity domain and add the chosen
2366          * devices to it, using hardware pass-through when available.
2367          */
2368         if (iommu_identity_mapping) {
2369                 ret = iommu_prepare_static_identity_mapping(hw_pass_through);
2370                 if (ret) {
2371                         printk(KERN_CRIT "Failed to setup IOMMU pass-through\n");
2372                         goto error;
2373                 }
2374         }
2375         /*
2376          * For each rmrr
2377          *   for each dev attached to rmrr
2378          *   do
2379          *     locate drhd for dev, alloc domain for dev
2380          *     allocate free domain
2381          *     allocate page table entries for rmrr
2382          *     if context not allocated for bus
2383          *           allocate and init context
2384          *           set present in root table for this bus
2385          *     init context with domain, translation etc
2386          *    endfor
2387          * endfor
2388          */
2389         printk(KERN_INFO "IOMMU: Setting RMRR:\n");
2390         for_each_rmrr_units(rmrr) {
2391                 for (i = 0; i < rmrr->devices_cnt; i++) {
2392                         pdev = rmrr->devices[i];
2393                         /*
2394                          * some BIOSes list non-existent devices in the
2395                          * DMAR table.
2396                          */
2397                         if (!pdev)
2398                                 continue;
2399                         ret = iommu_prepare_rmrr_dev(rmrr, pdev);
2400                         if (ret)
2401                                 printk(KERN_ERR
2402                                        "IOMMU: mapping reserved region failed\n");
2403                 }
2404         }
2405
2406         iommu_prepare_isa();
2407
2408         /*
2409          * for each drhd
2410          *   enable fault log
2411          *   global invalidate context cache
2412          *   global invalidate iotlb
2413          *   enable translation
2414          */
2415         for_each_drhd_unit(drhd) {
2416                 if (drhd->ignored)
2417                         continue;
2418                 iommu = drhd->iommu;
2419
2420                 iommu_flush_write_buffer(iommu);
2421
2422                 ret = dmar_set_interrupt(iommu);
2423                 if (ret)
2424                         goto error;
2425
2426                 iommu_set_root_entry(iommu);
2427
2428                 iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
2429                 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
2430
2431                 ret = iommu_enable_translation(iommu);
2432                 if (ret)
2433                         goto error;
2434
2435                 iommu_disable_protect_mem_regions(iommu);
2436         }
2437
2438         return 0;
2439 error:
2440         for_each_drhd_unit(drhd) {
2441                 if (drhd->ignored)
2442                         continue;
2443                 iommu = drhd->iommu;
2444                 free_iommu(iommu);
2445         }
2446         kfree(g_iommus);
2447         return ret;
2448 }
2449
2450 /* This takes a number of _MM_ pages, not VTD pages */
2451 static struct iova *intel_alloc_iova(struct device *dev,
2452                                      struct dmar_domain *domain,
2453                                      unsigned long nrpages, uint64_t dma_mask)
2454 {
2455         struct pci_dev *pdev = to_pci_dev(dev);
2456         struct iova *iova = NULL;
2457
2458         /* Restrict dma_mask to the width that the iommu can handle */
2459         dma_mask = min_t(uint64_t, DOMAIN_MAX_ADDR(domain->gaw), dma_mask);
2460
2461         if (!dmar_forcedac && dma_mask > DMA_BIT_MASK(32)) {
2462                 /*
2463                  * First try to allocate an io virtual address in
2464                  * DMA_BIT_MASK(32) and if that fails then try allocating
2465                  * from higher range
2466                  */
2467                 iova = alloc_iova(&domain->iovad, nrpages,
2468                                   IOVA_PFN(DMA_BIT_MASK(32)), 1);
2469                 if (iova)
2470                         return iova;
2471         }
2472         iova = alloc_iova(&domain->iovad, nrpages, IOVA_PFN(dma_mask), 1);
2473         if (unlikely(!iova)) {
2474                 printk(KERN_ERR "Allocating %ld-page iova for %s failed\n",
2475                        nrpages, pci_name(pdev));
2476                 return NULL;
2477         }
2478
2479         return iova;
2480 }
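
/*
 * Illustrative sketch, not part of the original driver: with 4KiB pages the
 * "try below 4GiB first" allocation above is bounded by the pfn of the
 * highest 32-bit address.  The helper is hypothetical.
 */
static inline unsigned long example_dma32_pfn_limit(void)
{
	return IOVA_PFN(DMA_BIT_MASK(32));	/* 0xffffffff >> 12 == 0xfffff */
}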
2481
2482 static struct dmar_domain *__get_valid_domain_for_dev(struct pci_dev *pdev)
2483 {
2484         struct dmar_domain *domain;
2485         int ret;
2486
2487         domain = get_domain_for_dev(pdev,
2488                         DEFAULT_DOMAIN_ADDRESS_WIDTH);
2489         if (!domain) {
2490                 printk(KERN_ERR
2491                         "Allocating domain for %s failed\n", pci_name(pdev));
2492                 return NULL;
2493         }
2494
2495         /* make sure context mapping is ok */
2496         if (unlikely(!domain_context_mapped(pdev))) {
2497                 ret = domain_context_mapping(domain, pdev,
2498                                              CONTEXT_TT_MULTI_LEVEL);
2499                 if (ret) {
2500                         printk(KERN_ERR
2501                                 "Domain context map for %s failed\n",
2502                                 pci_name(pdev));
2503                         return NULL;
2504                 }
2505         }
2506
2507         return domain;
2508 }
2509
2510 static inline struct dmar_domain *get_valid_domain_for_dev(struct pci_dev *dev)
2511 {
2512         struct device_domain_info *info;
2513
2514         /* No lock here, assumes no domain exit in normal case */
2515         info = dev->dev.archdata.iommu;
2516         if (likely(info))
2517                 return info->domain;
2518
2519         return __get_valid_domain_for_dev(dev);
2520 }
2521
2522 static int iommu_dummy(struct pci_dev *pdev)
2523 {
2524         return pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO;
2525 }
2526
2527 /* Check if the pdev needs to go through non-identity map and unmap process. */
2528 static int iommu_no_mapping(struct device *dev)
2529 {
2530         struct pci_dev *pdev;
2531         int found;
2532
2533         if (unlikely(dev->bus != &pci_bus_type))
2534                 return 1;
2535
2536         pdev = to_pci_dev(dev);
2537         if (iommu_dummy(pdev))
2538                 return 1;
2539
2540         if (!iommu_identity_mapping)
2541                 return 0;
2542
2543         found = identity_mapping(pdev);
2544         if (found) {
2545                 if (iommu_should_identity_map(pdev, 0))
2546                         return 1;
2547                 else {
2548                         /*
2549                          * A 32-bit DMA device is removed from si_domain and
2550                          * falls back to non-identity mapping.
2551                          */
2552                         domain_remove_one_dev_info(si_domain, pdev);
2553                         printk(KERN_INFO "32bit %s uses non-identity mapping\n",
2554                                pci_name(pdev));
2555                         return 0;
2556                 }
2557         } else {
2558                 /*
2559                  * A 64-bit DMA device that was detached from a VM is put
2560                  * into si_domain for identity mapping.
2561                  */
2562                 if (iommu_should_identity_map(pdev, 0)) {
2563                         int ret;
2564                         ret = domain_add_dev_info(si_domain, pdev,
2565                                                   hw_pass_through ?
2566                                                   CONTEXT_TT_PASS_THROUGH :
2567                                                   CONTEXT_TT_MULTI_LEVEL);
2568                         if (!ret) {
2569                                 printk(KERN_INFO "64bit %s uses identity mapping\n",
2570                                        pci_name(pdev));
2571                                 return 1;
2572                         }
2573                 }
2574         }
2575
2576         return 0;
2577 }
2578
2579 static dma_addr_t __intel_map_single(struct device *hwdev, phys_addr_t paddr,
2580                                      size_t size, int dir, u64 dma_mask)
2581 {
2582         struct pci_dev *pdev = to_pci_dev(hwdev);
2583         struct dmar_domain *domain;
2584         phys_addr_t start_paddr;
2585         struct iova *iova;
2586         int prot = 0;
2587         int ret;
2588         struct intel_iommu *iommu;
2589         unsigned long paddr_pfn = paddr >> PAGE_SHIFT;
2590
2591         BUG_ON(dir == DMA_NONE);
2592
2593         if (iommu_no_mapping(hwdev))
2594                 return paddr;
2595
2596         domain = get_valid_domain_for_dev(pdev);
2597         if (!domain)
2598                 return 0;
2599
2600         iommu = domain_get_iommu(domain);
2601         size = aligned_nrpages(paddr, size);
2602
2603         iova = intel_alloc_iova(hwdev, domain, dma_to_mm_pfn(size),
2604                                 pdev->dma_mask);
2605         if (!iova)
2606                 goto error;
2607
2608         /*
2609          * Check if DMAR supports zero-length reads on write-only
2610          * mappings.
2611          */
2612         if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
2613                         !cap_zlr(iommu->cap))
2614                 prot |= DMA_PTE_READ;
2615         if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
2616                 prot |= DMA_PTE_WRITE;
2617         /*
2618          * The range paddr .. paddr + size may cover only part of a page, but
2619          * we must map whole pages.  Note: if two parts of one page are mapped
2620          * separately, we might end up with two guest addresses mapping to the
2621          * same host paddr; this is not a big problem.
2622          */
2623         ret = domain_pfn_mapping(domain, mm_to_dma_pfn(iova->pfn_lo),
2624                                  mm_to_dma_pfn(paddr_pfn), size, prot);
2625         if (ret)
2626                 goto error;
2627
2628         /* it's a non-present to present mapping. Only flush if caching mode */
2629         if (cap_caching_mode(iommu->cap))
2630                 iommu_flush_iotlb_psi(iommu, 0, mm_to_dma_pfn(iova->pfn_lo), size);
2631         else
2632                 iommu_flush_write_buffer(iommu);
2633
2634         start_paddr = (phys_addr_t)iova->pfn_lo << PAGE_SHIFT;
2635         start_paddr += paddr & ~PAGE_MASK;
2636         return start_paddr;
2637
2638 error:
2639         if (iova)
2640                 __free_iova(&domain->iovad, iova);
2641         printk(KERN_ERR "Device %s request: %zx@%llx dir %d --- failed\n",
2642                 pci_name(pdev), size, (unsigned long long)paddr, dir);
2643         return 0;
2644 }
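
/*
 * Illustrative sketch, not part of the original driver: how the bus address
 * returned above is composed, assuming 4KiB pages and hypothetical values.
 */
static inline dma_addr_t example_dma_addr(void)
{
	unsigned long pfn_lo = 0xffff0;		/* allocated iova pfn */
	phys_addr_t paddr = 0x12345678;		/* CPU physical address */

	/* page-aligned iova plus the offset of paddr within its page:
	 * (0xffff0 << 12) + 0x678 == 0xffff0678 */
	return ((dma_addr_t)pfn_lo << PAGE_SHIFT) + (paddr & ~PAGE_MASK);
}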
2645
2646 static dma_addr_t intel_map_page(struct device *dev, struct page *page,
2647                                  unsigned long offset, size_t size,
2648                                  enum dma_data_direction dir,
2649                                  struct dma_attrs *attrs)
2650 {
2651         return __intel_map_single(dev, page_to_phys(page) + offset, size,
2652                                   dir, to_pci_dev(dev)->dma_mask);
2653 }
2654
2655 static void flush_unmaps(void)
2656 {
2657         int i, j;
2658
2659         timer_on = 0;
2660
2661         /* just flush them all */
2662         for (i = 0; i < g_num_of_iommus; i++) {
2663                 struct intel_iommu *iommu = g_iommus[i];
2664                 if (!iommu)
2665                         continue;
2666
2667                 if (!deferred_flush[i].next)
2668                         continue;
2669
2670                 iommu->flush.flush_iotlb(iommu, 0, 0, 0,
2671                                          DMA_TLB_GLOBAL_FLUSH);
2672                 for (j = 0; j < deferred_flush[i].next; j++) {
2673                         unsigned long mask;
2674                         struct iova *iova = deferred_flush[i].iova[j];
2675
2676                         mask = ilog2(mm_to_dma_pfn(iova->pfn_hi - iova->pfn_lo + 1));
2677                         iommu_flush_dev_iotlb(deferred_flush[i].domain[j],
2678                                         (uint64_t)iova->pfn_lo << PAGE_SHIFT, mask);
2679                         __free_iova(&deferred_flush[i].domain[j]->iovad, iova);
2680                 }
2681                 deferred_flush[i].next = 0;
2682         }
2683
2684         list_size = 0;
2685 }
2686
2687 static void flush_unmaps_timeout(unsigned long data)
2688 {
2689         unsigned long flags;
2690
2691         spin_lock_irqsave(&async_umap_flush_lock, flags);
2692         flush_unmaps();
2693         spin_unlock_irqrestore(&async_umap_flush_lock, flags);
2694 }
2695
2696 static void add_unmap(struct dmar_domain *dom, struct iova *iova)
2697 {
2698         unsigned long flags;
2699         int next, iommu_id;
2700         struct intel_iommu *iommu;
2701
2702         spin_lock_irqsave(&async_umap_flush_lock, flags);
2703         if (list_size == HIGH_WATER_MARK)
2704                 flush_unmaps();
2705
2706         iommu = domain_get_iommu(dom);
2707         iommu_id = iommu->seq_id;
2708
2709         next = deferred_flush[iommu_id].next;
2710         deferred_flush[iommu_id].domain[next] = dom;
2711         deferred_flush[iommu_id].iova[next] = iova;
2712         deferred_flush[iommu_id].next++;
2713
2714         if (!timer_on) {
2715                 mod_timer(&unmap_timer, jiffies + msecs_to_jiffies(10));
2716                 timer_on = 1;
2717         }
2718         list_size++;
2719         spin_unlock_irqrestore(&async_umap_flush_lock, flags);
2720 }
2721
2722 static void intel_unmap_page(struct device *dev, dma_addr_t dev_addr,
2723                              size_t size, enum dma_data_direction dir,
2724                              struct dma_attrs *attrs)
2725 {
2726         struct pci_dev *pdev = to_pci_dev(dev);
2727         struct dmar_domain *domain;
2728         unsigned long start_pfn, last_pfn;
2729         struct iova *iova;
2730         struct intel_iommu *iommu;
2731
2732         if (iommu_no_mapping(dev))
2733                 return;
2734
2735         domain = find_domain(pdev);
2736         BUG_ON(!domain);
2737
2738         iommu = domain_get_iommu(domain);
2739
2740         iova = find_iova(&domain->iovad, IOVA_PFN(dev_addr));
2741         if (WARN_ONCE(!iova, "Driver unmaps unmatched page at PFN %llx\n",
2742                       (unsigned long long)dev_addr))
2743                 return;
2744
2745         start_pfn = mm_to_dma_pfn(iova->pfn_lo);
2746         last_pfn = mm_to_dma_pfn(iova->pfn_hi + 1) - 1;
2747
2748         pr_debug("Device %s unmapping: pfn %lx-%lx\n",
2749                  pci_name(pdev), start_pfn, last_pfn);
2750
2751         /*  clear the whole page */
2752         dma_pte_clear_range(domain, start_pfn, last_pfn);
2753
2754         /* free page tables */
2755         dma_pte_free_pagetable(domain, start_pfn, last_pfn);
2756
2757         if (intel_iommu_strict) {
2758                 iommu_flush_iotlb_psi(iommu, domain->id, start_pfn,
2759                                       last_pfn - start_pfn + 1);
2760                 /* free iova */
2761                 __free_iova(&domain->iovad, iova);
2762         } else {
2763                 add_unmap(domain, iova);
2764                 /*
2765                  * queue up the release of the unmap to save the roughly 1/6th
2766                  * of a cpu that the iotlb flush operation would use up...
2767                  */
2768         }
2769 }
2770
2771 static void *intel_alloc_coherent(struct device *hwdev, size_t size,
2772                                   dma_addr_t *dma_handle, gfp_t flags)
2773 {
2774         void *vaddr;
2775         int order;
2776
2777         size = PAGE_ALIGN(size);
2778         order = get_order(size);
2779
2780         if (!iommu_no_mapping(hwdev))
2781                 flags &= ~(GFP_DMA | GFP_DMA32);
2782         else if (hwdev->coherent_dma_mask < dma_get_required_mask(hwdev)) {
2783                 if (hwdev->coherent_dma_mask < DMA_BIT_MASK(32))
2784                         flags |= GFP_DMA;
2785                 else
2786                         flags |= GFP_DMA32;
2787         }
2788
2789         vaddr = (void *)__get_free_pages(flags, order);
2790         if (!vaddr)
2791                 return NULL;
2792         memset(vaddr, 0, size);
2793
2794         *dma_handle = __intel_map_single(hwdev, virt_to_bus(vaddr), size,
2795                                          DMA_BIDIRECTIONAL,
2796                                          hwdev->coherent_dma_mask);
2797         if (*dma_handle)
2798                 return vaddr;
2799         free_pages((unsigned long)vaddr, order);
2800         return NULL;
2801 }
2802
2803 static void intel_free_coherent(struct device *hwdev, size_t size, void *vaddr,
2804                                 dma_addr_t dma_handle)
2805 {
2806         int order;
2807
2808         size = PAGE_ALIGN(size);
2809         order = get_order(size);
2810
2811         intel_unmap_page(hwdev, dma_handle, size, DMA_BIDIRECTIONAL, NULL);
2812         free_pages((unsigned long)vaddr, order);
2813 }
2814
2815 static void intel_unmap_sg(struct device *hwdev, struct scatterlist *sglist,
2816                            int nelems, enum dma_data_direction dir,
2817                            struct dma_attrs *attrs)
2818 {
2819         struct pci_dev *pdev = to_pci_dev(hwdev);
2820         struct dmar_domain *domain;
2821         unsigned long start_pfn, last_pfn;
2822         struct iova *iova;
2823         struct intel_iommu *iommu;
2824
2825         if (iommu_no_mapping(hwdev))
2826                 return;
2827
2828         domain = find_domain(pdev);
2829         BUG_ON(!domain);
2830
2831         iommu = domain_get_iommu(domain);
2832
2833         iova = find_iova(&domain->iovad, IOVA_PFN(sglist[0].dma_address));
2834         if (WARN_ONCE(!iova, "Driver unmaps unmatched sglist at PFN %llx\n",
2835                       (unsigned long long)sglist[0].dma_address))
2836                 return;
2837
2838         start_pfn = mm_to_dma_pfn(iova->pfn_lo);
2839         last_pfn = mm_to_dma_pfn(iova->pfn_hi + 1) - 1;
2840
2841         /*  clear the whole page */
2842         dma_pte_clear_range(domain, start_pfn, last_pfn);
2843
2844         /* free page tables */
2845         dma_pte_free_pagetable(domain, start_pfn, last_pfn);
2846
2847         if (intel_iommu_strict) {
2848                 iommu_flush_iotlb_psi(iommu, domain->id, start_pfn,
2849                                       last_pfn - start_pfn + 1);
2850                 /* free iova */
2851                 __free_iova(&domain->iovad, iova);
2852         } else {
2853                 add_unmap(domain, iova);
2854                 /*
2855                  * queue up the release of the unmap to save the roughly 1/6th
2856                  * of a cpu that the iotlb flush operation would use up...
2857                  */
2858         }
2859 }
2860
2861 static int intel_nontranslate_map_sg(struct device *hwdev,
2862         struct scatterlist *sglist, int nelems, int dir)
2863 {
2864         int i;
2865         struct scatterlist *sg;
2866
2867         for_each_sg(sglist, sg, nelems, i) {
2868                 BUG_ON(!sg_page(sg));
2869                 sg->dma_address = page_to_phys(sg_page(sg)) + sg->offset;
2870                 sg->dma_length = sg->length;
2871         }
2872         return nelems;
2873 }
2874
2875 static int intel_map_sg(struct device *hwdev, struct scatterlist *sglist, int nelems,
2876                         enum dma_data_direction dir, struct dma_attrs *attrs)
2877 {
2878         int i;
2879         struct pci_dev *pdev = to_pci_dev(hwdev);
2880         struct dmar_domain *domain;
2881         size_t size = 0;
2882         int prot = 0;
2883         size_t offset_pfn = 0;
2884         struct iova *iova = NULL;
2885         int ret;
2886         struct scatterlist *sg;
2887         unsigned long start_vpfn;
2888         struct intel_iommu *iommu;
2889
2890         BUG_ON(dir == DMA_NONE);
2891         if (iommu_no_mapping(hwdev))
2892                 return intel_nontranslate_map_sg(hwdev, sglist, nelems, dir);
2893
2894         domain = get_valid_domain_for_dev(pdev);
2895         if (!domain)
2896                 return 0;
2897
2898         iommu = domain_get_iommu(domain);
2899
2900         for_each_sg(sglist, sg, nelems, i)
2901                 size += aligned_nrpages(sg->offset, sg->length);
2902
2903         iova = intel_alloc_iova(hwdev, domain, dma_to_mm_pfn(size),
2904                                 pdev->dma_mask);
2905         if (!iova) {
2906                 sglist->dma_length = 0;
2907                 return 0;
2908         }
2909
2910         /*
2911          * Check if DMAR supports zero-length reads on write-only
2912          * mappings.
2913          */
2914         if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
2915                         !cap_zlr(iommu->cap))
2916                 prot |= DMA_PTE_READ;
2917         if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
2918                 prot |= DMA_PTE_WRITE;
2919
2920         start_vpfn = mm_to_dma_pfn(iova->pfn_lo);
2921
2922         ret = domain_sg_mapping(domain, start_vpfn, sglist, size, prot);
2923         if (unlikely(ret)) {
2924                 /*  clear the page */
2925                 dma_pte_clear_range(domain, start_vpfn,
2926                                     start_vpfn + size - 1);
2927                 /* free page tables */
2928                 dma_pte_free_pagetable(domain, start_vpfn,
2929                                        start_vpfn + size - 1);
2930                 /* free iova */
2931                 __free_iova(&domain->iovad, iova);
2932                 return 0;
2933         }
2934
2935         /* it's a non-present to present mapping. Only flush if caching mode */
2936         if (cap_caching_mode(iommu->cap))
2937                 iommu_flush_iotlb_psi(iommu, 0, start_vpfn, offset_pfn);
2938         else
2939                 iommu_flush_write_buffer(iommu);
2940
2941         return nelems;
2942 }
2943
2944 static int intel_mapping_error(struct device *dev, dma_addr_t dma_addr)
2945 {
2946         return !dma_addr;
2947 }
2948
2949 struct dma_map_ops intel_dma_ops = {
2950         .alloc_coherent = intel_alloc_coherent,
2951         .free_coherent = intel_free_coherent,
2952         .map_sg = intel_map_sg,
2953         .unmap_sg = intel_unmap_sg,
2954         .map_page = intel_map_page,
2955         .unmap_page = intel_unmap_page,
2956         .mapping_error = intel_mapping_error,
2957 };
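
/*
 * Illustrative sketch, not part of the original driver: once intel_dma_ops is
 * installed as the platform's dma_ops, an ordinary streaming-DMA call from a
 * driver is routed into intel_map_page() above.  The device and buffer here
 * are hypothetical.
 */
static inline dma_addr_t example_stream_map(struct device *dev,
					    void *buf, size_t len)
{
	return dma_map_single(dev, buf, len, DMA_TO_DEVICE);
}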
2958
2959 static inline int iommu_domain_cache_init(void)
2960 {
2961         int ret = 0;
2962
2963         iommu_domain_cache = kmem_cache_create("iommu_domain",
2964                                          sizeof(struct dmar_domain),
2965                                          0,
2966                                          SLAB_HWCACHE_ALIGN,
2967
2968                                          NULL);
2969         if (!iommu_domain_cache) {
2970                 printk(KERN_ERR "Couldn't create iommu_domain cache\n");
2971                 ret = -ENOMEM;
2972         }
2973
2974         return ret;
2975 }
2976
2977 static inline int iommu_devinfo_cache_init(void)
2978 {
2979         int ret = 0;
2980
2981         iommu_devinfo_cache = kmem_cache_create("iommu_devinfo",
2982                                          sizeof(struct device_domain_info),
2983                                          0,
2984                                          SLAB_HWCACHE_ALIGN,
2985                                          NULL);
2986         if (!iommu_devinfo_cache) {
2987                 printk(KERN_ERR "Couldn't create devinfo cache\n");
2988                 ret = -ENOMEM;
2989         }
2990
2991         return ret;
2992 }
2993
2994 static inline int iommu_iova_cache_init(void)
2995 {
2996         int ret = 0;
2997
2998         iommu_iova_cache = kmem_cache_create("iommu_iova",
2999                                          sizeof(struct iova),
3000                                          0,
3001                                          SLAB_HWCACHE_ALIGN,
3002                                          NULL);
3003         if (!iommu_iova_cache) {
3004                 printk(KERN_ERR "Couldn't create iova cache\n");
3005                 ret = -ENOMEM;
3006         }
3007
3008         return ret;
3009 }
3010
3011 static int __init iommu_init_mempool(void)
3012 {
3013         int ret;
3014         ret = iommu_iova_cache_init();
3015         if (ret)
3016                 return ret;
3017
3018         ret = iommu_domain_cache_init();
3019         if (ret)
3020                 goto domain_error;
3021
3022         ret = iommu_devinfo_cache_init();
3023         if (!ret)
3024                 return ret;
3025
3026         kmem_cache_destroy(iommu_domain_cache);
3027 domain_error:
3028         kmem_cache_destroy(iommu_iova_cache);
3029
3030         return -ENOMEM;
3031 }
3032
3033 static void __init iommu_exit_mempool(void)
3034 {
3035         kmem_cache_destroy(iommu_devinfo_cache);
3036         kmem_cache_destroy(iommu_domain_cache);
3037         kmem_cache_destroy(iommu_iova_cache);
3038
3039 }
3040
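/*
 * Mark DMAR units that cover no PCI devices as ignored.  If gfx
 * mapping is disabled, also bypass units that serve only graphics
 * devices.
 */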
3041 static void __init init_no_remapping_devices(void)
3042 {
3043         struct dmar_drhd_unit *drhd;
3044
3045         for_each_drhd_unit(drhd) {
3046                 if (!drhd->include_all) {
3047                         int i;
3048                         for (i = 0; i < drhd->devices_cnt; i++)
3049                                 if (drhd->devices[i] != NULL)
3050                                         break;
3051                         /* ignore DMAR unit if no pci devices exist */
3052                         if (i == drhd->devices_cnt)
3053                                 drhd->ignored = 1;
3054                 }
3055         }
3056
3057         if (dmar_map_gfx)
3058                 return;
3059
3060         for_each_drhd_unit(drhd) {
3061                 int i;
3062                 if (drhd->ignored || drhd->include_all)
3063                         continue;
3064
3065                 for (i = 0; i < drhd->devices_cnt; i++)
3066                         if (drhd->devices[i] &&
3067                                 !IS_GFX_DEVICE(drhd->devices[i]))
3068                                 break;
3069
3070                 if (i < drhd->devices_cnt)
3071                         continue;
3072
3073                 /* bypass IOMMU if it is just for gfx devices */
3074                 drhd->ignored = 1;
3075                 for (i = 0; i < drhd->devices_cnt; i++) {
3076                         if (!drhd->devices[i])
3077                                 continue;
3078                         drhd->devices[i]->dev.archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
3079                 }
3080         }
3081 }
3082
3083 #ifdef CONFIG_SUSPEND
3084 static int init_iommu_hw(void)
3085 {
3086         struct dmar_drhd_unit *drhd;
3087         struct intel_iommu *iommu = NULL;
3088
3089         for_each_active_iommu(iommu, drhd)
3090                 if (iommu->qi)
3091                         dmar_reenable_qi(iommu);
3092
3093         for_each_active_iommu(iommu, drhd) {
3094                 iommu_flush_write_buffer(iommu);
3095
3096                 iommu_set_root_entry(iommu);
3097
3098                 iommu->flush.flush_context(iommu, 0, 0, 0,
3099                                            DMA_CCMD_GLOBAL_INVL);
3100                 iommu->flush.flush_iotlb(iommu, 0, 0, 0,
3101                                          DMA_TLB_GLOBAL_FLUSH);
3102                 iommu_enable_translation(iommu);
3103                 iommu_disable_protect_mem_regions(iommu);
3104         }
3105
3106         return 0;
3107 }
3108
3109 static void iommu_flush_all(void)
3110 {
3111         struct dmar_drhd_unit *drhd;
3112         struct intel_iommu *iommu;
3113
3114         for_each_active_iommu(iommu, drhd) {
3115                 iommu->flush.flush_context(iommu, 0, 0, 0,
3116                                            DMA_CCMD_GLOBAL_INVL);
3117                 iommu->flush.flush_iotlb(iommu, 0, 0, 0,
3118                                          DMA_TLB_GLOBAL_FLUSH);
3119         }
3120 }
3121
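/*
 * Disable translation and save the fault-event registers of every
 * active IOMMU so they can be restored on resume.
 */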
3122 static int iommu_suspend(struct sys_device *dev, pm_message_t state)
3123 {
3124         struct dmar_drhd_unit *drhd;
3125         struct intel_iommu *iommu = NULL;
3126         unsigned long flag;
3127
3128         for_each_active_iommu(iommu, drhd) {
3129                 iommu->iommu_state = kzalloc(sizeof(u32) * MAX_SR_DMAR_REGS,
3130                                                  GFP_ATOMIC);
3131                 if (!iommu->iommu_state)
3132                         goto nomem;
3133         }
3134
3135         iommu_flush_all();
3136
3137         for_each_active_iommu(iommu, drhd) {
3138                 iommu_disable_translation(iommu);
3139
3140                 spin_lock_irqsave(&iommu->register_lock, flag);
3141
3142                 iommu->iommu_state[SR_DMAR_FECTL_REG] =
3143                         readl(iommu->reg + DMAR_FECTL_REG);
3144                 iommu->iommu_state[SR_DMAR_FEDATA_REG] =
3145                         readl(iommu->reg + DMAR_FEDATA_REG);
3146                 iommu->iommu_state[SR_DMAR_FEADDR_REG] =
3147                         readl(iommu->reg + DMAR_FEADDR_REG);
3148                 iommu->iommu_state[SR_DMAR_FEUADDR_REG] =
3149                         readl(iommu->reg + DMAR_FEUADDR_REG);
3150
3151                 spin_unlock_irqrestore(&iommu->register_lock, flag);
3152         }
3153         return 0;
3154
3155 nomem:
3156         for_each_active_iommu(iommu, drhd)
3157                 kfree(iommu->iommu_state);
3158
3159         return -ENOMEM;
3160 }
3161
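/*
 * Re-enable the hardware via init_iommu_hw() and restore the
 * fault-event registers saved by iommu_suspend().
 */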
3162 static int iommu_resume(struct sys_device *dev)
3163 {
3164         struct dmar_drhd_unit *drhd;
3165         struct intel_iommu *iommu = NULL;
3166         unsigned long flag;
3167
3168         if (init_iommu_hw()) {
3169                 WARN(1, "IOMMU setup failed, DMAR can not resume!\n");
3170                 return -EIO;
3171         }
3172
3173         for_each_active_iommu(iommu, drhd) {
3174
3175                 spin_lock_irqsave(&iommu->register_lock, flag);
3176
3177                 writel(iommu->iommu_state[SR_DMAR_FECTL_REG],
3178                         iommu->reg + DMAR_FECTL_REG);
3179                 writel(iommu->iommu_state[SR_DMAR_FEDATA_REG],
3180                         iommu->reg + DMAR_FEDATA_REG);
3181                 writel(iommu->iommu_state[SR_DMAR_FEADDR_REG],
3182                         iommu->reg + DMAR_FEADDR_REG);
3183                 writel(iommu->iommu_state[SR_DMAR_FEUADDR_REG],
3184                         iommu->reg + DMAR_FEUADDR_REG);
3185
3186                 spin_unlock_irqrestore(&iommu->register_lock, flag);
3187         }
3188
3189         for_each_active_iommu(iommu, drhd)
3190                 kfree(iommu->iommu_state);
3191
3192         return 0;
3193 }
3194
3195 static struct sysdev_class iommu_sysclass = {
3196         .name           = "iommu",
3197         .resume         = iommu_resume,
3198         .suspend        = iommu_suspend,
3199 };
3200
3201 static struct sys_device device_iommu = {
3202         .cls    = &iommu_sysclass,
3203 };
3204
3205 static int __init init_iommu_sysfs(void)
3206 {
3207         int error;
3208
3209         error = sysdev_class_register(&iommu_sysclass);
3210         if (error)
3211                 return error;
3212
3213         error = sysdev_register(&device_iommu);
3214         if (error)
3215                 sysdev_class_unregister(&iommu_sysclass);
3216
3217         return error;
3218 }
3219
3220 #else
3221 static int __init init_iommu_sysfs(void)
3222 {
3223         return 0;
3224 }
3225 #endif  /* CONFIG_SUSPEND */
3226
3227 /*
3228  * Here we only respond to a device being unbound from its driver.
3229  *
3230  * A newly added device is not attached to its DMAR domain here yet; that
3231  * happens when the device is mapped to an iova.
3232  */
3233 static int device_notifier(struct notifier_block *nb,
3234                                   unsigned long action, void *data)
3235 {
3236         struct device *dev = data;
3237         struct pci_dev *pdev = to_pci_dev(dev);
3238         struct dmar_domain *domain;
3239
3240         domain = find_domain(pdev);
3241         if (!domain)
3242                 return 0;
3243
3244         if (action == BUS_NOTIFY_UNBOUND_DRIVER && !iommu_pass_through)
3245                 domain_remove_one_dev_info(domain, pdev);
3246
3247         return 0;
3248 }
3249
3250 static struct notifier_block device_nb = {
3251         .notifier_call = device_notifier,
3252 };
3253
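/*
 * Main VT-d initialization: parse the DMAR table, set up the DMA
 * remapping hardware, install intel_dma_ops and register the IOMMU API
 * operations and the PCI bus notifier.
 */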
3254 int __init intel_iommu_init(void)
3255 {
3256         int ret = 0;
3257         int force_on = 0;
3258
3259         /* VT-d is required for a TXT/tboot launch, so enforce that */
3260         force_on = tboot_force_iommu();
3261
3262         if (dmar_table_init()) {
3263                 if (force_on)
3264                         panic("tboot: Failed to initialize DMAR table\n");
3265                 return  -ENODEV;
3266         }
3267
3268         if (dmar_dev_scope_init()) {
3269                 if (force_on)
3270                         panic("tboot: Failed to initialize DMAR device scope\n");
3271                 return  -ENODEV;
3272         }
3273
3274         /*
3275          * Check the need for DMA-remapping initialization now.
3276          * Above initialization will also be used by Interrupt-remapping.
3277          */
3278         if (no_iommu || dmar_disabled)
3279                 return -ENODEV;
3280
3281         iommu_init_mempool();
3282         dmar_init_reserved_ranges();
3283
3284         init_no_remapping_devices();
3285
3286         ret = init_dmars();
3287         if (ret) {
3288                 if (force_on)
3289                         panic("tboot: Failed to initialize DMARs\n");
3290                 printk(KERN_ERR "IOMMU: dmar init failed\n");
3291                 put_iova_domain(&reserved_iova_list);
3292                 iommu_exit_mempool();
3293                 return ret;
3294         }
3295         printk(KERN_INFO
3296         "PCI-DMA: Intel(R) Virtualization Technology for Directed I/O\n");
3297
3298         init_timer(&unmap_timer);
3299 #ifdef CONFIG_SWIOTLB
3300         swiotlb = 0;
3301 #endif
3302         dma_ops = &intel_dma_ops;
3303
3304         init_iommu_sysfs();
3305
3306         register_iommu(&intel_iommu_ops);
3307
3308         bus_register_notifier(&pci_bus_type, &device_nb);
3309
3310         return 0;
3311 }
3312
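/*
 * When the device sits behind a PCIe-to-PCI bridge, context entries
 * were also set up for the bridges upstream of it; tear those down as
 * well.
 */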
3313 static void iommu_detach_dependent_devices(struct intel_iommu *iommu,
3314                                            struct pci_dev *pdev)
3315 {
3316         struct pci_dev *tmp, *parent;
3317
3318         if (!iommu || !pdev)
3319                 return;
3320
3321         /* dependent device detach */
3322         tmp = pci_find_upstream_pcie_bridge(pdev);
3323         /* Secondary interface's bus number and devfn 0 */
3324         if (tmp) {
3325                 parent = pdev->bus->self;
3326                 while (parent != tmp) {
3327                         iommu_detach_dev(iommu, parent->bus->number,
3328                                          parent->devfn);
3329                         parent = parent->bus->self;
3330                 }
3331                 if (tmp->is_pcie) /* this is a PCIE-to-PCI bridge */
3332                         iommu_detach_dev(iommu,
3333                                 tmp->subordinate->number, 0);
3334                 else /* this is a legacy PCI bridge */
3335                         iommu_detach_dev(iommu, tmp->bus->number,
3336                                          tmp->devfn);
3337         }
3338 }
3339
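/*
 * Remove one device from a domain: tear down its context entry and
 * device IOTLB, and drop the IOMMU from the domain's bitmap when no
 * other device behind that IOMMU remains in the domain.
 */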
3340 static void domain_remove_one_dev_info(struct dmar_domain *domain,
3341                                           struct pci_dev *pdev)
3342 {
3343         struct device_domain_info *info;
3344         struct intel_iommu *iommu;
3345         unsigned long flags;
3346         int found = 0;
3347         struct list_head *entry, *tmp;
3348
3349         iommu = device_to_iommu(pci_domain_nr(pdev->bus), pdev->bus->number,
3350                                 pdev->devfn);
3351         if (!iommu)
3352                 return;
3353
3354         spin_lock_irqsave(&device_domain_lock, flags);
3355         list_for_each_safe(entry, tmp, &domain->devices) {
3356                 info = list_entry(entry, struct device_domain_info, link);
3357                 /* No need to compare PCI domain; it has to be the same */
3358                 if (info->bus == pdev->bus->number &&
3359                     info->devfn == pdev->devfn) {
3360                         list_del(&info->link);
3361                         list_del(&info->global);
3362                         if (info->dev)
3363                                 info->dev->dev.archdata.iommu = NULL;
3364                         spin_unlock_irqrestore(&device_domain_lock, flags);
3365
3366                         iommu_disable_dev_iotlb(info);
3367                         iommu_detach_dev(iommu, info->bus, info->devfn);
3368                         iommu_detach_dependent_devices(iommu, pdev);
3369                         free_devinfo_mem(info);
3370
3371                         spin_lock_irqsave(&device_domain_lock, flags);
3372
3373                         if (found)
3374                                 break;
3375                         else
3376                                 continue;
3377                 }
3378
3379                 /* If there are no other devices under the same iommu
3380                  * owned by this domain, clear this iommu in iommu_bmp,
3381                  * and update the iommu count and coherency.
3382                  */
3383                 if (iommu == device_to_iommu(info->segment, info->bus,
3384                                             info->devfn))
3385                         found = 1;
3386         }
3387
3388         if (found == 0) {
3389                 unsigned long tmp_flags;
3390                 spin_lock_irqsave(&domain->iommu_lock, tmp_flags);
3391                 clear_bit(iommu->seq_id, &domain->iommu_bmp);
3392                 domain->iommu_count--;
3393                 domain_update_iommu_cap(domain);
3394                 spin_unlock_irqrestore(&domain->iommu_lock, tmp_flags);
3395         }
3396
3397         spin_unlock_irqrestore(&device_domain_lock, flags);
3398 }
3399
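/*
 * Detach every device from the domain, updating the domain's IOMMU
 * bitmap and capabilities as devices are removed.
 */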
3400 static void vm_domain_remove_all_dev_info(struct dmar_domain *domain)
3401 {
3402         struct device_domain_info *info;
3403         struct intel_iommu *iommu;
3404         unsigned long flags1, flags2;
3405
3406         spin_lock_irqsave(&device_domain_lock, flags1);
3407         while (!list_empty(&domain->devices)) {
3408                 info = list_entry(domain->devices.next,
3409                         struct device_domain_info, link);
3410                 list_del(&info->link);
3411                 list_del(&info->global);
3412                 if (info->dev)
3413                         info->dev->dev.archdata.iommu = NULL;
3414
3415                 spin_unlock_irqrestore(&device_domain_lock, flags1);
3416
3417                 iommu_disable_dev_iotlb(info);
3418                 iommu = device_to_iommu(info->segment, info->bus, info->devfn);
3419                 iommu_detach_dev(iommu, info->bus, info->devfn);
3420                 iommu_detach_dependent_devices(iommu, info->dev);
3421
3422                 /* clear this iommu in iommu_bmp, update iommu count
3423                  * and capabilities
3424                  */
3425                 spin_lock_irqsave(&domain->iommu_lock, flags2);
3426                 if (test_and_clear_bit(iommu->seq_id,
3427                                        &domain->iommu_bmp)) {
3428                         domain->iommu_count--;
3429                         domain_update_iommu_cap(domain);
3430                 }
3431                 spin_unlock_irqrestore(&domain->iommu_lock, flags2);
3432
3433                 free_devinfo_mem(info);
3434                 spin_lock_irqsave(&device_domain_lock, flags1);
3435         }
3436         spin_unlock_irqrestore(&device_domain_lock, flags1);
3437 }
3438
3439 /* Domain id allocator for virtual-machine domains; these ids are never set in context entries */
3440 static unsigned long vm_domid;
3441
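/*
 * Return the smallest adjusted guest address width supported by the
 * IOMMUs this domain is attached to.
 */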
3442 static int vm_domain_min_agaw(struct dmar_domain *domain)
3443 {
3444         int i;
3445         int min_agaw = domain->agaw;
3446
3447         i = find_first_bit(&domain->iommu_bmp, g_num_of_iommus);
3448         for (; i < g_num_of_iommus; ) {
3449                 if (min_agaw > g_iommus[i]->agaw)
3450                         min_agaw = g_iommus[i]->agaw;
3451
3452                 i = find_next_bit(&domain->iommu_bmp, g_num_of_iommus, i+1);
3453         }
3454
3455         return min_agaw;
3456 }
3457
3458 static struct dmar_domain *iommu_alloc_vm_domain(void)
3459 {
3460         struct dmar_domain *domain;
3461
3462         domain = alloc_domain_mem();
3463         if (!domain)
3464                 return NULL;
3465
3466         domain->id = vm_domid++;
3467         domain->nid = -1;
3468         memset(&domain->iommu_bmp, 0, sizeof(unsigned long));
3469         domain->flags = DOMAIN_FLAG_VIRTUAL_MACHINE;
3470
3471         return domain;
3472 }
3473
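/*
 * Initialize a domain created through the IOMMU API: iova allocator,
 * reserved ranges, address widths and the top-level page table.
 */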
3474 static int md_domain_init(struct dmar_domain *domain, int guest_width)
3475 {
3476         int adjust_width;
3477
3478         init_iova_domain(&domain->iovad, DMA_32BIT_PFN);
3479         spin_lock_init(&domain->iommu_lock);
3480
3481         domain_reserve_special_ranges(domain);
3482
3483         /* calculate AGAW */
3484         domain->gaw = guest_width;
3485         adjust_width = guestwidth_to_adjustwidth(guest_width);
3486         domain->agaw = width_to_agaw(adjust_width);
3487
3488         INIT_LIST_HEAD(&domain->devices);
3489
3490         domain->iommu_count = 0;
3491         domain->iommu_coherency = 0;
3492         domain->iommu_snooping = 0;
3493         domain->max_addr = 0;
3494         domain->nid = -1;
3495
3496         /* always allocate the top pgd */
3497         domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
3498         if (!domain->pgd)
3499                 return -ENOMEM;
3500         domain_flush_cache(domain, domain->pgd, PAGE_SIZE);
3501         return 0;
3502 }
3503
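/*
 * Release the domain number this VM domain occupies on each hardware
 * IOMMU it was registered with.
 */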
3504 static void iommu_free_vm_domain(struct dmar_domain *domain)
3505 {
3506         unsigned long flags;
3507         struct dmar_drhd_unit *drhd;
3508         struct intel_iommu *iommu;
3509         unsigned long i;
3510         unsigned long ndomains;
3511
3512         for_each_drhd_unit(drhd) {
3513                 if (drhd->ignored)
3514                         continue;
3515                 iommu = drhd->iommu;
3516
3517                 ndomains = cap_ndoms(iommu->cap);
3518                 i = find_first_bit(iommu->domain_ids, ndomains);
3519                 for (; i < ndomains; ) {
3520                         if (iommu->domains[i] == domain) {
3521                                 spin_lock_irqsave(&iommu->lock, flags);
3522                                 clear_bit(i, iommu->domain_ids);
3523                                 iommu->domains[i] = NULL;
3524                                 spin_unlock_irqrestore(&iommu->lock, flags);
3525                                 break;
3526                         }
3527                         i = find_next_bit(iommu->domain_ids, ndomains, i+1);
3528                 }
3529         }
3530 }
3531
3532 static void vm_domain_exit(struct dmar_domain *domain)
3533 {
3534         /* Domain 0 is reserved, so don't process it */
3535         if (!domain)
3536                 return;
3537
3538         vm_domain_remove_all_dev_info(domain);
3539         /* destroy iovas */
3540         put_iova_domain(&domain->iovad);
3541
3542         /* clear ptes */
3543         dma_pte_clear_range(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
3544
3545         /* free page tables */
3546         dma_pte_free_pagetable(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
3547
3548         iommu_free_vm_domain(domain);
3549         free_domain_mem(domain);
3550 }
3551
3552 static int intel_iommu_domain_init(struct iommu_domain *domain)
3553 {
3554         struct dmar_domain *dmar_domain;
3555
3556         dmar_domain = iommu_alloc_vm_domain();
3557         if (!dmar_domain) {
3558                 printk(KERN_ERR
3559                         "intel_iommu_domain_init: dmar_domain == NULL\n");
3560                 return -ENOMEM;
3561         }
3562         if (md_domain_init(dmar_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
3563                 printk(KERN_ERR
3564                         "intel_iommu_domain_init() failed\n");
3565                 vm_domain_exit(dmar_domain);
3566                 return -ENOMEM;
3567         }
3568         domain->priv = dmar_domain;
3569
3570         return 0;
3571 }
3572
3573 static void intel_iommu_domain_destroy(struct iommu_domain *domain)
3574 {
3575         struct dmar_domain *dmar_domain = domain->priv;
3576
3577         domain->priv = NULL;
3578         vm_domain_exit(dmar_domain);
3579 }
3580
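/*
 * IOMMU API attach callback: detach the device from any previous
 * domain, then check that this IOMMU's address width covers the
 * domain's highest mapped address before adding the device.
 */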
3581 static int intel_iommu_attach_device(struct iommu_domain *domain,
3582                                      struct device *dev)
3583 {
3584         struct dmar_domain *dmar_domain = domain->priv;
3585         struct pci_dev *pdev = to_pci_dev(dev);
3586         struct intel_iommu *iommu;
3587         int addr_width;
3588         u64 end;
3589
3590         /* normally pdev is not mapped */
3591         if (unlikely(domain_context_mapped(pdev))) {
3592                 struct dmar_domain *old_domain;
3593
3594                 old_domain = find_domain(pdev);
3595                 if (old_domain) {
3596                         if (dmar_domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE ||
3597                             dmar_domain->flags & DOMAIN_FLAG_STATIC_IDENTITY)
3598                                 domain_remove_one_dev_info(old_domain, pdev);
3599                         else
3600                                 domain_remove_dev_info(old_domain);
3601                 }
3602         }
3603
3604         iommu = device_to_iommu(pci_domain_nr(pdev->bus), pdev->bus->number,
3605                                 pdev->devfn);
3606         if (!iommu)
3607                 return -ENODEV;
3608
3609         /* check if this iommu agaw is sufficient for max mapped address */
3610         addr_width = agaw_to_width(iommu->agaw);
3611         end = DOMAIN_MAX_ADDR(addr_width);
3612         end = end & VTD_PAGE_MASK;
3613         if (end < dmar_domain->max_addr) {
3614                 printk(KERN_ERR "%s: iommu agaw (%d) is not "
3615                        "sufficient for the mapped address (%llx)\n",
3616                        __func__, iommu->agaw, dmar_domain->max_addr);
3617                 return -EFAULT;
3618         }
3619
3620         return domain_add_dev_info(dmar_domain, pdev, CONTEXT_TT_MULTI_LEVEL);
3621 }
3622
3623 static void intel_iommu_detach_device(struct iommu_domain *domain,
3624                                       struct device *dev)
3625 {
3626         struct dmar_domain *dmar_domain = domain->priv;
3627         struct pci_dev *pdev = to_pci_dev(dev);
3628
3629         domain_remove_one_dev_info(dmar_domain, pdev);
3630 }
3631
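/*
 * IOMMU API map callback: translate IOMMU_* protection flags into PTE
 * bits, check that the mapping fits within the weakest attached
 * IOMMU's address width, and install the page table entries.
 */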
3632 static int intel_iommu_map_range(struct iommu_domain *domain,
3633                                  unsigned long iova, phys_addr_t hpa,
3634                                  size_t size, int iommu_prot)
3635 {
3636         struct dmar_domain *dmar_domain = domain->priv;
3637         u64 max_addr;
3638         int addr_width;
3639         int prot = 0;
3640         int ret;
3641
3642         if (iommu_prot & IOMMU_READ)
3643                 prot |= DMA_PTE_READ;
3644         if (iommu_prot & IOMMU_WRITE)
3645                 prot |= DMA_PTE_WRITE;
3646         if ((iommu_prot & IOMMU_CACHE) && dmar_domain->iommu_snooping)
3647                 prot |= DMA_PTE_SNP;
3648
3649         max_addr = iova + size;
3650         if (dmar_domain->max_addr < max_addr) {
3651                 int min_agaw;
3652                 u64 end;
3653
3654                 /* check if minimum agaw is sufficient for mapped address */
3655                 min_agaw = vm_domain_min_agaw(dmar_domain);
3656                 addr_width = agaw_to_width(min_agaw);
3657                 end = DOMAIN_MAX_ADDR(addr_width);
3658                 end = end & VTD_PAGE_MASK;
3659                 if (end < max_addr) {
3660                         printk(KERN_ERR "%s: iommu agaw (%d) is not "
3661                                "sufficient for the mapped address (%llx)\n",
3662                                __func__, min_agaw, max_addr);
3663                         return -EFAULT;
3664                 }
3665                 dmar_domain->max_addr = max_addr;
3666         }
3667         /* Round up size to next multiple of PAGE_SIZE, if it and
3668            the low bits of hpa would take us onto the next page */
3669         size = aligned_nrpages(hpa, size);
3670         ret = domain_pfn_mapping(dmar_domain, iova >> VTD_PAGE_SHIFT,
3671                                  hpa >> VTD_PAGE_SHIFT, size, prot);
3672         return ret;
3673 }
3674
3675 static void intel_iommu_unmap_range(struct iommu_domain *domain,
3676                                     unsigned long iova, size_t size)
3677 {
3678         struct dmar_domain *dmar_domain = domain->priv;
3679
3680         if (!size)
3681                 return;
3682
3683         dma_pte_clear_range(dmar_domain, iova >> VTD_PAGE_SHIFT,
3684                             (iova + size - 1) >> VTD_PAGE_SHIFT);
3685
3686         if (dmar_domain->max_addr == iova + size)
3687                 dmar_domain->max_addr = iova;
3688 }
3689
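/*
 * IOMMU API callback: return the physical address recorded in the page
 * table for @iova, or 0 if nothing is mapped there.
 */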
3690 static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
3691                                             unsigned long iova)
3692 {
3693         struct dmar_domain *dmar_domain = domain->priv;
3694         struct dma_pte *pte;
3695         u64 phys = 0;
3696
3697         pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT);
3698         if (pte)
3699                 phys = dma_pte_addr(pte);
3700
3701         return phys;
3702 }
3703
3704 static int intel_iommu_domain_has_cap(struct iommu_domain *domain,
3705                                       unsigned long cap)
3706 {
3707         struct dmar_domain *dmar_domain = domain->priv;
3708
3709         if (cap == IOMMU_CAP_CACHE_COHERENCY)
3710                 return dmar_domain->iommu_snooping;
3711
3712         return 0;
3713 }
3714
3715 static struct iommu_ops intel_iommu_ops = {
3716         .domain_init    = intel_iommu_domain_init,
3717         .domain_destroy = intel_iommu_domain_destroy,
3718         .attach_dev     = intel_iommu_attach_device,
3719         .detach_dev     = intel_iommu_detach_device,
3720         .map            = intel_iommu_map_range,
3721         .unmap          = intel_iommu_unmap_range,
3722         .iova_to_phys   = intel_iommu_iova_to_phys,
3723         .domain_has_cap = intel_iommu_domain_has_cap,
3724 };
3725
3726 static void __devinit quirk_iommu_rwbf(struct pci_dev *dev)
3727 {
3728         /*
3729          * Mobile 4 Series Chipset neglects to set RWBF capability,
3730          * but needs it:
3731          */
3732         printk(KERN_INFO "DMAR: Forcing write-buffer flush capability\n");
3733         rwbf_quirk = 1;
3734 }
3735
3736 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_rwbf);
3737
3738 /* On Tylersburg chipsets, some BIOSes have been known to enable the
3739    ISOCH DMAR unit for the Azalia sound device, but not give it any
3740    TLB entries, which causes it to deadlock. Check for that.  We do
3741    this in a function called from init_dmars(), instead of in a PCI
3742    quirk, because we don't want to print the obnoxious "BIOS broken"
3743    message if VT-d is actually disabled.
3744 */
3745 static void __init check_tylersburg_isoch(void)
3746 {
3747         struct pci_dev *pdev;
3748         uint32_t vtisochctrl;
3749
3750         /* If there's no Azalia in the system anyway, forget it. */
3751         pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x3a3e, NULL);
3752         if (!pdev)
3753                 return;
3754         pci_dev_put(pdev);
3755
3756         /* System Management Registers. Might be hidden, in which case
3757            we can't do the sanity check. But that's OK, because the
3758            known-broken BIOSes _don't_ actually hide it, so far. */
3759         pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x342e, NULL);
3760         if (!pdev)
3761                 return;
3762
3763         if (pci_read_config_dword(pdev, 0x188, &vtisochctrl)) {
3764                 pci_dev_put(pdev);
3765                 return;
3766         }
3767
3768         pci_dev_put(pdev);
3769
3770         /* If Azalia DMA is routed to the non-isoch DMAR unit, fine. */
3771         if (vtisochctrl & 1)
3772                 return;
3773
3774         /* Drop all bits other than the number of TLB entries */
3775         vtisochctrl &= 0x1c;
3776
3777         /* If we have the recommended number of TLB entries (16), fine. */
3778         if (vtisochctrl == 0x10)
3779                 return;
3780
3781         /* Zero TLB entries? You get to ride the short bus to school. */
3782         if (!vtisochctrl) {
3783                 WARN(1, "Your BIOS is broken; DMA routed to ISOCH DMAR unit but no TLB space.\n"
3784                      "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
3785                      dmi_get_system_info(DMI_BIOS_VENDOR),
3786                      dmi_get_system_info(DMI_BIOS_VERSION),
3787                      dmi_get_system_info(DMI_PRODUCT_VERSION));
3788                 iommu_identity_mapping |= IDENTMAP_AZALIA;
3789                 return;
3790         }
3791
3792         printk(KERN_WARNING "DMAR: Recommended TLB entries for ISOCH unit is 16; your BIOS set %d\n",
3793                vtisochctrl);
3794 }