PCI: iommu: iotlb flushing
[net-next-2.6.git] / drivers / pci / intel-iommu.c
1 /*
2  * Copyright (c) 2006, Intel Corporation.
3  *
4  * This program is free software; you can redistribute it and/or modify it
5  * under the terms and conditions of the GNU General Public License,
6  * version 2, as published by the Free Software Foundation.
7  *
8  * This program is distributed in the hope it will be useful, but WITHOUT
9  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
10  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
11  * more details.
12  *
13  * You should have received a copy of the GNU General Public License along with
14  * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
15  * Place - Suite 330, Boston, MA 02111-1307 USA.
16  *
17  * Copyright (C) 2006-2008 Intel Corporation
18  * Author: Ashok Raj <ashok.raj@intel.com>
19  * Author: Shaohua Li <shaohua.li@intel.com>
20  * Author: Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>
21  */
22
23 #include <linux/init.h>
24 #include <linux/bitmap.h>
25 #include <linux/debugfs.h>
26 #include <linux/slab.h>
27 #include <linux/irq.h>
28 #include <linux/interrupt.h>
29 #include <linux/sysdev.h>
30 #include <linux/spinlock.h>
31 #include <linux/pci.h>
32 #include <linux/dmar.h>
33 #include <linux/dma-mapping.h>
34 #include <linux/mempool.h>
35 #include <linux/timer.h>
36 #include "iova.h"
37 #include "intel-iommu.h"
38 #include <asm/proto.h> /* force_iommu in this header in x86-64*/
39 #include <asm/cacheflush.h>
40 #include <asm/gart.h>
41 #include "pci.h"
42
43 #define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
44 #define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
45
46 #define IOAPIC_RANGE_START      (0xfee00000)
47 #define IOAPIC_RANGE_END        (0xfeefffff)
48 #define IOVA_START_ADDR         (0x1000)
49
50 #define DEFAULT_DOMAIN_ADDRESS_WIDTH 48
51
52 #define DMAR_OPERATION_TIMEOUT (HZ*60) /* 1m */
53
54 #define DOMAIN_MAX_ADDR(gaw) ((((u64)1) << gaw) - 1)
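/*
 * Example: with the default 48-bit width, DOMAIN_MAX_ADDR(48) is
 * ((u64)1 << 48) - 1 == 0xffffffffffff, the highest DMA address a
 * default-width domain can map.
 */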
55
56
57 static void flush_unmaps_timeout(unsigned long data);
58
59 DEFINE_TIMER(unmap_timer,  flush_unmaps_timeout, 0, 0);
60
61 static struct intel_iommu *g_iommus;
62 /* bitmap for indexing intel_iommus */
63 static unsigned long    *g_iommus_to_flush;
64 static int g_num_of_iommus;
65
66 static DEFINE_SPINLOCK(async_umap_flush_lock);
67 static LIST_HEAD(unmaps_to_do);
68
69 static int timer_on;
70 static long list_size;
71 static int high_watermark;
72
73 static struct dentry *intel_iommu_debug, *debug;
74
75
76 static void domain_remove_dev_info(struct dmar_domain *domain);
77
78 static int dmar_disabled;
79 static int __initdata dmar_map_gfx = 1;
80 static int dmar_forcedac;
81 static int intel_iommu_strict;
82
83 #define DUMMY_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-1))
84 static DEFINE_SPINLOCK(device_domain_lock);
85 static LIST_HEAD(device_domain_list);
86
87 static int __init intel_iommu_setup(char *str)
88 {
89         if (!str)
90                 return -EINVAL;
91         while (*str) {
92                 if (!strncmp(str, "off", 3)) {
93                         dmar_disabled = 1;
94                         printk(KERN_INFO"Intel-IOMMU: disabled\n");
95                 } else if (!strncmp(str, "igfx_off", 8)) {
96                         dmar_map_gfx = 0;
97                         printk(KERN_INFO
98                                 "Intel-IOMMU: disable GFX device mapping\n");
99                 } else if (!strncmp(str, "forcedac", 8)) {
100                         printk(KERN_INFO
101                                 "Intel-IOMMU: Forcing DAC for PCI devices\n");
102                         dmar_forcedac = 1;
103                 } else if (!strncmp(str, "strict", 6)) {
104                         printk(KERN_INFO
105                                 "Intel-IOMMU: disable batched IOTLB flush\n");
106                         intel_iommu_strict = 1;
107                 }
108
109                 str += strcspn(str, ",");
110                 while (*str == ',')
111                         str++;
112         }
113         return 0;
114 }
115 __setup("intel_iommu=", intel_iommu_setup);
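/*
 * The options above come from the kernel command line, separated by
 * commas, e.g.:
 *
 *     intel_iommu=off
 *     intel_iommu=igfx_off,strict
 *
 * Unrecognized tokens are silently skipped.
 */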
116
117 static struct kmem_cache *iommu_domain_cache;
118 static struct kmem_cache *iommu_devinfo_cache;
119 static struct kmem_cache *iommu_iova_cache;
120
121 static inline void *iommu_kmem_cache_alloc(struct kmem_cache *cachep)
122 {
123         unsigned int flags;
124         void *vaddr;
125
126         /* trying to avoid low memory issues */
127         flags = current->flags & PF_MEMALLOC;
128         current->flags |= PF_MEMALLOC;
129         vaddr = kmem_cache_alloc(cachep, GFP_ATOMIC);
130         current->flags &= (~PF_MEMALLOC | flags);
131         return vaddr;
132 }
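/*
 * Note on the PF_MEMALLOC handling above (and in alloc_pgtable_page below):
 * 'flags' saves the caller's original PF_MEMALLOC bit, and the restore
 * 'current->flags &= (~PF_MEMALLOC | flags)' clears PF_MEMALLOC only if it
 * was not already set on entry, so a caller that was itself running with
 * PF_MEMALLOC keeps it.
 */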
133
134
135 static inline void *alloc_pgtable_page(void)
136 {
137         unsigned int flags;
138         void *vaddr;
139
140         /* trying to avoid low memory issues */
141         flags = current->flags & PF_MEMALLOC;
142         current->flags |= PF_MEMALLOC;
143         vaddr = (void *)get_zeroed_page(GFP_ATOMIC);
144         current->flags &= (~PF_MEMALLOC | flags);
145         return vaddr;
146 }
147
148 static inline void free_pgtable_page(void *vaddr)
149 {
150         free_page((unsigned long)vaddr);
151 }
152
153 static inline void *alloc_domain_mem(void)
154 {
155         return iommu_kmem_cache_alloc(iommu_domain_cache);
156 }
157
158 static inline void free_domain_mem(void *vaddr)
159 {
160         kmem_cache_free(iommu_domain_cache, vaddr);
161 }
162
163 static inline void *alloc_devinfo_mem(void)
164 {
165         return iommu_kmem_cache_alloc(iommu_devinfo_cache);
166 }
167
168 static inline void free_devinfo_mem(void *vaddr)
169 {
170         kmem_cache_free(iommu_devinfo_cache, vaddr);
171 }
172
173 struct iova *alloc_iova_mem(void)
174 {
175         return iommu_kmem_cache_alloc(iommu_iova_cache);
176 }
177
178 void free_iova_mem(struct iova *iova)
179 {
180         kmem_cache_free(iommu_iova_cache, iova);
181 }
182
183 static inline void __iommu_flush_cache(
184         struct intel_iommu *iommu, void *addr, int size)
185 {
186         if (!ecap_coherent(iommu->ecap))
187                 clflush_cache_range(addr, size);
188 }
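/*
 * If the IOMMU does not report coherent page-table walks (ecap coherency
 * bit clear), descriptors written by the CPU must be flushed out of the
 * data cache before the hardware can see them; on coherent implementations
 * this is a no-op.
 */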
189
190 /* Gets context entry for a given bus and devfn */
191 static struct context_entry *device_to_context_entry(struct intel_iommu *iommu,
192                 u8 bus, u8 devfn)
193 {
194         struct root_entry *root;
195         struct context_entry *context;
196         unsigned long phy_addr;
197         unsigned long flags;
198
199         spin_lock_irqsave(&iommu->lock, flags);
200         root = &iommu->root_entry[bus];
201         context = get_context_addr_from_root(root);
202         if (!context) {
203                 context = (struct context_entry *)alloc_pgtable_page();
204                 if (!context) {
205                         spin_unlock_irqrestore(&iommu->lock, flags);
206                         return NULL;
207                 }
208                 __iommu_flush_cache(iommu, (void *)context, PAGE_SIZE_4K);
209                 phy_addr = virt_to_phys((void *)context);
210                 set_root_value(root, phy_addr);
211                 set_root_present(root);
212                 __iommu_flush_cache(iommu, root, sizeof(*root));
213         }
214         spin_unlock_irqrestore(&iommu->lock, flags);
215         return &context[devfn];
216 }
217
218 static int device_context_mapped(struct intel_iommu *iommu, u8 bus, u8 devfn)
219 {
220         struct root_entry *root;
221         struct context_entry *context;
222         int ret;
223         unsigned long flags;
224
225         spin_lock_irqsave(&iommu->lock, flags);
226         root = &iommu->root_entry[bus];
227         context = get_context_addr_from_root(root);
228         if (!context) {
229                 ret = 0;
230                 goto out;
231         }
232         ret = context_present(context[devfn]);
233 out:
234         spin_unlock_irqrestore(&iommu->lock, flags);
235         return ret;
236 }
237
238 static void clear_context_table(struct intel_iommu *iommu, u8 bus, u8 devfn)
239 {
240         struct root_entry *root;
241         struct context_entry *context;
242         unsigned long flags;
243
244         spin_lock_irqsave(&iommu->lock, flags);
245         root = &iommu->root_entry[bus];
246         context = get_context_addr_from_root(root);
247         if (context) {
248                 context_clear_entry(context[devfn]);
249                 __iommu_flush_cache(iommu, &context[devfn],
250                         sizeof(*context));
251         }
252         spin_unlock_irqrestore(&iommu->lock, flags);
253 }
254
255 static void free_context_table(struct intel_iommu *iommu)
256 {
257         struct root_entry *root;
258         int i;
259         unsigned long flags;
260         struct context_entry *context;
261
262         spin_lock_irqsave(&iommu->lock, flags);
263         if (!iommu->root_entry) {
264                 goto out;
265         }
266         for (i = 0; i < ROOT_ENTRY_NR; i++) {
267                 root = &iommu->root_entry[i];
268                 context = get_context_addr_from_root(root);
269                 if (context)
270                         free_pgtable_page(context);
271         }
272         free_pgtable_page(iommu->root_entry);
273         iommu->root_entry = NULL;
274 out:
275         spin_unlock_irqrestore(&iommu->lock, flags);
276 }
277
278 /* page table handling */
279 #define LEVEL_STRIDE            (9)
280 #define LEVEL_MASK              (((u64)1 << LEVEL_STRIDE) - 1)
281
282 static inline int agaw_to_level(int agaw)
283 {
284         return agaw + 2;
285 }
286
287 static inline int agaw_to_width(int agaw)
288 {
289         return 30 + agaw * LEVEL_STRIDE;
290
291 }
292
293 static inline int width_to_agaw(int width)
294 {
295         return (width - 30) / LEVEL_STRIDE;
296 }
297
298 static inline unsigned int level_to_offset_bits(int level)
299 {
300         return (12 + (level - 1) * LEVEL_STRIDE);
301 }
302
303 static inline int address_level_offset(u64 addr, int level)
304 {
305         return ((addr >> level_to_offset_bits(level)) & LEVEL_MASK);
306 }
307
308 static inline u64 level_mask(int level)
309 {
310         return ((u64)-1 << level_to_offset_bits(level));
311 }
312
313 static inline u64 level_size(int level)
314 {
315         return ((u64)1 << level_to_offset_bits(level));
316 }
317
318 static inline u64 align_to_level(u64 addr, int level)
319 {
320         return ((addr + level_size(level) - 1) & level_mask(level));
321 }
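/*
 * Page-table geometry: each level indexes 9 bits (LEVEL_STRIDE) of the
 * address on top of the 4K page offset.  For example, an AGAW of 2 gives
 * agaw_to_level() == 4 and agaw_to_width() == 48, with the level offsets
 * starting at bits 12, 21, 30 and 39 respectively.
 */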
322
323 static struct dma_pte *addr_to_dma_pte(struct dmar_domain *domain, u64 addr)
324 {
325         int addr_width = agaw_to_width(domain->agaw);
326         struct dma_pte *parent, *pte = NULL;
327         int level = agaw_to_level(domain->agaw);
328         int offset;
329         unsigned long flags;
330
331         BUG_ON(!domain->pgd);
332
333         addr &= (((u64)1) << addr_width) - 1;
334         parent = domain->pgd;
335
336         spin_lock_irqsave(&domain->mapping_lock, flags);
337         while (level > 0) {
338                 void *tmp_page;
339
340                 offset = address_level_offset(addr, level);
341                 pte = &parent[offset];
342                 if (level == 1)
343                         break;
344
345                 if (!dma_pte_present(*pte)) {
346                         tmp_page = alloc_pgtable_page();
347
348                         if (!tmp_page) {
349                                 spin_unlock_irqrestore(&domain->mapping_lock,
350                                         flags);
351                                 return NULL;
352                         }
353                         __iommu_flush_cache(domain->iommu, tmp_page,
354                                         PAGE_SIZE_4K);
355                         dma_set_pte_addr(*pte, virt_to_phys(tmp_page));
356                         /*
357                          * high level table always sets r/w, last level page
358                          * table control read/write
359                          */
360                         dma_set_pte_readable(*pte);
361                         dma_set_pte_writable(*pte);
362                         __iommu_flush_cache(domain->iommu, pte, sizeof(*pte));
363                 }
364                 parent = phys_to_virt(dma_pte_addr(*pte));
365                 level--;
366         }
367
368         spin_unlock_irqrestore(&domain->mapping_lock, flags);
369         return pte;
370 }
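/*
 * addr_to_dma_pte() above walks from the top-level page directory down to
 * the level 1 (4K) entry for 'addr', allocating any missing intermediate
 * page-table pages on the way.  Intermediate entries are made read/write;
 * the caller sets the final permissions on the returned leaf pte.
 */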
371
372 /* return the pte of an address at a specific level */
373 static struct dma_pte *dma_addr_level_pte(struct dmar_domain *domain, u64 addr,
374                 int level)
375 {
376         struct dma_pte *parent, *pte = NULL;
377         int total = agaw_to_level(domain->agaw);
378         int offset;
379
380         parent = domain->pgd;
381         while (level <= total) {
382                 offset = address_level_offset(addr, total);
383                 pte = &parent[offset];
384                 if (level == total)
385                         return pte;
386
387                 if (!dma_pte_present(*pte))
388                         break;
389                 parent = phys_to_virt(dma_pte_addr(*pte));
390                 total--;
391         }
392         return NULL;
393 }
394
395 /* clear one page's last level pte */
396 static void dma_pte_clear_one(struct dmar_domain *domain, u64 addr)
397 {
398         struct dma_pte *pte = NULL;
399
400         /* get last level pte */
401         pte = dma_addr_level_pte(domain, addr, 1);
402
403         if (pte) {
404                 dma_clear_pte(*pte);
405                 __iommu_flush_cache(domain->iommu, pte, sizeof(*pte));
406         }
407 }
408
409 /* clear last level pte; a tlb flush should follow */
410 static void dma_pte_clear_range(struct dmar_domain *domain, u64 start, u64 end)
411 {
412         int addr_width = agaw_to_width(domain->agaw);
413
414         start &= (((u64)1) << addr_width) - 1;
415         end &= (((u64)1) << addr_width) - 1;
416         /* in case it's a partial page */
417         start = PAGE_ALIGN_4K(start);
418         end &= PAGE_MASK_4K;
419
420         /* we don't need lock here, nobody else touches the iova range */
421         while (start < end) {
422                 dma_pte_clear_one(domain, start);
423                 start += PAGE_SIZE_4K;
424         }
425 }
426
427 /* free page table pages. last level pte should already be cleared */
428 static void dma_pte_free_pagetable(struct dmar_domain *domain,
429         u64 start, u64 end)
430 {
431         int addr_width = agaw_to_width(domain->agaw);
432         struct dma_pte *pte;
433         int total = agaw_to_level(domain->agaw);
434         int level;
435         u64 tmp;
436
437         start &= (((u64)1) << addr_width) - 1;
438         end &= (((u64)1) << addr_width) - 1;
439
440         /* we don't need lock here, nobody else touches the iova range */
441         level = 2;
442         while (level <= total) {
443                 tmp = align_to_level(start, level);
444                 if (tmp >= end || (tmp + level_size(level) > end))
445                         return;
446
447                 while (tmp < end) {
448                         pte = dma_addr_level_pte(domain, tmp, level);
449                         if (pte) {
450                                 free_pgtable_page(
451                                         phys_to_virt(dma_pte_addr(*pte)));
452                                 dma_clear_pte(*pte);
453                                 __iommu_flush_cache(domain->iommu,
454                                                 pte, sizeof(*pte));
455                         }
456                         tmp += level_size(level);
457                 }
458                 level++;
459         }
460         /* free pgd */
461         if (start == 0 && end >= ((((u64)1) << addr_width) - 1)) {
462                 free_pgtable_page(domain->pgd);
463                 domain->pgd = NULL;
464         }
465 }
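/*
 * The loop above works bottom-up, from level 2 towards the top: at each
 * level it only frees page-table pages whose entire range falls inside
 * [start, end), and the top-level pgd itself is freed only when the whole
 * domain address space is being torn down.
 */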
466
467 /* iommu handling */
468 static int iommu_alloc_root_entry(struct intel_iommu *iommu)
469 {
470         struct root_entry *root;
471         unsigned long flags;
472
473         root = (struct root_entry *)alloc_pgtable_page();
474         if (!root)
475                 return -ENOMEM;
476
477         __iommu_flush_cache(iommu, root, PAGE_SIZE_4K);
478
479         spin_lock_irqsave(&iommu->lock, flags);
480         iommu->root_entry = root;
481         spin_unlock_irqrestore(&iommu->lock, flags);
482
483         return 0;
484 }
485
486 #define IOMMU_WAIT_OP(iommu, offset, op, cond, sts) \
487 {\
488         unsigned long start_time = jiffies;\
489         while (1) {\
490                 sts = op (iommu->reg + offset);\
491                 if (cond)\
492                         break;\
493                 if (time_after(jiffies, start_time + DMAR_OPERATION_TIMEOUT))\
494                         panic("DMAR hardware is malfunctioning\n");\
495                 cpu_relax();\
496         }\
497 }
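/*
 * IOMMU_WAIT_OP() spins, re-reading the register at 'offset' with 'op'
 * until 'cond' becomes true, and panics if the hardware has not responded
 * within DMAR_OPERATION_TIMEOUT.  It is used below to wait for the status
 * bits that acknowledge global command and invalidation writes.
 */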
498
499 static void iommu_set_root_entry(struct intel_iommu *iommu)
500 {
501         void *addr;
502         u32 cmd, sts;
503         unsigned long flag;
504
505         addr = iommu->root_entry;
506
507         spin_lock_irqsave(&iommu->register_lock, flag);
508         dmar_writeq(iommu->reg + DMAR_RTADDR_REG, virt_to_phys(addr));
509
510         cmd = iommu->gcmd | DMA_GCMD_SRTP;
511         writel(cmd, iommu->reg + DMAR_GCMD_REG);
512
513         /* Make sure hardware completes it */
514         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
515                 readl, (sts & DMA_GSTS_RTPS), sts);
516
517         spin_unlock_irqrestore(&iommu->register_lock, flag);
518 }
519
520 static void iommu_flush_write_buffer(struct intel_iommu *iommu)
521 {
522         u32 val;
523         unsigned long flag;
524
525         if (!cap_rwbf(iommu->cap))
526                 return;
527         val = iommu->gcmd | DMA_GCMD_WBF;
528
529         spin_lock_irqsave(&iommu->register_lock, flag);
530         writel(val, iommu->reg + DMAR_GCMD_REG);
531
532         /* Make sure hardware completes it */
533         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
534                         readl, (!(val & DMA_GSTS_WBFS)), val);
535
536         spin_unlock_irqrestore(&iommu->register_lock, flag);
537 }
538
539 /* return value determines whether we need a write buffer flush */
540 static int __iommu_flush_context(struct intel_iommu *iommu,
541         u16 did, u16 source_id, u8 function_mask, u64 type,
542         int non_present_entry_flush)
543 {
544         u64 val = 0;
545         unsigned long flag;
546
547         /*
548          * In the non-present entry flush case, if hardware doesn't cache
549          * non-present entries there is nothing to do; if it does cache
550          * them, flush the entries of domain 0 (the domain id used to tag
551          * any cached non-present entries).
552          */
553         if (non_present_entry_flush) {
554                 if (!cap_caching_mode(iommu->cap))
555                         return 1;
556                 else
557                         did = 0;
558         }
559
560         switch (type) {
561         case DMA_CCMD_GLOBAL_INVL:
562                 val = DMA_CCMD_GLOBAL_INVL;
563                 break;
564         case DMA_CCMD_DOMAIN_INVL:
565                 val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
566                 break;
567         case DMA_CCMD_DEVICE_INVL:
568                 val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
569                         | DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
570                 break;
571         default:
572                 BUG();
573         }
574         val |= DMA_CCMD_ICC;
575
576         spin_lock_irqsave(&iommu->register_lock, flag);
577         dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);
578
579         /* Make sure hardware completes it */
580         IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
581                 dmar_readq, (!(val & DMA_CCMD_ICC)), val);
582
583         spin_unlock_irqrestore(&iommu->register_lock, flag);
584
585         /* flush context entry will implicitly flush write buffer */
586         return 0;
587 }
588
589 static inline int iommu_flush_context_global(struct intel_iommu *iommu,
590         int non_present_entry_flush)
591 {
592         return __iommu_flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL,
593                 non_present_entry_flush);
594 }
595
596 static inline int iommu_flush_context_domain(struct intel_iommu *iommu, u16 did,
597         int non_present_entry_flush)
598 {
599         return __iommu_flush_context(iommu, did, 0, 0, DMA_CCMD_DOMAIN_INVL,
600                 non_present_entry_flush);
601 }
602
603 static inline int iommu_flush_context_device(struct intel_iommu *iommu,
604         u16 did, u16 source_id, u8 function_mask, int non_present_entry_flush)
605 {
606         return __iommu_flush_context(iommu, did, source_id, function_mask,
607                 DMA_CCMD_DEVICE_INVL, non_present_entry_flush);
608 }
609
610 /* return value determines whether we need a write buffer flush */
611 static int __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
612         u64 addr, unsigned int size_order, u64 type,
613         int non_present_entry_flush)
614 {
615         int tlb_offset = ecap_iotlb_offset(iommu->ecap);
616         u64 val = 0, val_iva = 0;
617         unsigned long flag;
618
619         /*
620          * In the non-present entry flush case, if hardware doesn't cache
621          * non-present entries there is nothing to do; if it does cache
622          * them, flush the entries of domain 0 (the domain id used to tag
623          * any cached non-present entries).
624          */
625         if (non_present_entry_flush) {
626                 if (!cap_caching_mode(iommu->cap))
627                         return 1;
628                 else
629                         did = 0;
630         }
631
632         switch (type) {
633         case DMA_TLB_GLOBAL_FLUSH:
634                 /* global flush doesn't need to set IVA_REG */
635                 val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
636                 break;
637         case DMA_TLB_DSI_FLUSH:
638                 val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
639                 break;
640         case DMA_TLB_PSI_FLUSH:
641                 val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
642                 /* Note: always flush non-leaf currently */
643                 val_iva = size_order | addr;
644                 break;
645         default:
646                 BUG();
647         }
648         /* Note: set drain read/write */
649 #if 0
650         /*
651          * This is probably meant to be extra safe.  Looks like we can
652          * ignore it without any impact.
653          */
654         if (cap_read_drain(iommu->cap))
655                 val |= DMA_TLB_READ_DRAIN;
656 #endif
657         if (cap_write_drain(iommu->cap))
658                 val |= DMA_TLB_WRITE_DRAIN;
659
660         spin_lock_irqsave(&iommu->register_lock, flag);
661         /* Note: Only uses first TLB reg currently */
662         if (val_iva)
663                 dmar_writeq(iommu->reg + tlb_offset, val_iva);
664         dmar_writeq(iommu->reg + tlb_offset + 8, val);
665
666         /* Make sure hardware completes it */
667         IOMMU_WAIT_OP(iommu, tlb_offset + 8,
668                 dmar_readq, (!(val & DMA_TLB_IVT)), val);
669
670         spin_unlock_irqrestore(&iommu->register_lock, flag);
671
672         /* check IOTLB invalidation granularity */
673         if (DMA_TLB_IAIG(val) == 0)
674                 printk(KERN_ERR"IOMMU: flush IOTLB failed\n");
675         if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
676                 pr_debug("IOMMU: tlb flush request %Lx, actual %Lx\n",
677                         DMA_TLB_IIRG(type), DMA_TLB_IAIG(val));
678         /* flush iotlb entry will implicitly flush write buffer */
679         return 0;
680 }
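/*
 * The IOTLB invalidation interface is a pair of 64-bit registers located
 * at ecap_iotlb_offset(): the IVA register (page address plus mask, used
 * for page-selective flushes) and, 8 bytes above it, the IOTLB command
 * register that is polled above until the IVT bit clears.
 */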
681
682 static inline int iommu_flush_iotlb_global(struct intel_iommu *iommu,
683         int non_present_entry_flush)
684 {
685         return __iommu_flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH,
686                 non_present_entry_flush);
687 }
688
689 static inline int iommu_flush_iotlb_dsi(struct intel_iommu *iommu, u16 did,
690         int non_present_entry_flush)
691 {
692         return __iommu_flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH,
693                 non_present_entry_flush);
694 }
695
696 static int iommu_flush_iotlb_psi(struct intel_iommu *iommu, u16 did,
697         u64 addr, unsigned int pages, int non_present_entry_flush)
698 {
699         unsigned int mask;
700
701         BUG_ON(addr & (~PAGE_MASK_4K));
702         BUG_ON(pages == 0);
703
704         /* Fall back to domain-selective flush if there is no PSI support */
705         if (!cap_pgsel_inv(iommu->cap))
706                 return iommu_flush_iotlb_dsi(iommu, did,
707                         non_present_entry_flush);
708
709         /*
710          * PSI requires the number of pages to be a power of two, with the
711          * base address naturally aligned to that size
712          */
713         mask = ilog2(__roundup_pow_of_two(pages));
714         /* Fall back to domain-selective flush if the size is too big */
715         if (mask > cap_max_amask_val(iommu->cap))
716                 return iommu_flush_iotlb_dsi(iommu, did,
717                         non_present_entry_flush);
718
719         return __iommu_flush_iotlb(iommu, did, addr, mask,
720                 DMA_TLB_PSI_FLUSH, non_present_entry_flush);
721 }
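/*
 * Example: a request for 9 pages is rounded up to 16, giving mask == 4,
 * so the 16-page naturally aligned region containing 'addr' is
 * invalidated.  If the resulting mask exceeds cap_max_amask_val(), the
 * code falls back to a domain-selective flush instead.
 */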
722
723 static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
724 {
725         u32 pmen;
726         unsigned long flags;
727
728         spin_lock_irqsave(&iommu->register_lock, flags);
729         pmen = readl(iommu->reg + DMAR_PMEN_REG);
730         pmen &= ~DMA_PMEN_EPM;
731         writel(pmen, iommu->reg + DMAR_PMEN_REG);
732
733         /* wait for the protected region status bit to clear */
734         IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
735                 readl, !(pmen & DMA_PMEN_PRS), pmen);
736
737         spin_unlock_irqrestore(&iommu->register_lock, flags);
738 }
739
740 static int iommu_enable_translation(struct intel_iommu *iommu)
741 {
742         u32 sts;
743         unsigned long flags;
744
745         spin_lock_irqsave(&iommu->register_lock, flags);
746         writel(iommu->gcmd|DMA_GCMD_TE, iommu->reg + DMAR_GCMD_REG);
747
748         /* Make sure hardware completes it */
749         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
750                 readl, (sts & DMA_GSTS_TES), sts);
751
752         iommu->gcmd |= DMA_GCMD_TE;
753         spin_unlock_irqrestore(&iommu->register_lock, flags);
754         return 0;
755 }
756
757 static int iommu_disable_translation(struct intel_iommu *iommu)
758 {
759         u32 sts;
760         unsigned long flag;
761
762         spin_lock_irqsave(&iommu->register_lock, flag);
763         iommu->gcmd &= ~DMA_GCMD_TE;
764         writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
765
766         /* Make sure hardware completes it */
767         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
768                 readl, (!(sts & DMA_GSTS_TES)), sts);
769
770         spin_unlock_irqrestore(&iommu->register_lock, flag);
771         return 0;
772 }
773
774 /* iommu interrupt handling. Most of it is MSI-like. */
775
776 static const char *fault_reason_strings[] =
777 {
778         "Software",
779         "Present bit in root entry is clear",
780         "Present bit in context entry is clear",
781         "Invalid context entry",
782         "Access beyond MGAW",
783         "PTE Write access is not set",
784         "PTE Read access is not set",
785         "Next page table ptr is invalid",
786         "Root table address invalid",
787         "Context table ptr is invalid",
788         "non-zero reserved fields in RTP",
789         "non-zero reserved fields in CTP",
790         "non-zero reserved fields in PTE",
791 };
792 #define MAX_FAULT_REASON_IDX    (ARRAY_SIZE(fault_reason_strings) - 1)
793
794 const char *dmar_get_fault_reason(u8 fault_reason)
795 {
796         if (fault_reason > MAX_FAULT_REASON_IDX)
797                 return "Unknown";
798         else
799                 return fault_reason_strings[fault_reason];
800 }
801
802 void dmar_msi_unmask(unsigned int irq)
803 {
804         struct intel_iommu *iommu = get_irq_data(irq);
805         unsigned long flag;
806
807         /* unmask it */
808         spin_lock_irqsave(&iommu->register_lock, flag);
809         writel(0, iommu->reg + DMAR_FECTL_REG);
810         /* Read a reg to force flush the post write */
811         readl(iommu->reg + DMAR_FECTL_REG);
812         spin_unlock_irqrestore(&iommu->register_lock, flag);
813 }
814
815 void dmar_msi_mask(unsigned int irq)
816 {
817         unsigned long flag;
818         struct intel_iommu *iommu = get_irq_data(irq);
819
820         /* mask it */
821         spin_lock_irqsave(&iommu->register_lock, flag);
822         writel(DMA_FECTL_IM, iommu->reg + DMAR_FECTL_REG);
823         /* Read a reg to force flush the post write */
824         readl(iommu->reg + DMAR_FECTL_REG);
825         spin_unlock_irqrestore(&iommu->register_lock, flag);
826 }
827
828 void dmar_msi_write(int irq, struct msi_msg *msg)
829 {
830         struct intel_iommu *iommu = get_irq_data(irq);
831         unsigned long flag;
832
833         spin_lock_irqsave(&iommu->register_lock, flag);
834         writel(msg->data, iommu->reg + DMAR_FEDATA_REG);
835         writel(msg->address_lo, iommu->reg + DMAR_FEADDR_REG);
836         writel(msg->address_hi, iommu->reg + DMAR_FEUADDR_REG);
837         spin_unlock_irqrestore(&iommu->register_lock, flag);
838 }
839
840 void dmar_msi_read(int irq, struct msi_msg *msg)
841 {
842         struct intel_iommu *iommu = get_irq_data(irq);
843         unsigned long flag;
844
845         spin_lock_irqsave(&iommu->register_lock, flag);
846         msg->data = readl(iommu->reg + DMAR_FEDATA_REG);
847         msg->address_lo = readl(iommu->reg + DMAR_FEADDR_REG);
848         msg->address_hi = readl(iommu->reg + DMAR_FEUADDR_REG);
849         spin_unlock_irqrestore(&iommu->register_lock, flag);
850 }
851
852 static int iommu_page_fault_do_one(struct intel_iommu *iommu, int type,
853                 u8 fault_reason, u16 source_id, u64 addr)
854 {
855         const char *reason;
856
857         reason = dmar_get_fault_reason(fault_reason);
858
859         printk(KERN_ERR
860                 "DMAR:[%s] Request device [%02x:%02x.%d] "
861                 "fault addr %llx\n"
862                 "DMAR:[fault reason %02d] %s\n",
863                 (type ? "DMA Read" : "DMA Write"),
864                 (source_id >> 8), PCI_SLOT(source_id & 0xFF),
865                 PCI_FUNC(source_id & 0xFF), addr, fault_reason, reason);
866         return 0;
867 }
868
869 #define PRIMARY_FAULT_REG_LEN (16)
870 static irqreturn_t iommu_page_fault(int irq, void *dev_id)
871 {
872         struct intel_iommu *iommu = dev_id;
873         int reg, fault_index;
874         u32 fault_status;
875         unsigned long flag;
876
877         spin_lock_irqsave(&iommu->register_lock, flag);
878         fault_status = readl(iommu->reg + DMAR_FSTS_REG);
879
880         /* TBD: ignore advanced fault log currently */
881         if (!(fault_status & DMA_FSTS_PPF))
882                 goto clear_overflow;
883
884         fault_index = dma_fsts_fault_record_index(fault_status);
885         reg = cap_fault_reg_offset(iommu->cap);
886         while (1) {
887                 u8 fault_reason;
888                 u16 source_id;
889                 u64 guest_addr;
890                 int type;
891                 u32 data;
892
893                 /* highest 32 bits */
894                 data = readl(iommu->reg + reg +
895                                 fault_index * PRIMARY_FAULT_REG_LEN + 12);
896                 if (!(data & DMA_FRCD_F))
897                         break;
898
899                 fault_reason = dma_frcd_fault_reason(data);
900                 type = dma_frcd_type(data);
901
902                 data = readl(iommu->reg + reg +
903                                 fault_index * PRIMARY_FAULT_REG_LEN + 8);
904                 source_id = dma_frcd_source_id(data);
905
906                 guest_addr = dmar_readq(iommu->reg + reg +
907                                 fault_index * PRIMARY_FAULT_REG_LEN);
908                 guest_addr = dma_frcd_page_addr(guest_addr);
909                 /* clear the fault */
910                 writel(DMA_FRCD_F, iommu->reg + reg +
911                         fault_index * PRIMARY_FAULT_REG_LEN + 12);
912
913                 spin_unlock_irqrestore(&iommu->register_lock, flag);
914
915                 iommu_page_fault_do_one(iommu, type, fault_reason,
916                                 source_id, guest_addr);
917
918                 fault_index++;
919                 if (fault_index >= cap_num_fault_regs(iommu->cap))
920                         fault_index = 0;
921                 spin_lock_irqsave(&iommu->register_lock, flag);
922         }
923 clear_overflow:
924         /* clear primary fault overflow */
925         fault_status = readl(iommu->reg + DMAR_FSTS_REG);
926         if (fault_status & DMA_FSTS_PFO)
927                 writel(DMA_FSTS_PFO, iommu->reg + DMAR_FSTS_REG);
928
929         spin_unlock_irqrestore(&iommu->register_lock, flag);
930         return IRQ_HANDLED;
931 }
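/*
 * Each primary fault record is 16 bytes (PRIMARY_FAULT_REG_LEN): the
 * handler above reads the faulting page address at offset 0, the source
 * id at offset 8 and the reason/type/fault bits in the top dword at
 * offset 12, then writes DMA_FRCD_F back to that dword to clear the
 * record before reporting it.
 */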
932
933 int dmar_set_interrupt(struct intel_iommu *iommu)
934 {
935         int irq, ret;
936
937         irq = create_irq();
938         if (!irq) {
939                 printk(KERN_ERR "IOMMU: no free vectors\n");
940                 return -EINVAL;
941         }
942
943         set_irq_data(irq, iommu);
944         iommu->irq = irq;
945
946         ret = arch_setup_dmar_msi(irq);
947         if (ret) {
948                 set_irq_data(irq, NULL);
949                 iommu->irq = 0;
950                 destroy_irq(irq);
951                 return ret;
952         }
953
954         /* Make sure any pending faults are cleared */
955         iommu_page_fault(irq, iommu);
956
957         ret = request_irq(irq, iommu_page_fault, 0, iommu->name, iommu);
958         if (ret)
959                 printk(KERN_ERR "IOMMU: can't request irq\n");
960         return ret;
961 }
962
963 static int iommu_init_domains(struct intel_iommu *iommu)
964 {
965         unsigned long ndomains;
966         unsigned long nlongs;
967
968         ndomains = cap_ndoms(iommu->cap);
969         pr_debug("Number of Domains supported <%ld>\n", ndomains);
970         nlongs = BITS_TO_LONGS(ndomains);
971
972         /* TBD: there might be 64K domains,
973          * consider other allocation for future chip
974          */
975         iommu->domain_ids = kcalloc(nlongs, sizeof(unsigned long), GFP_KERNEL);
976         if (!iommu->domain_ids) {
977                 printk(KERN_ERR "Allocating domain id array failed\n");
978                 return -ENOMEM;
979         }
980         iommu->domains = kcalloc(ndomains, sizeof(struct dmar_domain *),
981                         GFP_KERNEL);
982         if (!iommu->domains) {
983                 printk(KERN_ERR "Allocating domain array failed\n");
984                 kfree(iommu->domain_ids);
985                 return -ENOMEM;
986         }
987
988         /*
989          * if Caching mode is set, then invalid translations are tagged
990          * with domainid 0. Hence we need to pre-allocate it.
991          */
992         if (cap_caching_mode(iommu->cap))
993                 set_bit(0, iommu->domain_ids);
994         return 0;
995 }
996 static struct intel_iommu *alloc_iommu(struct intel_iommu *iommu,
997                                         struct dmar_drhd_unit *drhd)
998 {
999         int ret;
1000         int map_size;
1001         u32 ver;
1002
1003         iommu->reg = ioremap(drhd->reg_base_addr, PAGE_SIZE_4K);
1004         if (!iommu->reg) {
1005                 printk(KERN_ERR "IOMMU: can't map the region\n");
1006                 goto error;
1007         }
1008         iommu->cap = dmar_readq(iommu->reg + DMAR_CAP_REG);
1009         iommu->ecap = dmar_readq(iommu->reg + DMAR_ECAP_REG);
1010
1011         /* the registers might be more than one page */
1012         map_size = max_t(int, ecap_max_iotlb_offset(iommu->ecap),
1013                 cap_max_fault_reg_offset(iommu->cap));
1014         map_size = PAGE_ALIGN_4K(map_size);
1015         if (map_size > PAGE_SIZE_4K) {
1016                 iounmap(iommu->reg);
1017                 iommu->reg = ioremap(drhd->reg_base_addr, map_size);
1018                 if (!iommu->reg) {
1019                         printk(KERN_ERR "IOMMU: can't map the region\n");
1020                         goto error;
1021                 }
1022         }
1023
1024         ver = readl(iommu->reg + DMAR_VER_REG);
1025         pr_debug("IOMMU %llx: ver %d:%d cap %llx ecap %llx\n",
1026                 drhd->reg_base_addr, DMAR_VER_MAJOR(ver), DMAR_VER_MINOR(ver),
1027                 iommu->cap, iommu->ecap);
1028         ret = iommu_init_domains(iommu);
1029         if (ret)
1030                 goto error_unmap;
1031         spin_lock_init(&iommu->lock);
1032         spin_lock_init(&iommu->register_lock);
1033
1034         drhd->iommu = iommu;
1035         return iommu;
1036 error_unmap:
1037         iounmap(iommu->reg);
1038 error:
1039         kfree(iommu);
1040         return NULL;
1041 }
1042
1043 static void domain_exit(struct dmar_domain *domain);
1044 static void free_iommu(struct intel_iommu *iommu)
1045 {
1046         struct dmar_domain *domain;
1047         int i;
1048
1049         if (!iommu)
1050                 return;
1051
1052         i = find_first_bit(iommu->domain_ids, cap_ndoms(iommu->cap));
1053         for (; i < cap_ndoms(iommu->cap); ) {
1054                 domain = iommu->domains[i];
1055                 clear_bit(i, iommu->domain_ids);
1056                 domain_exit(domain);
1057                 i = find_next_bit(iommu->domain_ids,
1058                         cap_ndoms(iommu->cap), i+1);
1059         }
1060
1061         if (iommu->gcmd & DMA_GCMD_TE)
1062                 iommu_disable_translation(iommu);
1063
1064         if (iommu->irq) {
1065                 set_irq_data(iommu->irq, NULL);
1066                 /* This will mask the irq */
1067                 free_irq(iommu->irq, iommu);
1068                 destroy_irq(iommu->irq);
1069         }
1070
1071         kfree(iommu->domains);
1072         kfree(iommu->domain_ids);
1073
1074         /* free context mapping */
1075         free_context_table(iommu);
1076
1077         if (iommu->reg)
1078                 iounmap(iommu->reg);
1079         kfree(iommu);
1080 }
1081
1082 static struct dmar_domain *iommu_alloc_domain(struct intel_iommu *iommu)
1083 {
1084         unsigned long num;
1085         unsigned long ndomains;
1086         struct dmar_domain *domain;
1087         unsigned long flags;
1088
1089         domain = alloc_domain_mem();
1090         if (!domain)
1091                 return NULL;
1092
1093         ndomains = cap_ndoms(iommu->cap);
1094
1095         spin_lock_irqsave(&iommu->lock, flags);
1096         num = find_first_zero_bit(iommu->domain_ids, ndomains);
1097         if (num >= ndomains) {
1098                 spin_unlock_irqrestore(&iommu->lock, flags);
1099                 free_domain_mem(domain);
1100                 printk(KERN_ERR "IOMMU: no free domain ids\n");
1101                 return NULL;
1102         }
1103
1104         set_bit(num, iommu->domain_ids);
1105         domain->id = num;
1106         domain->iommu = iommu;
1107         iommu->domains[num] = domain;
1108         spin_unlock_irqrestore(&iommu->lock, flags);
1109
1110         return domain;
1111 }
1112
1113 static void iommu_free_domain(struct dmar_domain *domain)
1114 {
1115         unsigned long flags;
1116
1117         spin_lock_irqsave(&domain->iommu->lock, flags);
1118         clear_bit(domain->id, domain->iommu->domain_ids);
1119         spin_unlock_irqrestore(&domain->iommu->lock, flags);
1120 }
1121
1122 static struct iova_domain reserved_iova_list;
1123 static struct lock_class_key reserved_alloc_key;
1124 static struct lock_class_key reserved_rbtree_key;
1125
1126 static void dmar_init_reserved_ranges(void)
1127 {
1128         struct pci_dev *pdev = NULL;
1129         struct iova *iova;
1130         int i;
1131         u64 addr, size;
1132
1133         init_iova_domain(&reserved_iova_list, DMA_32BIT_PFN);
1134
1135         lockdep_set_class(&reserved_iova_list.iova_alloc_lock,
1136                 &reserved_alloc_key);
1137         lockdep_set_class(&reserved_iova_list.iova_rbtree_lock,
1138                 &reserved_rbtree_key);
1139
1140         /* IOAPIC ranges shouldn't be accessed by DMA */
1141         iova = reserve_iova(&reserved_iova_list, IOVA_PFN(IOAPIC_RANGE_START),
1142                 IOVA_PFN(IOAPIC_RANGE_END));
1143         if (!iova)
1144                 printk(KERN_ERR "Reserve IOAPIC range failed\n");
1145
1146         /* Reserve all PCI MMIO to avoid peer-to-peer access */
1147         for_each_pci_dev(pdev) {
1148                 struct resource *r;
1149
1150                 for (i = 0; i < PCI_NUM_RESOURCES; i++) {
1151                         r = &pdev->resource[i];
1152                         if (!r->flags || !(r->flags & IORESOURCE_MEM))
1153                                 continue;
1154                         addr = r->start;
1155                         addr &= PAGE_MASK_4K;
1156                         size = r->end - addr;
1157                         size = PAGE_ALIGN_4K(size);
1158                         iova = reserve_iova(&reserved_iova_list, IOVA_PFN(addr),
1159                                 IOVA_PFN(size + addr) - 1);
1160                         if (!iova)
1161                                 printk(KERN_ERR "Reserve iova failed\n");
1162                 }
1163         }
1164
1165 }
1166
1167 static void domain_reserve_special_ranges(struct dmar_domain *domain)
1168 {
1169         copy_reserved_iova(&reserved_iova_list, &domain->iovad);
1170 }
1171
1172 static inline int guestwidth_to_adjustwidth(int gaw)
1173 {
1174         int agaw;
1175         int r = (gaw - 12) % 9;
1176
1177         if (r == 0)
1178                 agaw = gaw;
1179         else
1180                 agaw = gaw + 9 - r;
1181         if (agaw > 64)
1182                 agaw = 64;
1183         return agaw;
1184 }
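/*
 * guestwidth_to_adjustwidth() rounds the guest address width up so that
 * (width - 12) is a multiple of 9, i.e. a whole number of page-table
 * levels: a gaw of 48 stays 48, while a gaw of 32 becomes 39.
 */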
1185
1186 static int domain_init(struct dmar_domain *domain, int guest_width)
1187 {
1188         struct intel_iommu *iommu;
1189         int adjust_width, agaw;
1190         unsigned long sagaw;
1191
1192         init_iova_domain(&domain->iovad, DMA_32BIT_PFN);
1193         spin_lock_init(&domain->mapping_lock);
1194
1195         domain_reserve_special_ranges(domain);
1196
1197         /* calculate AGAW */
1198         iommu = domain->iommu;
1199         if (guest_width > cap_mgaw(iommu->cap))
1200                 guest_width = cap_mgaw(iommu->cap);
1201         domain->gaw = guest_width;
1202         adjust_width = guestwidth_to_adjustwidth(guest_width);
1203         agaw = width_to_agaw(adjust_width);
1204         sagaw = cap_sagaw(iommu->cap);
1205         if (!test_bit(agaw, &sagaw)) {
1206                 /* hardware doesn't support it, choose a bigger one */
1207                 pr_debug("IOMMU: hardware doesn't support agaw %d\n", agaw);
1208                 agaw = find_next_bit(&sagaw, 5, agaw);
1209                 if (agaw >= 5)
1210                         return -ENODEV;
1211         }
1212         domain->agaw = agaw;
1213         INIT_LIST_HEAD(&domain->devices);
1214
1215         /* always allocate the top pgd */
1216         domain->pgd = (struct dma_pte *)alloc_pgtable_page();
1217         if (!domain->pgd)
1218                 return -ENOMEM;
1219         __iommu_flush_cache(iommu, domain->pgd, PAGE_SIZE_4K);
1220         return 0;
1221 }
1222
1223 static void domain_exit(struct dmar_domain *domain)
1224 {
1225         u64 end;
1226
1227         /* Domain 0 is reserved, so don't process it */
1228         if (!domain)
1229                 return;
1230
1231         domain_remove_dev_info(domain);
1232         /* destroy iovas */
1233         put_iova_domain(&domain->iovad);
1234         end = DOMAIN_MAX_ADDR(domain->gaw);
1235         end = end & (~PAGE_MASK_4K);
1236
1237         /* clear ptes */
1238         dma_pte_clear_range(domain, 0, end);
1239
1240         /* free page tables */
1241         dma_pte_free_pagetable(domain, 0, end);
1242
1243         iommu_free_domain(domain);
1244         free_domain_mem(domain);
1245 }
1246
1247 static int domain_context_mapping_one(struct dmar_domain *domain,
1248                 u8 bus, u8 devfn)
1249 {
1250         struct context_entry *context;
1251         struct intel_iommu *iommu = domain->iommu;
1252         unsigned long flags;
1253
1254         pr_debug("Set context mapping for %02x:%02x.%d\n",
1255                 bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
1256         BUG_ON(!domain->pgd);
1257         context = device_to_context_entry(iommu, bus, devfn);
1258         if (!context)
1259                 return -ENOMEM;
1260         spin_lock_irqsave(&iommu->lock, flags);
1261         if (context_present(*context)) {
1262                 spin_unlock_irqrestore(&iommu->lock, flags);
1263                 return 0;
1264         }
1265
1266         context_set_domain_id(*context, domain->id);
1267         context_set_address_width(*context, domain->agaw);
1268         context_set_address_root(*context, virt_to_phys(domain->pgd));
1269         context_set_translation_type(*context, CONTEXT_TT_MULTI_LEVEL);
1270         context_set_fault_enable(*context);
1271         context_set_present(*context);
1272         __iommu_flush_cache(iommu, context, sizeof(*context));
1273
1274         /* it's a non-present to present mapping */
1275         if (iommu_flush_context_device(iommu, domain->id,
1276                         (((u16)bus) << 8) | devfn, DMA_CCMD_MASK_NOBIT, 1))
1277                 iommu_flush_write_buffer(iommu);
1278         else
1279                 iommu_flush_iotlb_dsi(iommu, 0, 0);
1280         spin_unlock_irqrestore(&iommu->lock, flags);
1281         return 0;
1282 }
1283
1284 static int
1285 domain_context_mapping(struct dmar_domain *domain, struct pci_dev *pdev)
1286 {
1287         int ret;
1288         struct pci_dev *tmp, *parent;
1289
1290         ret = domain_context_mapping_one(domain, pdev->bus->number,
1291                 pdev->devfn);
1292         if (ret)
1293                 return ret;
1294
1295         /* dependent device mapping */
1296         tmp = pci_find_upstream_pcie_bridge(pdev);
1297         if (!tmp)
1298                 return 0;
1299         /* Secondary interface's bus number and devfn 0 */
1300         parent = pdev->bus->self;
1301         while (parent != tmp) {
1302                 ret = domain_context_mapping_one(domain, parent->bus->number,
1303                         parent->devfn);
1304                 if (ret)
1305                         return ret;
1306                 parent = parent->bus->self;
1307         }
1308         if (tmp->is_pcie) /* this is a PCIE-to-PCI bridge */
1309                 return domain_context_mapping_one(domain,
1310                         tmp->subordinate->number, 0);
1311         else /* this is a legacy PCI bridge */
1312                 return domain_context_mapping_one(domain,
1313                         tmp->bus->number, tmp->devfn);
1314 }
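/*
 * For a device behind a PCIe-to-PCI bridge, the bridges on the path up to
 * the topmost PCIe bridge get context entries too: DMA from conventional
 * PCI devices arrives tagged with the bridge's requester id (secondary
 * bus, devfn 0 for a PCIe-to-PCI bridge), so that id must map to the same
 * domain.
 */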
1315
1316 static int domain_context_mapped(struct dmar_domain *domain,
1317         struct pci_dev *pdev)
1318 {
1319         int ret;
1320         struct pci_dev *tmp, *parent;
1321
1322         ret = device_context_mapped(domain->iommu,
1323                 pdev->bus->number, pdev->devfn);
1324         if (!ret)
1325                 return ret;
1326         /* dependent device mapping */
1327         tmp = pci_find_upstream_pcie_bridge(pdev);
1328         if (!tmp)
1329                 return ret;
1330         /* Secondary interface's bus number and devfn 0 */
1331         parent = pdev->bus->self;
1332         while (parent != tmp) {
1333                 ret = device_context_mapped(domain->iommu, parent->bus->number,
1334                         parent->devfn);
1335                 if (!ret)
1336                         return ret;
1337                 parent = parent->bus->self;
1338         }
1339         if (tmp->is_pcie)
1340                 return device_context_mapped(domain->iommu,
1341                         tmp->subordinate->number, 0);
1342         else
1343                 return device_context_mapped(domain->iommu,
1344                         tmp->bus->number, tmp->devfn);
1345 }
1346
1347 static int
1348 domain_page_mapping(struct dmar_domain *domain, dma_addr_t iova,
1349                         u64 hpa, size_t size, int prot)
1350 {
1351         u64 start_pfn, end_pfn;
1352         struct dma_pte *pte;
1353         int index;
1354
1355         if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
1356                 return -EINVAL;
1357         iova &= PAGE_MASK_4K;
1358         start_pfn = ((u64)hpa) >> PAGE_SHIFT_4K;
1359         end_pfn = (PAGE_ALIGN_4K(((u64)hpa) + size)) >> PAGE_SHIFT_4K;
1360         index = 0;
1361         while (start_pfn < end_pfn) {
1362                 pte = addr_to_dma_pte(domain, iova + PAGE_SIZE_4K * index);
1363                 if (!pte)
1364                         return -ENOMEM;
1365                 /* We don't need lock here, nobody else
1366                  * touches the iova range
1367                  */
1368                 BUG_ON(dma_pte_addr(*pte));
1369                 dma_set_pte_addr(*pte, start_pfn << PAGE_SHIFT_4K);
1370                 dma_set_pte_prot(*pte, prot);
1371                 __iommu_flush_cache(domain->iommu, pte, sizeof(*pte));
1372                 start_pfn++;
1373                 index++;
1374         }
1375         return 0;
1376 }
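/*
 * domain_page_mapping() maps the physical range [hpa, hpa + size) to the
 * given IOVA one 4K page at a time.  The caller is expected to own the
 * IOVA range; the BUG_ON() catches an attempt to overwrite an already
 * present pte.
 */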
1377
1378 static void detach_domain_for_dev(struct dmar_domain *domain, u8 bus, u8 devfn)
1379 {
1380         clear_context_table(domain->iommu, bus, devfn);
1381         iommu_flush_context_global(domain->iommu, 0);
1382         iommu_flush_iotlb_global(domain->iommu, 0);
1383 }
1384
1385 static void domain_remove_dev_info(struct dmar_domain *domain)
1386 {
1387         struct device_domain_info *info;
1388         unsigned long flags;
1389
1390         spin_lock_irqsave(&device_domain_lock, flags);
1391         while (!list_empty(&domain->devices)) {
1392                 info = list_entry(domain->devices.next,
1393                         struct device_domain_info, link);
1394                 list_del(&info->link);
1395                 list_del(&info->global);
1396                 if (info->dev)
1397                         info->dev->dev.archdata.iommu = NULL;
1398                 spin_unlock_irqrestore(&device_domain_lock, flags);
1399
1400                 detach_domain_for_dev(info->domain, info->bus, info->devfn);
1401                 free_devinfo_mem(info);
1402
1403                 spin_lock_irqsave(&device_domain_lock, flags);
1404         }
1405         spin_unlock_irqrestore(&device_domain_lock, flags);
1406 }
1407
1408 /*
1409  * find_domain
1410  * Note: struct pci_dev->dev.archdata.iommu stores the info
1411  */
1412 struct dmar_domain *
1413 find_domain(struct pci_dev *pdev)
1414 {
1415         struct device_domain_info *info;
1416
1417         /* No lock here, assumes no domain exit in normal case */
1418         info = pdev->dev.archdata.iommu;
1419         if (info)
1420                 return info->domain;
1421         return NULL;
1422 }
1423
1424 static int dmar_pci_device_match(struct pci_dev *devices[], int cnt,
1425      struct pci_dev *dev)
1426 {
1427         int index;
1428
1429         while (dev) {
1430                 for (index = 0; index < cnt; index++)
1431                         if (dev == devices[index])
1432                                 return 1;
1433
1434                 /* Check our parent */
1435                 dev = dev->bus->self;
1436         }
1437
1438         return 0;
1439 }
1440
1441 static struct dmar_drhd_unit *
1442 dmar_find_matched_drhd_unit(struct pci_dev *dev)
1443 {
1444         struct dmar_drhd_unit *drhd = NULL;
1445
1446         list_for_each_entry(drhd, &dmar_drhd_units, list) {
1447                 if (drhd->include_all || dmar_pci_device_match(drhd->devices,
1448                                                 drhd->devices_cnt, dev))
1449                         return drhd;
1450         }
1451
1452         return NULL;
1453 }
1454
1455 /* domain is initialized */
1456 static struct dmar_domain *get_domain_for_dev(struct pci_dev *pdev, int gaw)
1457 {
1458         struct dmar_domain *domain, *found = NULL;
1459         struct intel_iommu *iommu;
1460         struct dmar_drhd_unit *drhd;
1461         struct device_domain_info *info, *tmp;
1462         struct pci_dev *dev_tmp;
1463         unsigned long flags;
1464         int bus = 0, devfn = 0;
1465
1466         domain = find_domain(pdev);
1467         if (domain)
1468                 return domain;
1469
1470         dev_tmp = pci_find_upstream_pcie_bridge(pdev);
1471         if (dev_tmp) {
1472                 if (dev_tmp->is_pcie) {
1473                         bus = dev_tmp->subordinate->number;
1474                         devfn = 0;
1475                 } else {
1476                         bus = dev_tmp->bus->number;
1477                         devfn = dev_tmp->devfn;
1478                 }
1479                 spin_lock_irqsave(&device_domain_lock, flags);
1480                 list_for_each_entry(info, &device_domain_list, global) {
1481                         if (info->bus == bus && info->devfn == devfn) {
1482                                 found = info->domain;
1483                                 break;
1484                         }
1485                 }
1486                 spin_unlock_irqrestore(&device_domain_lock, flags);
1487                 /* pcie-pci bridge already has a domain, use it */
1488                 if (found) {
1489                         domain = found;
1490                         goto found_domain;
1491                 }
1492         }
1493
1494         /* Allocate new domain for the device */
1495         drhd = dmar_find_matched_drhd_unit(pdev);
1496         if (!drhd) {
1497                 printk(KERN_ERR "IOMMU: can't find DMAR for device %s\n",
1498                         pci_name(pdev));
1499                 return NULL;
1500         }
1501         iommu = drhd->iommu;
1502
1503         domain = iommu_alloc_domain(iommu);
1504         if (!domain)
1505                 goto error;
1506
1507         if (domain_init(domain, gaw)) {
1508                 domain_exit(domain);
1509                 goto error;
1510         }
1511
1512         /* register pcie-to-pci device */
1513         if (dev_tmp) {
1514                 info = alloc_devinfo_mem();
1515                 if (!info) {
1516                         domain_exit(domain);
1517                         goto error;
1518                 }
1519                 info->bus = bus;
1520                 info->devfn = devfn;
1521                 info->dev = NULL;
1522                 info->domain = domain;
1523                 /* This domain is shared by devices under p2p bridge */
1524                 domain->flags |= DOMAIN_FLAG_MULTIPLE_DEVICES;
1525
1526                 /* pcie-to-pci bridge already has a domain, use it */
1527                 found = NULL;
1528                 spin_lock_irqsave(&device_domain_lock, flags);
1529                 list_for_each_entry(tmp, &device_domain_list, global) {
1530                         if (tmp->bus == bus && tmp->devfn == devfn) {
1531                                 found = tmp->domain;
1532                                 break;
1533                         }
1534                 }
1535                 if (found) {
1536                         free_devinfo_mem(info);
1537                         domain_exit(domain);
1538                         domain = found;
1539                 } else {
1540                         list_add(&info->link, &domain->devices);
1541                         list_add(&info->global, &device_domain_list);
1542                 }
1543                 spin_unlock_irqrestore(&device_domain_lock, flags);
1544         }
1545
1546 found_domain:
1547         info = alloc_devinfo_mem();
1548         if (!info)
1549                 goto error;
1550         info->bus = pdev->bus->number;
1551         info->devfn = pdev->devfn;
1552         info->dev = pdev;
1553         info->domain = domain;
1554         spin_lock_irqsave(&device_domain_lock, flags);
1555         /* someone else may have beaten us to it; recheck under the lock */
1556         found = find_domain(pdev);
1557         if (found != NULL) {
1558                 spin_unlock_irqrestore(&device_domain_lock, flags);
1559                 if (found != domain) {
1560                         domain_exit(domain);
1561                         domain = found;
1562                 }
1563                 free_devinfo_mem(info);
1564                 return domain;
1565         }
1566         list_add(&info->link, &domain->devices);
1567         list_add(&info->global, &device_domain_list);
1568         pdev->dev.archdata.iommu = info;
1569         spin_unlock_irqrestore(&device_domain_lock, flags);
1570         return domain;
1571 error:
1572         /* recheck here; another caller may have set up the domain already */
1573         return find_domain(pdev);
1574 }
1575
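/*
 * Set up a 1:1 (identity) mapping of the physical range [start, end) for
 * @pdev: reserve the matching iova range, clear any stale PTEs, install
 * read/write mappings and program the context entry.  Used for RMRR
 * regions and for the graphics/ISA workarounds below.
 */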
1576 static int iommu_prepare_identity_map(struct pci_dev *pdev, u64 start, u64 end)
1577 {
1578         struct dmar_domain *domain;
1579         unsigned long size;
1580         u64 base;
1581         int ret;
1582
1583         printk(KERN_INFO
1584                 "IOMMU: Setting identity map for device %s [0x%Lx - 0x%Lx]\n",
1585                 pci_name(pdev), start, end);
1586         /* page table init */
1587         domain = get_domain_for_dev(pdev, DEFAULT_DOMAIN_ADDRESS_WIDTH);
1588         if (!domain)
1589                 return -ENOMEM;
1590
1591         /* The address might not be aligned */
1592         base = start & PAGE_MASK_4K;
1593         size = end - base;
1594         size = PAGE_ALIGN_4K(size);
1595         if (!reserve_iova(&domain->iovad, IOVA_PFN(base),
1596                         IOVA_PFN(base + size) - 1)) {
1597                 printk(KERN_ERR "IOMMU: reserve iova failed\n");
1598                 ret = -ENOMEM;
1599                 goto error;
1600         }
1601
1602         pr_debug("Mapping reserved region %lx@%llx for %s\n",
1603                 size, base, pci_name(pdev));
1604         /*
1605          * The RMRR range might overlap the physical memory range;
1606          * clear any existing mapping first.
1607          */
1608         dma_pte_clear_range(domain, base, base + size);
1609
1610         ret = domain_page_mapping(domain, base, base, size,
1611                 DMA_PTE_READ|DMA_PTE_WRITE);
1612         if (ret)
1613                 goto error;
1614
1615         /* context entry init */
1616         ret = domain_context_mapping(domain, pdev);
1617         if (!ret)
1618                 return 0;
1619 error:
1620         domain_exit(domain);
1621         return ret;
1622
1623 }
1624
1625 static inline int iommu_prepare_rmrr_dev(struct dmar_rmrr_unit *rmrr,
1626         struct pci_dev *pdev)
1627 {
1628         if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
1629                 return 0;
1630         return iommu_prepare_identity_map(pdev, rmrr->base_address,
1631                 rmrr->end_address + 1);
1632 }
1633
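/*
 * Graphics workaround: when CONFIG_DMAR_GFX_WA is set, give every graphics
 * device that is not already bypassed a 1:1 mapping of each system RAM
 * range reported by arch_get_ram_range().
 */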
1634 #ifdef CONFIG_DMAR_GFX_WA
1635 extern int arch_get_ram_range(int slot, u64 *addr, u64 *size);
1636 static void __init iommu_prepare_gfx_mapping(void)
1637 {
1638         struct pci_dev *pdev = NULL;
1639         u64 base, size;
1640         int slot;
1641         int ret;
1642
1643         for_each_pci_dev(pdev) {
1644                 if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO ||
1645                                 !IS_GFX_DEVICE(pdev))
1646                         continue;
1647                 printk(KERN_INFO "IOMMU: gfx device %s 1-1 mapping\n",
1648                         pci_name(pdev));
1649                 slot = arch_get_ram_range(0, &base, &size);
1650                 while (slot >= 0) {
1651                         ret = iommu_prepare_identity_map(pdev,
1652                                         base, base + size);
1653                         if (ret)
1654                                 goto error;
1655                         slot = arch_get_ram_range(slot, &base, &size);
1656                 }
1657                 continue;
1658 error:
1659                 printk(KERN_ERR "IOMMU: mapping reserved region failed\n");
1660         }
1661 }
1662 #endif
1663
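/*
 * Floppy workaround: the ISA/LPC bridge (and the floppy controller behind
 * it) gets a 1:1 mapping of the first 16MB of memory so that legacy DMA
 * keeps working.
 */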
1664 #ifdef CONFIG_DMAR_FLOPPY_WA
1665 static inline void iommu_prepare_isa(void)
1666 {
1667         struct pci_dev *pdev;
1668         int ret;
1669
1670         pdev = pci_get_class(PCI_CLASS_BRIDGE_ISA << 8, NULL);
1671         if (!pdev)
1672                 return;
1673
1674         printk(KERN_INFO "IOMMU: Prepare 0-16M unity mapping for LPC\n");
1675         ret = iommu_prepare_identity_map(pdev, 0, 16*1024*1024);
1676
1677         if (ret)
1678                 printk(KERN_ERR "IOMMU: Failed to create 0-16M identity map, "
1679                         "floppy might not work\n");
1680
1681 }
1682 #else
1683 static inline void iommu_prepare_isa(void)
1684 {
1685         return;
1686 }
1687 #endif /* !CONFIG_DMAR_FLOPPY_WA */
1688
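/*
 * init_dmars - central IOMMU bring-up:
 *   1. allocate per-IOMMU state and a root entry for every active DRHD unit
 *   2. set up identity maps for the RMRR regions plus the gfx/ISA workarounds
 *   3. program the root entries, flush the context cache and IOTLB, and
 *      enable translation on each unit
 */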
1689 int __init init_dmars(void)
1690 {
1691         struct dmar_drhd_unit *drhd;
1692         struct dmar_rmrr_unit *rmrr;
1693         struct pci_dev *pdev;
1694         struct intel_iommu *iommu;
1695         int nlongs, i, ret, unit = 0;
1696
1697         /*
1698          * for each drhd
1699          *    allocate root
1700          *    initialize and program root entry to not present
1701          * endfor
1702          */
1703         for_each_drhd_unit(drhd) {
1704                 if (drhd->ignored)
1705                         continue;
1706                 g_num_of_iommus++;
1707                 /*
1708                  * No lock is needed: this counter is only incremented in
1709                  * the single-threaded kernel __init code path; all other
1710                  * accesses are read-only.
1711                  */
1712         }
1713
1714         nlongs = BITS_TO_LONGS(g_num_of_iommus);
1715         g_iommus_to_flush = kzalloc(nlongs * sizeof(unsigned long), GFP_KERNEL);
1716         if (!g_iommus_to_flush) {
1717                 printk(KERN_ERR "Intel-IOMMU: "
1718                         "Allocating bitmap array failed\n");
1719                 return -ENOMEM;
1720         }
1721
1722         g_iommus = kzalloc(g_num_of_iommus * sizeof(*iommu), GFP_KERNEL);
1723         if (!g_iommus) {
1724                 kfree(g_iommus_to_flush);
1725                 ret = -ENOMEM;
1726                 goto error;
1727         }
1728
1729         i = 0;
1730         for_each_drhd_unit(drhd) {
1731                 if (drhd->ignored)
1732                         continue;
1733                 iommu = alloc_iommu(&g_iommus[i], drhd);
1734                 i++;
1735                 if (!iommu) {
1736                         ret = -ENOMEM;
1737                         goto error;
1738                 }
1739
1740                 /*
1741                  * TBD:
1742                  * we could share the same root & context tables
1743                  * among all IOMMUs.  This needs to be revisited later.
1744                  */
1745                 ret = iommu_alloc_root_entry(iommu);
1746                 if (ret) {
1747                         printk(KERN_ERR "IOMMU: allocate root entry failed\n");
1748                         goto error;
1749                 }
1750         }
1751
1752         /*
1753          * For each rmrr
1754          *   for each dev attached to rmrr
1755          *   do
1756          *     locate drhd for dev, alloc domain for dev
1757          *     allocate free domain
1758          *     allocate page table entries for rmrr
1759          *     if context not allocated for bus
1760          *           allocate and init context
1761          *           set present in root table for this bus
1762          *     init context with domain, translation etc
1763          *    endfor
1764          * endfor
1765          */
1766         for_each_rmrr_units(rmrr) {
1767                 for (i = 0; i < rmrr->devices_cnt; i++) {
1768                         pdev = rmrr->devices[i];
1769                         /* some BIOSes list non-existent devices in the DMAR table */
1770                         if (!pdev)
1771                                 continue;
1772                         ret = iommu_prepare_rmrr_dev(rmrr, pdev);
1773                         if (ret)
1774                                 printk(KERN_ERR
1775                                  "IOMMU: mapping reserved region failed\n");
1776                 }
1777         }
1778
1779         iommu_prepare_gfx_mapping();
1780
1781         iommu_prepare_isa();
1782
1783         /*
1784          * for each drhd
1785          *   enable fault log
1786          *   global invalidate context cache
1787          *   global invalidate iotlb
1788          *   enable translation
1789          */
1790         for_each_drhd_unit(drhd) {
1791                 if (drhd->ignored)
1792                         continue;
1793                 iommu = drhd->iommu;
1794                 sprintf(iommu->name, "dmar%d", unit++);
1795
1796                 iommu_flush_write_buffer(iommu);
1797
1798                 ret = dmar_set_interrupt(iommu);
1799                 if (ret)
1800                         goto error;
1801
1802                 iommu_set_root_entry(iommu);
1803
1804                 iommu_flush_context_global(iommu, 0);
1805                 iommu_flush_iotlb_global(iommu, 0);
1806
1807                 iommu_disable_protect_mem_regions(iommu);
1808
1809                 ret = iommu_enable_translation(iommu);
1810                 if (ret)
1811                         goto error;
1812         }
1813
1814         return 0;
1815 error:
1816         for_each_drhd_unit(drhd) {
1817                 if (drhd->ignored)
1818                         continue;
1819                 iommu = drhd->iommu;
1820                 free_iommu(iommu);
1821         }
1822         kfree(g_iommus);
             kfree(g_iommus_to_flush);
1823         return ret;
1824 }
1825
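/*
 * Round a DMA request up to whole 4K pages: add the page offset of
 * host_addr to the requested size and align the sum up to the next 4K
 * boundary, so a buffer that straddles page boundaries is counted as
 * every page it touches.
 */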
1826 static inline u64 aligned_size(u64 host_addr, size_t size)
1827 {
1828         u64 addr;
1829         addr = (host_addr & (~PAGE_MASK_4K)) + size;
1830         return PAGE_ALIGN_4K(addr);
1831 }
1832
1833 struct iova *
1834 iommu_alloc_iova(struct dmar_domain *domain, size_t size, u64 end)
1835 {
1836         struct iova *piova;
1837
1838         /* Make sure it's in range */
1839         end = min_t(u64, DOMAIN_MAX_ADDR(domain->gaw), end);
1840         if (!size || (IOVA_START_ADDR + size > end))
1841                 return NULL;
1842
1843         piova = alloc_iova(&domain->iovad,
1844                         size >> PAGE_SHIFT_4K, IOVA_PFN(end), 1);
1845         return piova;
1846 }
1847
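/*
 * Allocate an iova range for @dev.  Unless "forcedac" is set or the device
 * is limited to 32-bit DMA, try the 32-bit space first and only fall back
 * to the device's full dma_mask if that fails.
 */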
1848 static struct iova *
1849 __intel_alloc_iova(struct device *dev, struct dmar_domain *domain,
1850                 size_t size)
1851 {
1852         struct pci_dev *pdev = to_pci_dev(dev);
1853         struct iova *iova = NULL;
1854
1855         if ((pdev->dma_mask <= DMA_32BIT_MASK) || (dmar_forcedac)) {
1856                 iova = iommu_alloc_iova(domain, size, pdev->dma_mask);
1857         } else  {
1858                 /*
1859                  * First try to allocate an io virtual address in
1860                  * DMA_32BIT_MASK and if that fails then try allocating
1861                  * from higher range
1862                  */
1863                 iova = iommu_alloc_iova(domain, size, DMA_32BIT_MASK);
1864                 if (!iova)
1865                         iova = iommu_alloc_iova(domain, size, pdev->dma_mask);
1866         }
1867
1868         if (!iova) {
1869                 printk(KERN_ERR "Allocating iova for %s failed\n", pci_name(pdev));
1870                 return NULL;
1871         }
1872
1873         return iova;
1874 }
1875
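/*
 * Look up (or create) the domain for @pdev and make sure its context entry
 * has been programmed before any DMA mapping is installed.
 */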
1876 static struct dmar_domain *
1877 get_valid_domain_for_dev(struct pci_dev *pdev)
1878 {
1879         struct dmar_domain *domain;
1880         int ret;
1881
1882         domain = get_domain_for_dev(pdev,
1883                         DEFAULT_DOMAIN_ADDRESS_WIDTH);
1884         if (!domain) {
1885                 printk(KERN_ERR
1886                         "Allocating domain for %s failed\n", pci_name(pdev));
1887                 return NULL;
1888         }
1889
1890         /* make sure context mapping is ok */
1891         if (unlikely(!domain_context_mapped(domain, pdev))) {
1892                 ret = domain_context_mapping(domain, pdev);
1893                 if (ret) {
1894                         printk(KERN_ERR
1895                                 "Domain context map for %s failed\n",
1896                                 pci_name(pdev));
1897                         return NULL;
1898                 }
1899         }
1900
1901         return domain;
1902 }
1903
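/*
 * Map a single buffer for DMA: translate the kernel virtual address to a
 * physical one, allocate an iova range of the aligned size, install the
 * PTEs with protection bits derived from the DMA direction, then flush the
 * IOTLB for the new range, falling back to a write-buffer flush when the
 * IOTLB flush is skipped.
 */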
1904 static dma_addr_t intel_map_single(struct device *hwdev, void *addr,
1905         size_t size, int dir)
1906 {
1907         struct pci_dev *pdev = to_pci_dev(hwdev);
1908         int ret;
1909         struct dmar_domain *domain;
1910         unsigned long start_addr;
1911         struct iova *iova;
1912         int prot = 0;
1913
1914         BUG_ON(dir == DMA_NONE);
1915         if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
1916                 return virt_to_bus(addr);
1917
1918         domain = get_valid_domain_for_dev(pdev);
1919         if (!domain)
1920                 return 0;
1921
1922         addr = (void *)virt_to_phys(addr);
1923         size = aligned_size((u64)addr, size);
1924
1925         iova = __intel_alloc_iova(hwdev, domain, size);
1926         if (!iova)
1927                 goto error;
1928
1929         start_addr = iova->pfn_lo << PAGE_SHIFT_4K;
1930
1931         /*
1932          * Check if DMAR supports zero-length reads on write-only
1933          * mappings.
1934          */
1935         if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL ||
1936                         !cap_zlr(domain->iommu->cap))
1937                 prot |= DMA_PTE_READ;
1938         if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
1939                 prot |= DMA_PTE_WRITE;
1940         /*
1941          * addr - (addr + size) might be a partial page; we should map the
1942          * whole page.  Note: if two parts of one page are mapped separately,
1943          * we might end up with two guest addresses mapping to the same host
1944          * address, but this is not a big problem.
1945          */
1946         ret = domain_page_mapping(domain, start_addr,
1947                 ((u64)addr) & PAGE_MASK_4K, size, prot);
1948         if (ret)
1949                 goto error;
1950
1951         pr_debug("Device %s request: %lx@%llx mapping: %lx@%llx, dir %d\n",
1952                 pci_name(pdev), size, (u64)addr,
1953                 size, (u64)start_addr, dir);
1954
1955         /* it's a non-present to present mapping */
1956         ret = iommu_flush_iotlb_psi(domain->iommu, domain->id,
1957                         start_addr, size >> PAGE_SHIFT_4K, 1);
1958         if (ret)
1959                 iommu_flush_write_buffer(domain->iommu);
1960
1961         return (start_addr + ((u64)addr & (~PAGE_MASK_4K)));
1962
1963 error:
1964         if (iova)
1965                 __free_iova(&domain->iovad, iova);
1966         printk(KERN_ERR "Device %s request: %lx@%llx dir %d --- failed\n",
1967                 pci_name(pdev), size, (u64)addr, dir);
1968         return 0;
1969 }
1970
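/*
 * Deferred-unmap path: walk the per-IOMMU "needs flush" bitmap, issue one
 * global IOTLB flush per dirty IOMMU, then release every iova queued by
 * add_unmap() below.  Runs from the unmap_timer or when the queue grows
 * past high_watermark.
 */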
1971 static void flush_unmaps(void)
1972 {
1973         struct iova *node, *n;
1974         unsigned long flags;
1975         int i;
1976
1977         spin_lock_irqsave(&async_umap_flush_lock, flags);
1978         timer_on = 0;
1979
1980         /* just flush them all */
1981         for (i = 0; i < g_num_of_iommus; i++) {
1982                 if (test_and_clear_bit(i, g_iommus_to_flush))
1983                         iommu_flush_iotlb_global(&g_iommus[i], 0);
1984         }
1985
1986         list_for_each_entry_safe(node, n, &unmaps_to_do, list) {
1987                 /* free iova */
1988                 list_del(&node->list);
1989                 __free_iova(&((struct dmar_domain *)node->dmar)->iovad, node);
1990
1991         }
1992         list_size = 0;
1993         spin_unlock_irqrestore(&async_umap_flush_lock, flags);
1994 }
1995
1996 static void flush_unmaps_timeout(unsigned long data)
1997 {
1998         flush_unmaps();
1999 }
2000
2001 static void add_unmap(struct dmar_domain *dom, struct iova *iova)
2002 {
2003         unsigned long flags;
2004
2005         spin_lock_irqsave(&async_umap_flush_lock, flags);
2006         iova->dmar = dom;
2007         list_add(&iova->list, &unmaps_to_do);
2008         set_bit((dom->iommu - g_iommus), g_iommus_to_flush);
2009
2010         if (!timer_on) {
2011                 mod_timer(&unmap_timer, jiffies + msecs_to_jiffies(10));
2012                 timer_on = 1;
2013         }
2014         list_size++;
2015         spin_unlock_irqrestore(&async_umap_flush_lock, flags);
2016 }
2017
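/*
 * Tear down a single-buffer mapping: clear the PTEs and free the page
 * tables covering the iova range.  In intel_iommu_strict mode the IOTLB is
 * flushed and the iova released immediately; otherwise the iova is queued
 * for the batched flush above.
 */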
2018 static void intel_unmap_single(struct device *dev, dma_addr_t dev_addr,
2019         size_t size, int dir)
2020 {
2021         struct pci_dev *pdev = to_pci_dev(dev);
2022         struct dmar_domain *domain;
2023         unsigned long start_addr;
2024         struct iova *iova;
2025
2026         if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
2027                 return;
2028         domain = find_domain(pdev);
2029         BUG_ON(!domain);
2030
2031         iova = find_iova(&domain->iovad, IOVA_PFN(dev_addr));
2032         if (!iova)
2033                 return;
2034
2035         start_addr = iova->pfn_lo << PAGE_SHIFT_4K;
2036         size = aligned_size((u64)dev_addr, size);
2037
2038         pr_debug("Device %s unmapping: %lx@%llx\n",
2039                 pci_name(pdev), size, (u64)start_addr);
2040
2041         /*  clear the whole page */
2042         dma_pte_clear_range(domain, start_addr, start_addr + size);
2043         /* free page tables */
2044         dma_pte_free_pagetable(domain, start_addr, start_addr + size);
2045         if (intel_iommu_strict) {
2046                 if (iommu_flush_iotlb_psi(domain->iommu,
2047                         domain->id, start_addr, size >> PAGE_SHIFT_4K, 0))
2048                         iommu_flush_write_buffer(domain->iommu);
2049                 /* free iova */
2050                 __free_iova(&domain->iovad, iova);
2051         } else {
2052                 add_unmap(domain, iova);
2053                 /*
2054                  * Queue up the release of the mapping to save the ~1/6th of
2055                  * the CPU time otherwise spent on the iotlb flush operation.
2056                  */
2057                 if (list_size > high_watermark)
2058                         flush_unmaps();
2059         }
2060 }
2061
2062 static void *intel_alloc_coherent(struct device *hwdev, size_t size,
2063                        dma_addr_t *dma_handle, gfp_t flags)
2064 {
2065         void *vaddr;
2066         int order;
2067
2068         size = PAGE_ALIGN_4K(size);
2069         order = get_order(size);
2070         flags &= ~(GFP_DMA | GFP_DMA32);
2071
2072         vaddr = (void *)__get_free_pages(flags, order);
2073         if (!vaddr)
2074                 return NULL;
2075         memset(vaddr, 0, size);
2076
2077         *dma_handle = intel_map_single(hwdev, vaddr, size, DMA_BIDIRECTIONAL);
2078         if (*dma_handle)
2079                 return vaddr;
2080         free_pages((unsigned long)vaddr, order);
2081         return NULL;
2082 }
2083
2084 static void intel_free_coherent(struct device *hwdev, size_t size,
2085         void *vaddr, dma_addr_t dma_handle)
2086 {
2087         int order;
2088
2089         size = PAGE_ALIGN_4K(size);
2090         order = get_order(size);
2091
2092         intel_unmap_single(hwdev, dma_handle, size, DMA_BIDIRECTIONAL);
2093         free_pages((unsigned long)vaddr, order);
2094 }
2095
2096 #define SG_ENT_VIRT_ADDRESS(sg) (sg_virt((sg)))
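/*
 * Scatter-gather teardown: recompute the total aligned length of the list,
 * clear the PTEs and page tables for the whole range, flush the IOTLB and
 * release the iova that intel_map_sg() allocated.
 */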
2097 static void intel_unmap_sg(struct device *hwdev, struct scatterlist *sglist,
2098         int nelems, int dir)
2099 {
2100         int i;
2101         struct pci_dev *pdev = to_pci_dev(hwdev);
2102         struct dmar_domain *domain;
2103         unsigned long start_addr;
2104         struct iova *iova;
2105         size_t size = 0;
2106         void *addr;
2107         struct scatterlist *sg;
2108
2109         if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
2110                 return;
2111
2112         domain = find_domain(pdev);
2113
2114         iova = find_iova(&domain->iovad, IOVA_PFN(sglist[0].dma_address));
2115         if (!iova)
2116                 return;
2117         for_each_sg(sglist, sg, nelems, i) {
2118                 addr = SG_ENT_VIRT_ADDRESS(sg);
2119                 size += aligned_size((u64)addr, sg->length);
2120         }
2121
2122         start_addr = iova->pfn_lo << PAGE_SHIFT_4K;
2123
2124         /*  clear the whole page */
2125         dma_pte_clear_range(domain, start_addr, start_addr + size);
2126         /* free page tables */
2127         dma_pte_free_pagetable(domain, start_addr, start_addr + size);
2128
2129         if (iommu_flush_iotlb_psi(domain->iommu, domain->id, start_addr,
2130                         size >> PAGE_SHIFT_4K, 0))
2131                 iommu_flush_write_buffer(domain->iommu);
2132
2133         /* free iova */
2134         __free_iova(&domain->iovad, iova);
2135 }
2136
2137 static int intel_nontranslate_map_sg(struct device *hwdev,
2138         struct scatterlist *sglist, int nelems, int dir)
2139 {
2140         int i;
2141         struct scatterlist *sg;
2142
2143         for_each_sg(sglist, sg, nelems, i) {
2144                 BUG_ON(!sg_page(sg));
2145                 sg->dma_address = virt_to_bus(SG_ENT_VIRT_ADDRESS(sg));
2146                 sg->dma_length = sg->length;
2147         }
2148         return nelems;
2149 }
2150
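/*
 * Scatter-gather mapping: the whole list is packed into one contiguous iova
 * range.  The first pass sums the aligned length of every element to size
 * the allocation; the second pass maps each element at its running offset
 * and records the resulting bus address in sg->dma_address.
 */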
2151 static int intel_map_sg(struct device *hwdev, struct scatterlist *sglist,
2152                                 int nelems, int dir)
2153 {
2154         void *addr;
2155         int i;
2156         struct pci_dev *pdev = to_pci_dev(hwdev);
2157         struct dmar_domain *domain;
2158         size_t size = 0;
2159         int prot = 0;
2160         size_t offset = 0;
2161         struct iova *iova = NULL;
2162         int ret;
2163         struct scatterlist *sg;
2164         unsigned long start_addr;
2165
2166         BUG_ON(dir == DMA_NONE);
2167         if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
2168                 return intel_nontranslate_map_sg(hwdev, sglist, nelems, dir);
2169
2170         domain = get_valid_domain_for_dev(pdev);
2171         if (!domain)
2172                 return 0;
2173
2174         for_each_sg(sglist, sg, nelems, i) {
2175                 addr = SG_ENT_VIRT_ADDRESS(sg);
2176                 addr = (void *)virt_to_phys(addr);
2177                 size += aligned_size((u64)addr, sg->length);
2178         }
2179
2180         iova = __intel_alloc_iova(hwdev, domain, size);
2181         if (!iova) {
2182                 sglist->dma_length = 0;
2183                 return 0;
2184         }
2185
2186         /*
2187          * Check if DMAR supports zero-length reads on write-only
2188          * mappings.
2189          */
2190         if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL ||
2191                         !cap_zlr(domain->iommu->cap))
2192                 prot |= DMA_PTE_READ;
2193         if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
2194                 prot |= DMA_PTE_WRITE;
2195
2196         start_addr = iova->pfn_lo << PAGE_SHIFT_4K;
2197         offset = 0;
2198         for_each_sg(sglist, sg, nelems, i) {
2199                 addr = SG_ENT_VIRT_ADDRESS(sg);
2200                 addr = (void *)virt_to_phys(addr);
2201                 size = aligned_size((u64)addr, sg->length);
2202                 ret = domain_page_mapping(domain, start_addr + offset,
2203                         ((u64)addr) & PAGE_MASK_4K,
2204                         size, prot);
2205                 if (ret) {
2206                         /*  clear the page */
2207                         dma_pte_clear_range(domain, start_addr,
2208                                   start_addr + offset);
2209                         /* free page tables */
2210                         dma_pte_free_pagetable(domain, start_addr,
2211                                   start_addr + offset);
2212                         /* free iova */
2213                         __free_iova(&domain->iovad, iova);
2214                         return 0;
2215                 }
2216                 sg->dma_address = start_addr + offset +
2217                                 ((u64)addr & (~PAGE_MASK_4K));
2218                 sg->dma_length = sg->length;
2219                 offset += size;
2220         }
2221
2222         /* it's a non-present to present mapping */
2223         if (iommu_flush_iotlb_psi(domain->iommu, domain->id,
2224                         start_addr, offset >> PAGE_SHIFT_4K, 1))
2225                 iommu_flush_write_buffer(domain->iommu);
2226         return nelems;
2227 }
2228
2229 static struct dma_mapping_ops intel_dma_ops = {
2230         .alloc_coherent = intel_alloc_coherent,
2231         .free_coherent = intel_free_coherent,
2232         .map_single = intel_map_single,
2233         .unmap_single = intel_unmap_single,
2234         .map_sg = intel_map_sg,
2235         .unmap_sg = intel_unmap_sg,
2236 };
2237
2238 static inline int iommu_domain_cache_init(void)
2239 {
2240         int ret = 0;
2241
2242         iommu_domain_cache = kmem_cache_create("iommu_domain",
2243                                          sizeof(struct dmar_domain),
2244                                          0,
2245                                          SLAB_HWCACHE_ALIGN,
2247                                          NULL);
2248         if (!iommu_domain_cache) {
2249                 printk(KERN_ERR "Couldn't create iommu_domain cache\n");
2250                 ret = -ENOMEM;
2251         }
2252
2253         return ret;
2254 }
2255
2256 static inline int iommu_devinfo_cache_init(void)
2257 {
2258         int ret = 0;
2259
2260         iommu_devinfo_cache = kmem_cache_create("iommu_devinfo",
2261                                          sizeof(struct device_domain_info),
2262                                          0,
2263                                          SLAB_HWCACHE_ALIGN,
2265                                          NULL);
2266         if (!iommu_devinfo_cache) {
2267                 printk(KERN_ERR "Couldn't create devinfo cache\n");
2268                 ret = -ENOMEM;
2269         }
2270
2271         return ret;
2272 }
2273
2274 static inline int iommu_iova_cache_init(void)
2275 {
2276         int ret = 0;
2277
2278         iommu_iova_cache = kmem_cache_create("iommu_iova",
2279                                          sizeof(struct iova),
2280                                          0,
2281                                          SLAB_HWCACHE_ALIGN,
2283                                          NULL);
2284         if (!iommu_iova_cache) {
2285                 printk(KERN_ERR "Couldn't create iova cache\n");
2286                 ret = -ENOMEM;
2287         }
2288
2289         return ret;
2290 }
2291
2292 static int __init iommu_init_mempool(void)
2293 {
2294         int ret;
2295         ret = iommu_iova_cache_init();
2296         if (ret)
2297                 return ret;
2298
2299         ret = iommu_domain_cache_init();
2300         if (ret)
2301                 goto domain_error;
2302
2303         ret = iommu_devinfo_cache_init();
2304         if (!ret)
2305                 return ret;
2306
2307         kmem_cache_destroy(iommu_domain_cache);
2308 domain_error:
2309         kmem_cache_destroy(iommu_iova_cache);
2310
2311         return -ENOMEM;
2312 }
2313
2314 static void __init iommu_exit_mempool(void)
2315 {
2316         kmem_cache_destroy(iommu_devinfo_cache);
2317         kmem_cache_destroy(iommu_domain_cache);
2318         kmem_cache_destroy(iommu_iova_cache);
2319
2320 }
2321
2322 void __init detect_intel_iommu(void)
2323 {
2324         if (swiotlb || no_iommu || iommu_detected || dmar_disabled)
2325                 return;
2326         if (early_dmar_detect()) {
2327                 iommu_detected = 1;
2328         }
2329 }
2330
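/*
 * Mark DRHD units that can be skipped: units whose device list contains no
 * existing PCI devices are ignored outright, and when dmar_map_gfx is clear
 * units that serve only graphics devices are ignored and their devices are
 * flagged with DUMMY_DEVICE_DOMAIN_INFO so the DMA ops bypass translation
 * for them.
 */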
2331 static void __init init_no_remapping_devices(void)
2332 {
2333         struct dmar_drhd_unit *drhd;
2334
2335         for_each_drhd_unit(drhd) {
2336                 if (!drhd->include_all) {
2337                         int i;
2338                         for (i = 0; i < drhd->devices_cnt; i++)
2339                                 if (drhd->devices[i] != NULL)
2340                                         break;
2341                         /* ignore DMAR unit if no pci devices exist */
2342                         if (i == drhd->devices_cnt)
2343                                 drhd->ignored = 1;
2344                 }
2345         }
2346
2347         if (dmar_map_gfx)
2348                 return;
2349
2350         for_each_drhd_unit(drhd) {
2351                 int i;
2352                 if (drhd->ignored || drhd->include_all)
2353                         continue;
2354
2355                 for (i = 0; i < drhd->devices_cnt; i++)
2356                         if (drhd->devices[i] &&
2357                                 !IS_GFX_DEVICE(drhd->devices[i]))
2358                                 break;
2359
2360                 if (i < drhd->devices_cnt)
2361                         continue;
2362
2363                 /* bypass IOMMU if it is just for gfx devices */
2364                 drhd->ignored = 1;
2365                 for (i = 0; i < drhd->devices_cnt; i++) {
2366                         if (!drhd->devices[i])
2367                                 continue;
2368                         drhd->devices[i]->dev.archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
2369                 }
2370         }
2371 }
2372
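/*
 * Top-level initialization: bail out if another IOMMU path is active or
 * DMAR is disabled, parse the DMAR table, set up the slab caches, reserved
 * iova ranges and the high_watermark debugfs knob, bring the hardware up
 * via init_dmars(), and install intel_dma_ops as the system-wide dma_ops.
 */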
2373 int __init intel_iommu_init(void)
2374 {
2375         int ret = 0;
2376
2377         if (no_iommu || swiotlb || dmar_disabled)
2378                 return -ENODEV;
2379
2380         if (dmar_table_init())
2381                 return -ENODEV;
2382
2383         high_watermark = 250;
2384         intel_iommu_debug = debugfs_create_dir("intel_iommu", NULL);
2385         debug = debugfs_create_u32("high_watermark", S_IWUGO | S_IRUGO,
2386                                         intel_iommu_debug, &high_watermark);
2387         iommu_init_mempool();
2388         dmar_init_reserved_ranges();
2389
2390         init_no_remapping_devices();
2391
2392         ret = init_dmars();
2393         if (ret) {
2394                 printk(KERN_ERR "IOMMU: dmar init failed\n");
2395                 put_iova_domain(&reserved_iova_list);
2396                 iommu_exit_mempool();
2397                 return ret;
2398         }
2399         printk(KERN_INFO
2400         "PCI-DMA: Intel(R) Virtualization Technology for Directed I/O\n");
2401
2402         init_timer(&unmap_timer);
2403         force_iommu = 1;
2404         dma_ops = &intel_dma_ops;
2405         return 0;
2406 }
2407