/* fs/btrfs/extent-tree.c */
/*
 * Copyright (C) 2007 Oracle.  All rights reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public
 * License v2 as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public
 * License along with this program; if not, write to the
 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
 * Boston, MA 02111-1307, USA.
 */
#include <linux/sched.h>
#include <linux/pagemap.h>
#include <linux/writeback.h>
#include <linux/blkdev.h>
#include <linux/sort.h>
#include <linux/rcupdate.h>
#include <linux/kthread.h>
#include <linux/slab.h>
#include "compat.h"
#include "hash.h"
#include "ctree.h"
#include "disk-io.h"
#include "print-tree.h"
#include "transaction.h"
#include "volumes.h"
#include "locking.h"
#include "free-space-cache.h"

static int update_block_group(struct btrfs_trans_handle *trans,
                              struct btrfs_root *root,
                              u64 bytenr, u64 num_bytes, int alloc);
static int update_reserved_bytes(struct btrfs_block_group_cache *cache,
                                 u64 num_bytes, int reserve, int sinfo);
static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
                               struct btrfs_root *root,
                               u64 bytenr, u64 num_bytes, u64 parent,
                               u64 root_objectid, u64 owner_objectid,
                               u64 owner_offset, int refs_to_drop,
                               struct btrfs_delayed_extent_op *extra_op);
static void __run_delayed_extent_op(struct btrfs_delayed_extent_op *extent_op,
                                    struct extent_buffer *leaf,
                                    struct btrfs_extent_item *ei);
static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
                                      struct btrfs_root *root,
                                      u64 parent, u64 root_objectid,
                                      u64 flags, u64 owner, u64 offset,
                                      struct btrfs_key *ins, int ref_mod);
static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
                                     struct btrfs_root *root,
                                     u64 parent, u64 root_objectid,
                                     u64 flags, struct btrfs_disk_key *key,
                                     int level, struct btrfs_key *ins);
static int do_chunk_alloc(struct btrfs_trans_handle *trans,
                          struct btrfs_root *extent_root, u64 alloc_bytes,
                          u64 flags, int force);
static int find_next_key(struct btrfs_path *path, int level,
                         struct btrfs_key *key);
static void dump_space_info(struct btrfs_space_info *info, u64 bytes,
                            int dump_block_groups);
static int maybe_allocate_chunk(struct btrfs_trans_handle *trans,
                                struct btrfs_root *root,
                                struct btrfs_space_info *sinfo, u64 num_bytes);
static int shrink_delalloc(struct btrfs_trans_handle *trans,
                           struct btrfs_root *root,
                           struct btrfs_space_info *sinfo, u64 to_reclaim);

static noinline int
block_group_cache_done(struct btrfs_block_group_cache *cache)
{
        smp_mb();
        return cache->cached == BTRFS_CACHE_FINISHED;
}

static int block_group_bits(struct btrfs_block_group_cache *cache, u64 bits)
{
        return (cache->flags & bits) == bits;
}

void btrfs_get_block_group(struct btrfs_block_group_cache *cache)
{
        atomic_inc(&cache->count);
}

void btrfs_put_block_group(struct btrfs_block_group_cache *cache)
{
        if (atomic_dec_and_test(&cache->count)) {
                WARN_ON(cache->pinned > 0);
                WARN_ON(cache->reserved > 0);
                WARN_ON(cache->reserved_pinned > 0);
                kfree(cache);
        }
}
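
/*
 * Illustrative usage (editor's sketch, not part of the original file):
 * every lookup that returns a block group takes a reference via
 * btrfs_get_block_group(), so callers must pair it with a put:
 *
 *      struct btrfs_block_group_cache *cache;
 *
 *      cache = btrfs_lookup_block_group(fs_info, bytenr);
 *      if (cache) {
 *              ... use cache->key, cache->flags, etc ...
 *              btrfs_put_block_group(cache);
 *      }
 */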

/*
 * this adds the block group to the fs_info rb tree for the block group
 * cache
 */
static int btrfs_add_block_group_cache(struct btrfs_fs_info *info,
                                struct btrfs_block_group_cache *block_group)
{
        struct rb_node **p;
        struct rb_node *parent = NULL;
        struct btrfs_block_group_cache *cache;

        spin_lock(&info->block_group_cache_lock);
        p = &info->block_group_cache_tree.rb_node;

        while (*p) {
                parent = *p;
                cache = rb_entry(parent, struct btrfs_block_group_cache,
                                 cache_node);
                if (block_group->key.objectid < cache->key.objectid) {
                        p = &(*p)->rb_left;
                } else if (block_group->key.objectid > cache->key.objectid) {
                        p = &(*p)->rb_right;
                } else {
                        spin_unlock(&info->block_group_cache_lock);
                        return -EEXIST;
                }
        }

        rb_link_node(&block_group->cache_node, parent, p);
        rb_insert_color(&block_group->cache_node,
                        &info->block_group_cache_tree);
        spin_unlock(&info->block_group_cache_lock);

        return 0;
}

/*
 * This will return the block group at or after bytenr if contains is 0, else
 * it will return the block group that contains the bytenr
 */
static struct btrfs_block_group_cache *
block_group_cache_tree_search(struct btrfs_fs_info *info, u64 bytenr,
                              int contains)
{
        struct btrfs_block_group_cache *cache, *ret = NULL;
        struct rb_node *n;
        u64 end, start;

        spin_lock(&info->block_group_cache_lock);
        n = info->block_group_cache_tree.rb_node;

        while (n) {
                cache = rb_entry(n, struct btrfs_block_group_cache,
                                 cache_node);
                end = cache->key.objectid + cache->key.offset - 1;
                start = cache->key.objectid;

                if (bytenr < start) {
                        if (!contains && (!ret || start < ret->key.objectid))
                                ret = cache;
                        n = n->rb_left;
                } else if (bytenr > start) {
                        if (contains && bytenr <= end) {
                                ret = cache;
                                break;
                        }
                        n = n->rb_right;
                } else {
                        ret = cache;
                        break;
                }
        }
        if (ret)
                btrfs_get_block_group(ret);
        spin_unlock(&info->block_group_cache_lock);

        return ret;
}

static int add_excluded_extent(struct btrfs_root *root,
                               u64 start, u64 num_bytes)
{
        u64 end = start + num_bytes - 1;
        set_extent_bits(&root->fs_info->freed_extents[0],
                        start, end, EXTENT_UPTODATE, GFP_NOFS);
        set_extent_bits(&root->fs_info->freed_extents[1],
                        start, end, EXTENT_UPTODATE, GFP_NOFS);
        return 0;
}

static void free_excluded_extents(struct btrfs_root *root,
                                  struct btrfs_block_group_cache *cache)
{
        u64 start, end;

        start = cache->key.objectid;
        end = start + cache->key.offset - 1;

        clear_extent_bits(&root->fs_info->freed_extents[0],
                          start, end, EXTENT_UPTODATE, GFP_NOFS);
        clear_extent_bits(&root->fs_info->freed_extents[1],
                          start, end, EXTENT_UPTODATE, GFP_NOFS);
}

static int exclude_super_stripes(struct btrfs_root *root,
                                 struct btrfs_block_group_cache *cache)
{
        u64 bytenr;
        u64 *logical;
        int stripe_len;
        int i, nr, ret;

        if (cache->key.objectid < BTRFS_SUPER_INFO_OFFSET) {
                stripe_len = BTRFS_SUPER_INFO_OFFSET - cache->key.objectid;
                cache->bytes_super += stripe_len;
                ret = add_excluded_extent(root, cache->key.objectid,
                                          stripe_len);
                BUG_ON(ret);
        }

        for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
                bytenr = btrfs_sb_offset(i);
                ret = btrfs_rmap_block(&root->fs_info->mapping_tree,
                                       cache->key.objectid, bytenr,
                                       0, &logical, &nr, &stripe_len);
                BUG_ON(ret);

                while (nr--) {
                        cache->bytes_super += stripe_len;
                        ret = add_excluded_extent(root, logical[nr],
                                                  stripe_len);
                        BUG_ON(ret);
                }

                kfree(logical);
        }
        return 0;
}
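
/*
 * Editor's note: btrfs keeps superblock copies at fixed byte offsets
 * (64K, 64M and 256G with BTRFS_SUPER_MIRROR_MAX == 3), so any block
 * group overlapping one of those stripes must exclude it from
 * allocation; the excluded space is accounted in cache->bytes_super.
 */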

static struct btrfs_caching_control *
get_caching_control(struct btrfs_block_group_cache *cache)
{
        struct btrfs_caching_control *ctl;

        spin_lock(&cache->lock);
        if (cache->cached != BTRFS_CACHE_STARTED) {
                spin_unlock(&cache->lock);
                return NULL;
        }

        ctl = cache->caching_ctl;
        atomic_inc(&ctl->count);
        spin_unlock(&cache->lock);
        return ctl;
}

static void put_caching_control(struct btrfs_caching_control *ctl)
{
        if (atomic_dec_and_test(&ctl->count))
                kfree(ctl);
}

/*
 * this is only called by cache_block_group; since we could have freed
 * extents, we need to check the pinned_extents for any extents that
 * can't be used yet, because their free space won't be released until
 * the transaction commits.
 */
static u64 add_new_free_space(struct btrfs_block_group_cache *block_group,
                              struct btrfs_fs_info *info, u64 start, u64 end)
{
        u64 extent_start, extent_end, size, total_added = 0;
        int ret;

        while (start < end) {
                ret = find_first_extent_bit(info->pinned_extents, start,
                                            &extent_start, &extent_end,
                                            EXTENT_DIRTY | EXTENT_UPTODATE);
                if (ret)
                        break;

                if (extent_start <= start) {
                        start = extent_end + 1;
                } else if (extent_start > start && extent_start < end) {
                        size = extent_start - start;
                        total_added += size;
                        ret = btrfs_add_free_space(block_group, start,
                                                   size);
                        BUG_ON(ret);
                        start = extent_end + 1;
                } else {
                        break;
                }
        }

        if (start < end) {
                size = end - start;
                total_added += size;
                ret = btrfs_add_free_space(block_group, start, size);
                BUG_ON(ret);
        }

        return total_added;
}
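
/*
 * Worked example (editor's illustration, hypothetical numbers): caching
 * the range [0, 100) of a block group while bytes [30, 49] are still
 * pinned adds [0, 30) and [50, 100) to the free space cache and returns
 * 80; the pinned bytes only become usable once the transaction that
 * freed them commits.
 */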

static int caching_kthread(void *data)
{
        struct btrfs_block_group_cache *block_group = data;
        struct btrfs_fs_info *fs_info = block_group->fs_info;
        struct btrfs_caching_control *caching_ctl = block_group->caching_ctl;
        struct btrfs_root *extent_root = fs_info->extent_root;
        struct btrfs_path *path;
        struct extent_buffer *leaf;
        struct btrfs_key key;
        u64 total_found = 0;
        u64 last = 0;
        u32 nritems;
        int ret = 0;

        path = btrfs_alloc_path();
        if (!path)
                return -ENOMEM;

        exclude_super_stripes(extent_root, block_group);
        spin_lock(&block_group->space_info->lock);
        block_group->space_info->bytes_readonly += block_group->bytes_super;
        spin_unlock(&block_group->space_info->lock);

        last = max_t(u64, block_group->key.objectid, BTRFS_SUPER_INFO_OFFSET);

        /*
         * We don't want to deadlock with somebody trying to allocate a new
         * extent for the extent root while also trying to search the extent
         * root to add free space.  So we skip locking and search the commit
         * root, since it's read-only.
         */
        path->skip_locking = 1;
        path->search_commit_root = 1;
        path->reada = 2;

        key.objectid = last;
        key.offset = 0;
        key.type = BTRFS_EXTENT_ITEM_KEY;
again:
        mutex_lock(&caching_ctl->mutex);
        /* need to make sure the commit_root doesn't disappear */
        down_read(&fs_info->extent_commit_sem);

        ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
        if (ret < 0)
                goto err;

        leaf = path->nodes[0];
        nritems = btrfs_header_nritems(leaf);

        while (1) {
                smp_mb();
                if (fs_info->closing > 1) {
                        last = (u64)-1;
                        break;
                }

                if (path->slots[0] < nritems) {
                        btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
                } else {
                        ret = find_next_key(path, 0, &key);
                        if (ret)
                                break;

                        caching_ctl->progress = last;
                        btrfs_release_path(extent_root, path);
                        up_read(&fs_info->extent_commit_sem);
                        mutex_unlock(&caching_ctl->mutex);
                        if (btrfs_transaction_in_commit(fs_info))
                                schedule_timeout(1);
                        else
                                cond_resched();
                        goto again;
                }

                if (key.objectid < block_group->key.objectid) {
                        path->slots[0]++;
                        continue;
                }

                if (key.objectid >= block_group->key.objectid +
                    block_group->key.offset)
                        break;

                if (key.type == BTRFS_EXTENT_ITEM_KEY) {
                        total_found += add_new_free_space(block_group,
                                                          fs_info, last,
                                                          key.objectid);
                        last = key.objectid + key.offset;

                        if (total_found > (1024 * 1024 * 2)) {
                                total_found = 0;
                                wake_up(&caching_ctl->wait);
                        }
                }
                path->slots[0]++;
        }
        ret = 0;

        total_found += add_new_free_space(block_group, fs_info, last,
                                          block_group->key.objectid +
                                          block_group->key.offset);
        caching_ctl->progress = (u64)-1;

        spin_lock(&block_group->lock);
        block_group->caching_ctl = NULL;
        block_group->cached = BTRFS_CACHE_FINISHED;
        spin_unlock(&block_group->lock);

err:
        btrfs_free_path(path);
        up_read(&fs_info->extent_commit_sem);

        free_excluded_extents(extent_root, block_group);

        mutex_unlock(&caching_ctl->mutex);
        wake_up(&caching_ctl->wait);

        put_caching_control(caching_ctl);
        atomic_dec(&block_group->space_info->caching_threads);
        btrfs_put_block_group(block_group);

        return 0;
}

static int cache_block_group(struct btrfs_block_group_cache *cache)
{
        struct btrfs_fs_info *fs_info = cache->fs_info;
        struct btrfs_caching_control *caching_ctl;
        struct task_struct *tsk;
        int ret = 0;

        smp_mb();
        if (cache->cached != BTRFS_CACHE_NO)
                return 0;

        caching_ctl = kzalloc(sizeof(*caching_ctl), GFP_KERNEL);
        BUG_ON(!caching_ctl);

        INIT_LIST_HEAD(&caching_ctl->list);
        mutex_init(&caching_ctl->mutex);
        init_waitqueue_head(&caching_ctl->wait);
        caching_ctl->block_group = cache;
        caching_ctl->progress = cache->key.objectid;
        /* one for caching kthread, one for caching block group list */
        atomic_set(&caching_ctl->count, 2);

        spin_lock(&cache->lock);
        if (cache->cached != BTRFS_CACHE_NO) {
                spin_unlock(&cache->lock);
                kfree(caching_ctl);
                return 0;
        }
        cache->caching_ctl = caching_ctl;
        cache->cached = BTRFS_CACHE_STARTED;
        spin_unlock(&cache->lock);

        down_write(&fs_info->extent_commit_sem);
        list_add_tail(&caching_ctl->list, &fs_info->caching_block_groups);
        up_write(&fs_info->extent_commit_sem);

        atomic_inc(&cache->space_info->caching_threads);
        btrfs_get_block_group(cache);

        tsk = kthread_run(caching_kthread, cache, "btrfs-cache-%llu",
                          cache->key.objectid);
        if (IS_ERR(tsk)) {
                ret = PTR_ERR(tsk);
                printk(KERN_ERR "error running thread %d\n", ret);
                BUG();
        }

        return ret;
}
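
/*
 * Typical caller pattern (editor's sketch, modelled on how the caching
 * control is consumed elsewhere in this file; not part of the original):
 *
 *      struct btrfs_caching_control *caching_ctl;
 *
 *      cache_block_group(cache);
 *      caching_ctl = get_caching_control(cache);
 *      if (caching_ctl) {
 *              wait_event(caching_ctl->wait,
 *                         block_group_cache_done(cache));
 *              put_caching_control(caching_ctl);
 *      }
 */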

/*
 * return the block group that starts at or after bytenr
 */
static struct btrfs_block_group_cache *
btrfs_lookup_first_block_group(struct btrfs_fs_info *info, u64 bytenr)
{
        struct btrfs_block_group_cache *cache;

        cache = block_group_cache_tree_search(info, bytenr, 0);

        return cache;
}

/*
 * return the block group that contains the given bytenr
 */
struct btrfs_block_group_cache *btrfs_lookup_block_group(
                                                 struct btrfs_fs_info *info,
                                                 u64 bytenr)
{
        struct btrfs_block_group_cache *cache;

        cache = block_group_cache_tree_search(info, bytenr, 1);

        return cache;
}

static struct btrfs_space_info *__find_space_info(struct btrfs_fs_info *info,
                                                  u64 flags)
{
        struct list_head *head = &info->space_info;
        struct btrfs_space_info *found;

        flags &= BTRFS_BLOCK_GROUP_DATA | BTRFS_BLOCK_GROUP_SYSTEM |
                 BTRFS_BLOCK_GROUP_METADATA;

        rcu_read_lock();
        list_for_each_entry_rcu(found, head, list) {
                if (found->flags == flags) {
                        rcu_read_unlock();
                        return found;
                }
        }
        rcu_read_unlock();
        return NULL;
}

/*
 * after adding space to the filesystem, we need to clear the full flags
 * on all the space infos.
 */
void btrfs_clear_space_info_full(struct btrfs_fs_info *info)
{
        struct list_head *head = &info->space_info;
        struct btrfs_space_info *found;

        rcu_read_lock();
        list_for_each_entry_rcu(found, head, list)
                found->full = 0;
        rcu_read_unlock();
}

static u64 div_factor(u64 num, int factor)
{
        if (factor == 10)
                return num;
        num *= factor;
        do_div(num, 10);
        return num;
}
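
/*
 * Quick worked example (editor's note): div_factor() returns
 * (num * factor) / 10, so div_factor(1000, 9) == 900, i.e. 90% of num;
 * factor == 10 is short-circuited to skip the 64-bit division.
 */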

u64 btrfs_find_block_group(struct btrfs_root *root,
                           u64 search_start, u64 search_hint, int owner)
{
        struct btrfs_block_group_cache *cache;
        u64 used;
        u64 last = max(search_hint, search_start);
        u64 group_start = 0;
        int full_search = 0;
        int factor = 9;
        int wrapped = 0;
again:
        while (1) {
                cache = btrfs_lookup_first_block_group(root->fs_info, last);
                if (!cache)
                        break;

                spin_lock(&cache->lock);
                last = cache->key.objectid + cache->key.offset;
                used = btrfs_block_group_used(&cache->item);

                if ((full_search || !cache->ro) &&
                    block_group_bits(cache, BTRFS_BLOCK_GROUP_METADATA)) {
                        if (used + cache->pinned + cache->reserved <
                            div_factor(cache->key.offset, factor)) {
                                group_start = cache->key.objectid;
                                spin_unlock(&cache->lock);
                                btrfs_put_block_group(cache);
                                goto found;
                        }
                }
                spin_unlock(&cache->lock);
                btrfs_put_block_group(cache);
                cond_resched();
        }
        if (!wrapped) {
                last = search_start;
                wrapped = 1;
                goto again;
        }
        if (!full_search && factor < 10) {
                last = search_start;
                full_search = 1;
                factor = 10;
                goto again;
        }
found:
        return group_start;
}

/* simple helper to search for an existing extent at a given offset */
int btrfs_lookup_extent(struct btrfs_root *root, u64 start, u64 len)
{
        int ret;
        struct btrfs_key key;
        struct btrfs_path *path;

        path = btrfs_alloc_path();
        BUG_ON(!path);
        key.objectid = start;
        key.offset = len;
        btrfs_set_key_type(&key, BTRFS_EXTENT_ITEM_KEY);
        ret = btrfs_search_slot(NULL, root->fs_info->extent_root, &key, path,
                                0, 0);
        btrfs_free_path(path);
        return ret;
}

/*
 * Back reference rules.  Back refs have three main goals:
 *
 * 1) differentiate between all holders of references to an extent so that
 *    when a reference is dropped we can make sure it was a valid reference
 *    before freeing the extent.
 *
 * 2) Provide enough information to quickly find the holders of an extent
 *    if we notice a given block is corrupted or bad.
 *
 * 3) Make it easy to migrate blocks for FS shrinking or storage pool
 *    maintenance.  This is actually the same as #2, but with a slightly
 *    different use case.
 *
 * There are two kinds of back refs. Implicit back refs are optimized
 * for pointers in non-shared tree blocks. For a given pointer in a block,
 * back refs of this kind provide information about the block's owner tree
 * and the pointer's key. This information allows us to find the block by
 * b-tree searching. Full back refs are for pointers in tree blocks not
 * referenced by their owner trees. The location of the tree block is
 * recorded in the back refs. Full back refs are actually generic, and can
 * be used in all cases where implicit back refs are used. Their major
 * shortcoming is the overhead: every time a tree block gets COWed, we
 * have to update the back ref entries for all pointers in it.
 *
 * For a newly allocated tree block, we use implicit back refs for
 * pointers in it. This means most tree related operations only involve
 * implicit back refs. For a tree block created in an old transaction, the
 * only way to drop a reference to it is to COW it. So we can detect the
 * event that a tree block loses its owner tree's reference and do the
 * back refs conversion.
 *
 * When a tree block is COW'd through a tree, there are four cases:
 *
 * The reference count of the block is one and the tree is the block's
 * owner tree. Nothing to do in this case.
 *
 * The reference count of the block is one and the tree is not the
 * block's owner tree. In this case, full back refs are used for pointers
 * in the block. Remove these full back refs, add implicit back refs for
 * every pointer in the new block.
 *
 * The reference count of the block is greater than one and the tree is
 * the block's owner tree. In this case, implicit back refs are used for
 * pointers in the block. Add full back refs for every pointer in the
 * block, increase lower level extents' reference counts. The original
 * implicit back refs are carried over to the new block.
 *
 * The reference count of the block is greater than one and the tree is
 * not the block's owner tree. Add implicit back refs for every pointer in
 * the new block, increase lower level extents' reference count.
 *
 * Back Reference Key composing:
 *
 * The key objectid corresponds to the first byte in the extent.
 * The key type is used to differentiate between types of back refs.
 * There are different meanings of the key offset for different types
 * of back refs.
 *
 * File extents can be referenced by:
 *
 * - multiple snapshots, subvolumes, or different generations in one subvol
 * - different files inside a single subvolume
 * - different offsets inside a file (bookend extents in file.c)
 *
 * The extent ref structure for the implicit back refs has fields for:
 *
 * - objectid of the subvolume root
 * - objectid of the file holding the reference
 * - original offset in the file
 * - how many bookend extents
 *
 * The key offset for the implicit back refs is the hash of the first
 * three fields.
 *
 * The extent ref structure for the full back refs has a field for:
 *
 * - number of pointers in the tree leaf
 *
 * The key offset for the full back refs is the first byte of
 * the tree leaf.
 *
 * When a file extent is allocated, the implicit back refs are used
 * and the fields are filled in:
 *
 *     (root_key.objectid, inode objectid, offset in file, 1)
 *
 * When a file extent is removed by file truncation, we find the
 * corresponding implicit back refs and check the following fields:
 *
 *     (btrfs_header_owner(leaf), inode objectid, offset in file)
 *
 * Btree extents can be referenced by:
 *
 * - Different subvolumes
 *
 * Both the implicit back refs and the full back refs for tree blocks
 * only consist of a key. The key offset for the implicit back refs is
 * the objectid of the block's owner tree. The key offset for the full
 * back refs is the first byte of the parent block.
 *
 * When implicit back refs are used, information about the lowest key and
 * the level of the tree block is required. This information is stored in
 * the tree block info structure.
 */
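
/*
 * Worked example (editor's illustration, hypothetical numbers): a data
 * extent starting at byte 12582912, referenced by inode 257 at file
 * offset 0 in subvolume 5, carries an implicit back ref keyed
 *
 *     (12582912, BTRFS_EXTENT_DATA_REF_KEY,
 *      hash_extent_data_ref(5, 257, 0))
 *
 * whereas the same extent referenced through a non-owner (shared) leaf
 * at byte P would instead carry a full back ref keyed
 *
 *     (12582912, BTRFS_SHARED_DATA_REF_KEY, P)
 */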

#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
static int convert_extent_item_v0(struct btrfs_trans_handle *trans,
                                  struct btrfs_root *root,
                                  struct btrfs_path *path,
                                  u64 owner, u32 extra_size)
{
        struct btrfs_extent_item *item;
        struct btrfs_extent_item_v0 *ei0;
        struct btrfs_extent_ref_v0 *ref0;
        struct btrfs_tree_block_info *bi;
        struct extent_buffer *leaf;
        struct btrfs_key key;
        struct btrfs_key found_key;
        u32 new_size = sizeof(*item);
        u64 refs;
        int ret;

        leaf = path->nodes[0];
        BUG_ON(btrfs_item_size_nr(leaf, path->slots[0]) != sizeof(*ei0));

        btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
        ei0 = btrfs_item_ptr(leaf, path->slots[0],
                             struct btrfs_extent_item_v0);
        refs = btrfs_extent_refs_v0(leaf, ei0);

        if (owner == (u64)-1) {
                while (1) {
                        if (path->slots[0] >= btrfs_header_nritems(leaf)) {
                                ret = btrfs_next_leaf(root, path);
                                if (ret < 0)
                                        return ret;
                                BUG_ON(ret > 0);
                                leaf = path->nodes[0];
                        }
                        btrfs_item_key_to_cpu(leaf, &found_key,
                                              path->slots[0]);
                        BUG_ON(key.objectid != found_key.objectid);
                        if (found_key.type != BTRFS_EXTENT_REF_V0_KEY) {
                                path->slots[0]++;
                                continue;
                        }
                        ref0 = btrfs_item_ptr(leaf, path->slots[0],
                                              struct btrfs_extent_ref_v0);
                        owner = btrfs_ref_objectid_v0(leaf, ref0);
                        break;
                }
        }
        btrfs_release_path(root, path);

        if (owner < BTRFS_FIRST_FREE_OBJECTID)
                new_size += sizeof(*bi);

        new_size -= sizeof(*ei0);
        ret = btrfs_search_slot(trans, root, &key, path,
                                new_size + extra_size, 1);
        if (ret < 0)
                return ret;
        BUG_ON(ret);

        ret = btrfs_extend_item(trans, root, path, new_size);
        BUG_ON(ret);

        leaf = path->nodes[0];
        item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
        btrfs_set_extent_refs(leaf, item, refs);
        /* FIXME: get real generation */
        btrfs_set_extent_generation(leaf, item, 0);
        if (owner < BTRFS_FIRST_FREE_OBJECTID) {
                btrfs_set_extent_flags(leaf, item,
                                       BTRFS_EXTENT_FLAG_TREE_BLOCK |
                                       BTRFS_BLOCK_FLAG_FULL_BACKREF);
                bi = (struct btrfs_tree_block_info *)(item + 1);
                /* FIXME: get first key of the block */
                memset_extent_buffer(leaf, 0, (unsigned long)bi, sizeof(*bi));
                btrfs_set_tree_block_level(leaf, bi, (int)owner);
        } else {
                btrfs_set_extent_flags(leaf, item, BTRFS_EXTENT_FLAG_DATA);
        }
        btrfs_mark_buffer_dirty(leaf);
        return 0;
}
#endif

static u64 hash_extent_data_ref(u64 root_objectid, u64 owner, u64 offset)
{
        u32 high_crc = ~(u32)0;
        u32 low_crc = ~(u32)0;
        __le64 lenum;

        lenum = cpu_to_le64(root_objectid);
        high_crc = crc32c(high_crc, &lenum, sizeof(lenum));
        lenum = cpu_to_le64(owner);
        low_crc = crc32c(low_crc, &lenum, sizeof(lenum));
        lenum = cpu_to_le64(offset);
        low_crc = crc32c(low_crc, &lenum, sizeof(lenum));

        return ((u64)high_crc << 31) ^ (u64)low_crc;
}

static u64 hash_extent_data_ref_item(struct extent_buffer *leaf,
                                     struct btrfs_extent_data_ref *ref)
{
        return hash_extent_data_ref(btrfs_extent_data_ref_root(leaf, ref),
                                    btrfs_extent_data_ref_objectid(leaf, ref),
                                    btrfs_extent_data_ref_offset(leaf, ref));
}

static int match_extent_data_ref(struct extent_buffer *leaf,
                                 struct btrfs_extent_data_ref *ref,
                                 u64 root_objectid, u64 owner, u64 offset)
{
        if (btrfs_extent_data_ref_root(leaf, ref) != root_objectid ||
            btrfs_extent_data_ref_objectid(leaf, ref) != owner ||
            btrfs_extent_data_ref_offset(leaf, ref) != offset)
                return 0;
        return 1;
}

static noinline int lookup_extent_data_ref(struct btrfs_trans_handle *trans,
                                           struct btrfs_root *root,
                                           struct btrfs_path *path,
                                           u64 bytenr, u64 parent,
                                           u64 root_objectid,
                                           u64 owner, u64 offset)
{
        struct btrfs_key key;
        struct btrfs_extent_data_ref *ref;
        struct extent_buffer *leaf;
        u32 nritems;
        int ret;
        int recow;
        int err = -ENOENT;

        key.objectid = bytenr;
        if (parent) {
                key.type = BTRFS_SHARED_DATA_REF_KEY;
                key.offset = parent;
        } else {
                key.type = BTRFS_EXTENT_DATA_REF_KEY;
                key.offset = hash_extent_data_ref(root_objectid,
                                                  owner, offset);
        }
again:
        recow = 0;
        ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
        if (ret < 0) {
                err = ret;
                goto fail;
        }

        if (parent) {
                if (!ret)
                        return 0;
#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
                key.type = BTRFS_EXTENT_REF_V0_KEY;
                btrfs_release_path(root, path);
                ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
                if (ret < 0) {
                        err = ret;
                        goto fail;
                }
                if (!ret)
                        return 0;
#endif
                goto fail;
        }

        leaf = path->nodes[0];
        nritems = btrfs_header_nritems(leaf);
        while (1) {
                if (path->slots[0] >= nritems) {
                        ret = btrfs_next_leaf(root, path);
                        if (ret < 0)
                                err = ret;
                        if (ret)
                                goto fail;

                        leaf = path->nodes[0];
                        nritems = btrfs_header_nritems(leaf);
                        recow = 1;
                }

                btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
                if (key.objectid != bytenr ||
                    key.type != BTRFS_EXTENT_DATA_REF_KEY)
                        goto fail;

                ref = btrfs_item_ptr(leaf, path->slots[0],
                                     struct btrfs_extent_data_ref);

                if (match_extent_data_ref(leaf, ref, root_objectid,
                                          owner, offset)) {
                        if (recow) {
                                btrfs_release_path(root, path);
                                goto again;
                        }
                        err = 0;
                        break;
                }
                path->slots[0]++;
        }
fail:
        return err;
}

static noinline int insert_extent_data_ref(struct btrfs_trans_handle *trans,
                                           struct btrfs_root *root,
                                           struct btrfs_path *path,
                                           u64 bytenr, u64 parent,
                                           u64 root_objectid, u64 owner,
                                           u64 offset, int refs_to_add)
{
        struct btrfs_key key;
        struct extent_buffer *leaf;
        u32 size;
        u32 num_refs;
        int ret;

        key.objectid = bytenr;
        if (parent) {
                key.type = BTRFS_SHARED_DATA_REF_KEY;
                key.offset = parent;
                size = sizeof(struct btrfs_shared_data_ref);
        } else {
                key.type = BTRFS_EXTENT_DATA_REF_KEY;
                key.offset = hash_extent_data_ref(root_objectid,
                                                  owner, offset);
                size = sizeof(struct btrfs_extent_data_ref);
        }

        ret = btrfs_insert_empty_item(trans, root, path, &key, size);
        if (ret && ret != -EEXIST)
                goto fail;

        leaf = path->nodes[0];
        if (parent) {
                struct btrfs_shared_data_ref *ref;
                ref = btrfs_item_ptr(leaf, path->slots[0],
                                     struct btrfs_shared_data_ref);
                if (ret == 0) {
                        btrfs_set_shared_data_ref_count(leaf, ref, refs_to_add);
                } else {
                        num_refs = btrfs_shared_data_ref_count(leaf, ref);
                        num_refs += refs_to_add;
                        btrfs_set_shared_data_ref_count(leaf, ref, num_refs);
                }
        } else {
                struct btrfs_extent_data_ref *ref;
                while (ret == -EEXIST) {
                        ref = btrfs_item_ptr(leaf, path->slots[0],
                                             struct btrfs_extent_data_ref);
                        if (match_extent_data_ref(leaf, ref, root_objectid,
                                                  owner, offset))
                                break;
                        btrfs_release_path(root, path);
                        key.offset++;
                        ret = btrfs_insert_empty_item(trans, root, path, &key,
                                                      size);
                        if (ret && ret != -EEXIST)
                                goto fail;

                        leaf = path->nodes[0];
                }
                ref = btrfs_item_ptr(leaf, path->slots[0],
                                     struct btrfs_extent_data_ref);
                if (ret == 0) {
                        btrfs_set_extent_data_ref_root(leaf, ref,
                                                       root_objectid);
                        btrfs_set_extent_data_ref_objectid(leaf, ref, owner);
                        btrfs_set_extent_data_ref_offset(leaf, ref, offset);
                        btrfs_set_extent_data_ref_count(leaf, ref, refs_to_add);
                } else {
                        num_refs = btrfs_extent_data_ref_count(leaf, ref);
                        num_refs += refs_to_add;
                        btrfs_set_extent_data_ref_count(leaf, ref, num_refs);
                }
        }
        btrfs_mark_buffer_dirty(leaf);
        ret = 0;
fail:
        btrfs_release_path(root, path);
        return ret;
}
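
/*
 * Editor's note on the -EEXIST loop above: hash_extent_data_ref() can
 * collide for distinct (root, owner, offset) triples, so when the slot
 * is already taken by somebody else's ref the code probes the next key
 * by bumping key.offset until it finds either a free slot or the
 * matching ref.
 */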

static noinline int remove_extent_data_ref(struct btrfs_trans_handle *trans,
                                           struct btrfs_root *root,
                                           struct btrfs_path *path,
                                           int refs_to_drop)
{
        struct btrfs_key key;
        struct btrfs_extent_data_ref *ref1 = NULL;
        struct btrfs_shared_data_ref *ref2 = NULL;
        struct extent_buffer *leaf;
        u32 num_refs = 0;
        int ret = 0;

        leaf = path->nodes[0];
        btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);

        if (key.type == BTRFS_EXTENT_DATA_REF_KEY) {
                ref1 = btrfs_item_ptr(leaf, path->slots[0],
                                      struct btrfs_extent_data_ref);
                num_refs = btrfs_extent_data_ref_count(leaf, ref1);
        } else if (key.type == BTRFS_SHARED_DATA_REF_KEY) {
                ref2 = btrfs_item_ptr(leaf, path->slots[0],
                                      struct btrfs_shared_data_ref);
                num_refs = btrfs_shared_data_ref_count(leaf, ref2);
#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
        } else if (key.type == BTRFS_EXTENT_REF_V0_KEY) {
                struct btrfs_extent_ref_v0 *ref0;
                ref0 = btrfs_item_ptr(leaf, path->slots[0],
                                      struct btrfs_extent_ref_v0);
                num_refs = btrfs_ref_count_v0(leaf, ref0);
#endif
        } else {
                BUG();
        }

        BUG_ON(num_refs < refs_to_drop);
        num_refs -= refs_to_drop;

        if (num_refs == 0) {
                ret = btrfs_del_item(trans, root, path);
        } else {
                if (key.type == BTRFS_EXTENT_DATA_REF_KEY)
                        btrfs_set_extent_data_ref_count(leaf, ref1, num_refs);
                else if (key.type == BTRFS_SHARED_DATA_REF_KEY)
                        btrfs_set_shared_data_ref_count(leaf, ref2, num_refs);
#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
                else {
                        struct btrfs_extent_ref_v0 *ref0;
                        ref0 = btrfs_item_ptr(leaf, path->slots[0],
                                        struct btrfs_extent_ref_v0);
                        btrfs_set_ref_count_v0(leaf, ref0, num_refs);
                }
#endif
                btrfs_mark_buffer_dirty(leaf);
        }
        return ret;
}

static noinline u32 extent_data_ref_count(struct btrfs_root *root,
                                          struct btrfs_path *path,
                                          struct btrfs_extent_inline_ref *iref)
{
        struct btrfs_key key;
        struct extent_buffer *leaf;
        struct btrfs_extent_data_ref *ref1;
        struct btrfs_shared_data_ref *ref2;
        u32 num_refs = 0;

        leaf = path->nodes[0];
        btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
        if (iref) {
                if (btrfs_extent_inline_ref_type(leaf, iref) ==
                    BTRFS_EXTENT_DATA_REF_KEY) {
                        ref1 = (struct btrfs_extent_data_ref *)(&iref->offset);
                        num_refs = btrfs_extent_data_ref_count(leaf, ref1);
                } else {
                        ref2 = (struct btrfs_shared_data_ref *)(iref + 1);
                        num_refs = btrfs_shared_data_ref_count(leaf, ref2);
                }
        } else if (key.type == BTRFS_EXTENT_DATA_REF_KEY) {
                ref1 = btrfs_item_ptr(leaf, path->slots[0],
                                      struct btrfs_extent_data_ref);
                num_refs = btrfs_extent_data_ref_count(leaf, ref1);
        } else if (key.type == BTRFS_SHARED_DATA_REF_KEY) {
                ref2 = btrfs_item_ptr(leaf, path->slots[0],
                                      struct btrfs_shared_data_ref);
                num_refs = btrfs_shared_data_ref_count(leaf, ref2);
#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
        } else if (key.type == BTRFS_EXTENT_REF_V0_KEY) {
                struct btrfs_extent_ref_v0 *ref0;
                ref0 = btrfs_item_ptr(leaf, path->slots[0],
                                      struct btrfs_extent_ref_v0);
                num_refs = btrfs_ref_count_v0(leaf, ref0);
#endif
        } else {
                WARN_ON(1);
        }
        return num_refs;
}

static noinline int lookup_tree_block_ref(struct btrfs_trans_handle *trans,
                                          struct btrfs_root *root,
                                          struct btrfs_path *path,
                                          u64 bytenr, u64 parent,
                                          u64 root_objectid)
{
        struct btrfs_key key;
        int ret;

        key.objectid = bytenr;
        if (parent) {
                key.type = BTRFS_SHARED_BLOCK_REF_KEY;
                key.offset = parent;
        } else {
                key.type = BTRFS_TREE_BLOCK_REF_KEY;
                key.offset = root_objectid;
        }

        ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
        if (ret > 0)
                ret = -ENOENT;
#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
        if (ret == -ENOENT && parent) {
                btrfs_release_path(root, path);
                key.type = BTRFS_EXTENT_REF_V0_KEY;
                ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
                if (ret > 0)
                        ret = -ENOENT;
        }
#endif
        return ret;
}

static noinline int insert_tree_block_ref(struct btrfs_trans_handle *trans,
                                          struct btrfs_root *root,
                                          struct btrfs_path *path,
                                          u64 bytenr, u64 parent,
                                          u64 root_objectid)
{
        struct btrfs_key key;
        int ret;

        key.objectid = bytenr;
        if (parent) {
                key.type = BTRFS_SHARED_BLOCK_REF_KEY;
                key.offset = parent;
        } else {
                key.type = BTRFS_TREE_BLOCK_REF_KEY;
                key.offset = root_objectid;
        }

        ret = btrfs_insert_empty_item(trans, root, path, &key, 0);
        btrfs_release_path(root, path);
        return ret;
}

static inline int extent_ref_type(u64 parent, u64 owner)
{
        int type;
        if (owner < BTRFS_FIRST_FREE_OBJECTID) {
                if (parent > 0)
                        type = BTRFS_SHARED_BLOCK_REF_KEY;
                else
                        type = BTRFS_TREE_BLOCK_REF_KEY;
        } else {
                if (parent > 0)
                        type = BTRFS_SHARED_DATA_REF_KEY;
                else
                        type = BTRFS_EXTENT_DATA_REF_KEY;
        }
        return type;
}
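
/*
 * Summary of the mapping above (editor's note):
 *
 *      owner < BTRFS_FIRST_FREE_OBJECTID (tree block):
 *              parent set   -> BTRFS_SHARED_BLOCK_REF_KEY (full)
 *              parent unset -> BTRFS_TREE_BLOCK_REF_KEY   (implicit)
 *      owner >= BTRFS_FIRST_FREE_OBJECTID (data extent):
 *              parent set   -> BTRFS_SHARED_DATA_REF_KEY  (full)
 *              parent unset -> BTRFS_EXTENT_DATA_REF_KEY  (implicit)
 */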

static int find_next_key(struct btrfs_path *path, int level,
                         struct btrfs_key *key)
{
        for (; level < BTRFS_MAX_LEVEL; level++) {
                if (!path->nodes[level])
                        break;
                if (path->slots[level] + 1 >=
                    btrfs_header_nritems(path->nodes[level]))
                        continue;
                if (level == 0)
                        btrfs_item_key_to_cpu(path->nodes[level], key,
                                              path->slots[level] + 1);
                else
                        btrfs_node_key_to_cpu(path->nodes[level], key,
                                              path->slots[level] + 1);
                return 0;
        }
        return 1;
}

/*
 * look for inline back ref. if back ref is found, *ref_ret is set
 * to the address of inline back ref, and 0 is returned.
 *
 * if back ref isn't found, *ref_ret is set to the address where it
 * should be inserted, and -ENOENT is returned.
 *
 * if insert is true and there are too many inline back refs, the path
 * points to the extent item, and -EAGAIN is returned.
 *
 * NOTE: inline back refs are ordered in the same way that back ref
 *       items in the tree are ordered.
 */
static noinline_for_stack
int lookup_inline_extent_backref(struct btrfs_trans_handle *trans,
                                 struct btrfs_root *root,
                                 struct btrfs_path *path,
                                 struct btrfs_extent_inline_ref **ref_ret,
                                 u64 bytenr, u64 num_bytes,
                                 u64 parent, u64 root_objectid,
                                 u64 owner, u64 offset, int insert)
{
        struct btrfs_key key;
        struct extent_buffer *leaf;
        struct btrfs_extent_item *ei;
        struct btrfs_extent_inline_ref *iref;
        u64 flags;
        u64 item_size;
        unsigned long ptr;
        unsigned long end;
        int extra_size;
        int type;
        int want;
        int ret;
        int err = 0;

        key.objectid = bytenr;
        key.type = BTRFS_EXTENT_ITEM_KEY;
        key.offset = num_bytes;

        want = extent_ref_type(parent, owner);
        if (insert) {
                extra_size = btrfs_extent_inline_ref_size(want);
                path->keep_locks = 1;
        } else
                extra_size = -1;
        ret = btrfs_search_slot(trans, root, &key, path, extra_size, 1);
        if (ret < 0) {
                err = ret;
                goto out;
        }
        BUG_ON(ret);

        leaf = path->nodes[0];
        item_size = btrfs_item_size_nr(leaf, path->slots[0]);
#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
        if (item_size < sizeof(*ei)) {
                if (!insert) {
                        err = -ENOENT;
                        goto out;
                }
                ret = convert_extent_item_v0(trans, root, path, owner,
                                             extra_size);
                if (ret < 0) {
                        err = ret;
                        goto out;
                }
                leaf = path->nodes[0];
                item_size = btrfs_item_size_nr(leaf, path->slots[0]);
        }
#endif
        BUG_ON(item_size < sizeof(*ei));

        ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
        flags = btrfs_extent_flags(leaf, ei);

        ptr = (unsigned long)(ei + 1);
        end = (unsigned long)ei + item_size;

        if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
                ptr += sizeof(struct btrfs_tree_block_info);
                BUG_ON(ptr > end);
        } else {
                BUG_ON(!(flags & BTRFS_EXTENT_FLAG_DATA));
        }

        err = -ENOENT;
        while (1) {
                if (ptr >= end) {
                        WARN_ON(ptr > end);
                        break;
                }
                iref = (struct btrfs_extent_inline_ref *)ptr;
                type = btrfs_extent_inline_ref_type(leaf, iref);
                if (want < type)
                        break;
                if (want > type) {
                        ptr += btrfs_extent_inline_ref_size(type);
                        continue;
                }

                if (type == BTRFS_EXTENT_DATA_REF_KEY) {
                        struct btrfs_extent_data_ref *dref;
                        dref = (struct btrfs_extent_data_ref *)(&iref->offset);
                        if (match_extent_data_ref(leaf, dref, root_objectid,
                                                  owner, offset)) {
                                err = 0;
                                break;
                        }
                        if (hash_extent_data_ref_item(leaf, dref) <
                            hash_extent_data_ref(root_objectid, owner, offset))
                                break;
                } else {
                        u64 ref_offset;
                        ref_offset = btrfs_extent_inline_ref_offset(leaf, iref);
                        if (parent > 0) {
                                if (parent == ref_offset) {
                                        err = 0;
                                        break;
                                }
                                if (ref_offset < parent)
                                        break;
                        } else {
                                if (root_objectid == ref_offset) {
                                        err = 0;
                                        break;
                                }
                                if (ref_offset < root_objectid)
                                        break;
                        }
                }
                ptr += btrfs_extent_inline_ref_size(type);
        }
        if (err == -ENOENT && insert) {
                if (item_size + extra_size >=
                    BTRFS_MAX_EXTENT_ITEM_SIZE(root)) {
                        err = -EAGAIN;
                        goto out;
                }
                /*
                 * To add a new inline back ref, we have to make sure
                 * there is no corresponding back ref item.
                 * For simplicity, we just do not add a new inline back
                 * ref if there is any kind of item for this block.
                 */
                if (find_next_key(path, 0, &key) == 0 &&
                    key.objectid == bytenr &&
                    key.type < BTRFS_BLOCK_GROUP_ITEM_KEY) {
                        err = -EAGAIN;
                        goto out;
                }
        }
        *ref_ret = (struct btrfs_extent_inline_ref *)ptr;
out:
        if (insert) {
                path->keep_locks = 0;
                btrfs_unlock_up_safe(path, 1);
        }
        return err;
}

/*
 * helper to add a new inline back ref
 */
1365 static noinline_for_stack
1366 int setup_inline_extent_backref(struct btrfs_trans_handle *trans,
1367                                 struct btrfs_root *root,
1368                                 struct btrfs_path *path,
1369                                 struct btrfs_extent_inline_ref *iref,
1370                                 u64 parent, u64 root_objectid,
1371                                 u64 owner, u64 offset, int refs_to_add,
1372                                 struct btrfs_delayed_extent_op *extent_op)
1373 {
1374         struct extent_buffer *leaf;
1375         struct btrfs_extent_item *ei;
1376         unsigned long ptr;
1377         unsigned long end;
1378         unsigned long item_offset;
1379         u64 refs;
1380         int size;
1381         int type;
1382         int ret;
1383
1384         leaf = path->nodes[0];
1385         ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
1386         item_offset = (unsigned long)iref - (unsigned long)ei;
1387
1388         type = extent_ref_type(parent, owner);
1389         size = btrfs_extent_inline_ref_size(type);
1390
1391         ret = btrfs_extend_item(trans, root, path, size);
1392         BUG_ON(ret);
1393
1394         ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
1395         refs = btrfs_extent_refs(leaf, ei);
1396         refs += refs_to_add;
1397         btrfs_set_extent_refs(leaf, ei, refs);
1398         if (extent_op)
1399                 __run_delayed_extent_op(extent_op, leaf, ei);
1400
1401         ptr = (unsigned long)ei + item_offset;
1402         end = (unsigned long)ei + btrfs_item_size_nr(leaf, path->slots[0]);
1403         if (ptr < end - size)
1404                 memmove_extent_buffer(leaf, ptr + size, ptr,
1405                                       end - size - ptr);
1406
1407         iref = (struct btrfs_extent_inline_ref *)ptr;
1408         btrfs_set_extent_inline_ref_type(leaf, iref, type);
1409         if (type == BTRFS_EXTENT_DATA_REF_KEY) {
1410                 struct btrfs_extent_data_ref *dref;
1411                 dref = (struct btrfs_extent_data_ref *)(&iref->offset);
1412                 btrfs_set_extent_data_ref_root(leaf, dref, root_objectid);
1413                 btrfs_set_extent_data_ref_objectid(leaf, dref, owner);
1414                 btrfs_set_extent_data_ref_offset(leaf, dref, offset);
1415                 btrfs_set_extent_data_ref_count(leaf, dref, refs_to_add);
1416         } else if (type == BTRFS_SHARED_DATA_REF_KEY) {
1417                 struct btrfs_shared_data_ref *sref;
1418                 sref = (struct btrfs_shared_data_ref *)(iref + 1);
1419                 btrfs_set_shared_data_ref_count(leaf, sref, refs_to_add);
1420                 btrfs_set_extent_inline_ref_offset(leaf, iref, parent);
1421         } else if (type == BTRFS_SHARED_BLOCK_REF_KEY) {
1422                 btrfs_set_extent_inline_ref_offset(leaf, iref, parent);
1423         } else {
1424                 btrfs_set_extent_inline_ref_offset(leaf, iref, root_objectid);
1425         }
1426         btrfs_mark_buffer_dirty(leaf);
1427         return 0;
1428 }
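/*
 * Illustrative sketch (not part of the original file): the item layout
 * the memmove above preserves.  Inline refs live packed after the
 * extent item (and tree_block_info, for tree blocks), sorted by type
 * and, within a type, by hash/offset, so inserting at 'ptr' shifts the
 * tail of the item right by the new ref's size:
 *
 *   [btrfs_extent_item][(tree_block_info)][ref0][ref1]..[refN]
 *                                     ptr-^              end-^
 *   after btrfs_extend_item(+size):
 *   [btrfs_extent_item][(tree_block_info)][ref0][new ][ref1]..[refN]
 *
 * The exact position of 'iref' comes from lookup_inline_extent_backref().
 */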
1429
1430 static int lookup_extent_backref(struct btrfs_trans_handle *trans,
1431                                  struct btrfs_root *root,
1432                                  struct btrfs_path *path,
1433                                  struct btrfs_extent_inline_ref **ref_ret,
1434                                  u64 bytenr, u64 num_bytes, u64 parent,
1435                                  u64 root_objectid, u64 owner, u64 offset)
1436 {
1437         int ret;
1438
1439         ret = lookup_inline_extent_backref(trans, root, path, ref_ret,
1440                                            bytenr, num_bytes, parent,
1441                                            root_objectid, owner, offset, 0);
1442         if (ret != -ENOENT)
1443                 return ret;
1444
1445         btrfs_release_path(root, path);
1446         *ref_ret = NULL;
1447
1448         if (owner < BTRFS_FIRST_FREE_OBJECTID) {
1449                 ret = lookup_tree_block_ref(trans, root, path, bytenr, parent,
1450                                             root_objectid);
1451         } else {
1452                 ret = lookup_extent_data_ref(trans, root, path, bytenr, parent,
1453                                              root_objectid, owner, offset);
1454         }
1455         return ret;
1456 }
1457
1458 /*
1459  * helper to update/remove inline back ref
1460  */
1461 static noinline_for_stack
1462 int update_inline_extent_backref(struct btrfs_trans_handle *trans,
1463                                  struct btrfs_root *root,
1464                                  struct btrfs_path *path,
1465                                  struct btrfs_extent_inline_ref *iref,
1466                                  int refs_to_mod,
1467                                  struct btrfs_delayed_extent_op *extent_op)
1468 {
1469         struct extent_buffer *leaf;
1470         struct btrfs_extent_item *ei;
1471         struct btrfs_extent_data_ref *dref = NULL;
1472         struct btrfs_shared_data_ref *sref = NULL;
1473         unsigned long ptr;
1474         unsigned long end;
1475         u32 item_size;
1476         int size;
1477         int type;
1478         int ret;
1479         u64 refs;
1480
1481         leaf = path->nodes[0];
1482         ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
1483         refs = btrfs_extent_refs(leaf, ei);
1484         WARN_ON(refs_to_mod < 0 && refs + refs_to_mod <= 0);
1485         refs += refs_to_mod;
1486         btrfs_set_extent_refs(leaf, ei, refs);
1487         if (extent_op)
1488                 __run_delayed_extent_op(extent_op, leaf, ei);
1489
1490         type = btrfs_extent_inline_ref_type(leaf, iref);
1491
1492         if (type == BTRFS_EXTENT_DATA_REF_KEY) {
1493                 dref = (struct btrfs_extent_data_ref *)(&iref->offset);
1494                 refs = btrfs_extent_data_ref_count(leaf, dref);
1495         } else if (type == BTRFS_SHARED_DATA_REF_KEY) {
1496                 sref = (struct btrfs_shared_data_ref *)(iref + 1);
1497                 refs = btrfs_shared_data_ref_count(leaf, sref);
1498         } else {
1499                 refs = 1;
1500                 BUG_ON(refs_to_mod != -1);
1501         }
1502
1503         BUG_ON(refs_to_mod < 0 && refs < -refs_to_mod);
1504         refs += refs_to_mod;
1505
1506         if (refs > 0) {
1507                 if (type == BTRFS_EXTENT_DATA_REF_KEY)
1508                         btrfs_set_extent_data_ref_count(leaf, dref, refs);
1509                 else
1510                         btrfs_set_shared_data_ref_count(leaf, sref, refs);
1511         } else {
1512                 size =  btrfs_extent_inline_ref_size(type);
1513                 item_size = btrfs_item_size_nr(leaf, path->slots[0]);
1514                 ptr = (unsigned long)iref;
1515                 end = (unsigned long)ei + item_size;
1516                 if (ptr + size < end)
1517                         memmove_extent_buffer(leaf, ptr, ptr + size,
1518                                               end - ptr - size);
1519                 item_size -= size;
1520                 ret = btrfs_truncate_item(trans, root, path, item_size, 1);
1521                 BUG_ON(ret);
1522         }
1523         btrfs_mark_buffer_dirty(leaf);
1524         return 0;
1525 }
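/*
 * Removal counterpart of the sketch above (illustrative only): when a
 * ref's count drops to zero, the tail of the item is shifted left over
 * it and the item is truncated by btrfs_extent_inline_ref_size(type):
 *
 *   [extent_item][ref0][ref1][ref2]  --drop ref1-->  [extent_item][ref0][ref2]
 */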
1526
1527 static noinline_for_stack
1528 int insert_inline_extent_backref(struct btrfs_trans_handle *trans,
1529                                  struct btrfs_root *root,
1530                                  struct btrfs_path *path,
1531                                  u64 bytenr, u64 num_bytes, u64 parent,
1532                                  u64 root_objectid, u64 owner,
1533                                  u64 offset, int refs_to_add,
1534                                  struct btrfs_delayed_extent_op *extent_op)
1535 {
1536         struct btrfs_extent_inline_ref *iref;
1537         int ret;
1538
1539         ret = lookup_inline_extent_backref(trans, root, path, &iref,
1540                                            bytenr, num_bytes, parent,
1541                                            root_objectid, owner, offset, 1);
1542         if (ret == 0) {
1543                 BUG_ON(owner < BTRFS_FIRST_FREE_OBJECTID);
1544                 ret = update_inline_extent_backref(trans, root, path, iref,
1545                                                    refs_to_add, extent_op);
1546         } else if (ret == -ENOENT) {
1547                 ret = setup_inline_extent_backref(trans, root, path, iref,
1548                                                   parent, root_objectid,
1549                                                   owner, offset, refs_to_add,
1550                                                   extent_op);
1551         }
1552         return ret;
1553 }
1554
1555 static int insert_extent_backref(struct btrfs_trans_handle *trans,
1556                                  struct btrfs_root *root,
1557                                  struct btrfs_path *path,
1558                                  u64 bytenr, u64 parent, u64 root_objectid,
1559                                  u64 owner, u64 offset, int refs_to_add)
1560 {
1561         int ret;
1562         if (owner < BTRFS_FIRST_FREE_OBJECTID) {
1563                 BUG_ON(refs_to_add != 1);
1564                 ret = insert_tree_block_ref(trans, root, path, bytenr,
1565                                             parent, root_objectid);
1566         } else {
1567                 ret = insert_extent_data_ref(trans, root, path, bytenr,
1568                                              parent, root_objectid,
1569                                              owner, offset, refs_to_add);
1570         }
1571         return ret;
1572 }
1573
1574 static int remove_extent_backref(struct btrfs_trans_handle *trans,
1575                                  struct btrfs_root *root,
1576                                  struct btrfs_path *path,
1577                                  struct btrfs_extent_inline_ref *iref,
1578                                  int refs_to_drop, int is_data)
1579 {
1580         int ret;
1581
1582         BUG_ON(!is_data && refs_to_drop != 1);
1583         if (iref) {
1584                 ret = update_inline_extent_backref(trans, root, path, iref,
1585                                                    -refs_to_drop, NULL);
1586         } else if (is_data) {
1587                 ret = remove_extent_data_ref(trans, root, path, refs_to_drop);
1588         } else {
1589                 ret = btrfs_del_item(trans, root, path);
1590         }
1591         return ret;
1592 }
1593
1594 static void btrfs_issue_discard(struct block_device *bdev,
1595                                 u64 start, u64 len)
1596 {
1597         blkdev_issue_discard(bdev, start >> 9, len >> 9, GFP_KERNEL,
1598                              DISCARD_FL_BARRIER);
1599 }
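/*
 * Worked example of the byte-to-sector conversion above (illustrative):
 * blkdev_issue_discard() wants 512-byte sectors, so discarding
 * start = 1MiB, len = 128KiB issues
 *
 *      start >> 9 == 1048576 / 512 == 2048   (first sector)
 *      len   >> 9 ==  131072 / 512 ==  256   (sector count)
 */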
1600
1601 static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr,
1602                                 u64 num_bytes)
1603 {
1604         int ret;
1605         u64 map_length = num_bytes;
1606         struct btrfs_multi_bio *multi = NULL;
1607
1608         if (!btrfs_test_opt(root, DISCARD))
1609                 return 0;
1610
1611         /* Tell the block device(s) that the sectors can be discarded */
1612         ret = btrfs_map_block(&root->fs_info->mapping_tree, READ,
1613                               bytenr, &map_length, &multi, 0);
1614         if (!ret) {
1615                 struct btrfs_bio_stripe *stripe = multi->stripes;
1616                 int i;
1617
1618                 if (map_length > num_bytes)
1619                         map_length = num_bytes;
1620
1621                 for (i = 0; i < multi->num_stripes; i++, stripe++) {
1622                         btrfs_issue_discard(stripe->dev->bdev,
1623                                             stripe->physical,
1624                                             map_length);
1625                 }
1626                 kfree(multi);
1627         }
1628
1629         return ret;
1630 }
1631
1632 int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
1633                          struct btrfs_root *root,
1634                          u64 bytenr, u64 num_bytes, u64 parent,
1635                          u64 root_objectid, u64 owner, u64 offset)
1636 {
1637         int ret;
1638         BUG_ON(owner < BTRFS_FIRST_FREE_OBJECTID &&
1639                root_objectid == BTRFS_TREE_LOG_OBJECTID);
1640
1641         if (owner < BTRFS_FIRST_FREE_OBJECTID) {
1642                 ret = btrfs_add_delayed_tree_ref(trans, bytenr, num_bytes,
1643                                         parent, root_objectid, (int)owner,
1644                                         BTRFS_ADD_DELAYED_REF, NULL);
1645         } else {
1646                 ret = btrfs_add_delayed_data_ref(trans, bytenr, num_bytes,
1647                                         parent, root_objectid, owner, offset,
1648                                         BTRFS_ADD_DELAYED_REF, NULL);
1649         }
1650         return ret;
1651 }
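/*
 * Usage sketch (hypothetical values, not a real caller in this file):
 * queue a ref bump for a data extent a fs tree starts sharing.  parent
 * == 0 asks for a keyed back ref; a caller holding a shared ref would
 * pass the bytenr of the referencing tree block as parent instead:
 *
 *      ret = btrfs_inc_extent_ref(trans, root, bytenr, num_bytes,
 *                                 0, root->root_key.objectid,
 *                                 inode_objectid, file_offset);
 */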
1652
1653 static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
1654                                   struct btrfs_root *root,
1655                                   u64 bytenr, u64 num_bytes,
1656                                   u64 parent, u64 root_objectid,
1657                                   u64 owner, u64 offset, int refs_to_add,
1658                                   struct btrfs_delayed_extent_op *extent_op)
1659 {
1660         struct btrfs_path *path;
1661         struct extent_buffer *leaf;
1662         struct btrfs_extent_item *item;
1663         u64 refs;
1664         int ret;
1665         int err = 0;
1666
1667         path = btrfs_alloc_path();
1668         if (!path)
1669                 return -ENOMEM;
1670
1671         path->reada = 1;
1672         path->leave_spinning = 1;
1673         /* this will set up the path even if it fails to insert the back ref */
1674         ret = insert_inline_extent_backref(trans, root->fs_info->extent_root,
1675                                            path, bytenr, num_bytes, parent,
1676                                            root_objectid, owner, offset,
1677                                            refs_to_add, extent_op);
1678         if (ret == 0)
1679                 goto out;
1680
1681         if (ret != -EAGAIN) {
1682                 err = ret;
1683                 goto out;
1684         }
1685
1686         leaf = path->nodes[0];
1687         item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
1688         refs = btrfs_extent_refs(leaf, item);
1689         btrfs_set_extent_refs(leaf, item, refs + refs_to_add);
1690         if (extent_op)
1691                 __run_delayed_extent_op(extent_op, leaf, item);
1692
1693         btrfs_mark_buffer_dirty(leaf);
1694         btrfs_release_path(root->fs_info->extent_root, path);
1695
1696         path->reada = 1;
1697         path->leave_spinning = 1;
1698
1699         /* now insert the actual backref */
1700         ret = insert_extent_backref(trans, root->fs_info->extent_root,
1701                                     path, bytenr, parent, root_objectid,
1702                                     owner, offset, refs_to_add);
1703         BUG_ON(ret);
1704 out:
1705         btrfs_free_path(path);
1706         return err;
1707 }
1708
1709 static int run_delayed_data_ref(struct btrfs_trans_handle *trans,
1710                                 struct btrfs_root *root,
1711                                 struct btrfs_delayed_ref_node *node,
1712                                 struct btrfs_delayed_extent_op *extent_op,
1713                                 int insert_reserved)
1714 {
1715         int ret = 0;
1716         struct btrfs_delayed_data_ref *ref;
1717         struct btrfs_key ins;
1718         u64 parent = 0;
1719         u64 ref_root = 0;
1720         u64 flags = 0;
1721
1722         ins.objectid = node->bytenr;
1723         ins.offset = node->num_bytes;
1724         ins.type = BTRFS_EXTENT_ITEM_KEY;
1725
1726         ref = btrfs_delayed_node_to_data_ref(node);
1727         if (node->type == BTRFS_SHARED_DATA_REF_KEY)
1728                 parent = ref->parent;
1729         else
1730                 ref_root = ref->root;
1731
1732         if (node->action == BTRFS_ADD_DELAYED_REF && insert_reserved) {
1733                 if (extent_op) {
1734                         BUG_ON(extent_op->update_key);
1735                         flags |= extent_op->flags_to_set;
1736                 }
1737                 ret = alloc_reserved_file_extent(trans, root,
1738                                                  parent, ref_root, flags,
1739                                                  ref->objectid, ref->offset,
1740                                                  &ins, node->ref_mod);
1741         } else if (node->action == BTRFS_ADD_DELAYED_REF) {
1742                 ret = __btrfs_inc_extent_ref(trans, root, node->bytenr,
1743                                              node->num_bytes, parent,
1744                                              ref_root, ref->objectid,
1745                                              ref->offset, node->ref_mod,
1746                                              extent_op);
1747         } else if (node->action == BTRFS_DROP_DELAYED_REF) {
1748                 ret = __btrfs_free_extent(trans, root, node->bytenr,
1749                                           node->num_bytes, parent,
1750                                           ref_root, ref->objectid,
1751                                           ref->offset, node->ref_mod,
1752                                           extent_op);
1753         } else {
1754                 BUG();
1755         }
1756         return ret;
1757 }
1758
1759 static void __run_delayed_extent_op(struct btrfs_delayed_extent_op *extent_op,
1760                                     struct extent_buffer *leaf,
1761                                     struct btrfs_extent_item *ei)
1762 {
1763         u64 flags = btrfs_extent_flags(leaf, ei);
1764         if (extent_op->update_flags) {
1765                 flags |= extent_op->flags_to_set;
1766                 btrfs_set_extent_flags(leaf, ei, flags);
1767         }
1768
1769         if (extent_op->update_key) {
1770                 struct btrfs_tree_block_info *bi;
1771                 BUG_ON(!(flags & BTRFS_EXTENT_FLAG_TREE_BLOCK));
1772                 bi = (struct btrfs_tree_block_info *)(ei + 1);
1773                 btrfs_set_tree_block_key(leaf, bi, &extent_op->key);
1774         }
1775 }
1776
1777 static int run_delayed_extent_op(struct btrfs_trans_handle *trans,
1778                                  struct btrfs_root *root,
1779                                  struct btrfs_delayed_ref_node *node,
1780                                  struct btrfs_delayed_extent_op *extent_op)
1781 {
1782         struct btrfs_key key;
1783         struct btrfs_path *path;
1784         struct btrfs_extent_item *ei;
1785         struct extent_buffer *leaf;
1786         u32 item_size;
1787         int ret;
1788         int err = 0;
1789
1790         path = btrfs_alloc_path();
1791         if (!path)
1792                 return -ENOMEM;
1793
1794         key.objectid = node->bytenr;
1795         key.type = BTRFS_EXTENT_ITEM_KEY;
1796         key.offset = node->num_bytes;
1797
1798         path->reada = 1;
1799         path->leave_spinning = 1;
1800         ret = btrfs_search_slot(trans, root->fs_info->extent_root, &key,
1801                                 path, 0, 1);
1802         if (ret < 0) {
1803                 err = ret;
1804                 goto out;
1805         }
1806         if (ret > 0) {
1807                 err = -EIO;
1808                 goto out;
1809         }
1810
1811         leaf = path->nodes[0];
1812         item_size = btrfs_item_size_nr(leaf, path->slots[0]);
1813 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
1814         if (item_size < sizeof(*ei)) {
1815                 ret = convert_extent_item_v0(trans, root->fs_info->extent_root,
1816                                              path, (u64)-1, 0);
1817                 if (ret < 0) {
1818                         err = ret;
1819                         goto out;
1820                 }
1821                 leaf = path->nodes[0];
1822                 item_size = btrfs_item_size_nr(leaf, path->slots[0]);
1823         }
1824 #endif
1825         BUG_ON(item_size < sizeof(*ei));
1826         ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
1827         __run_delayed_extent_op(extent_op, leaf, ei);
1828
1829         btrfs_mark_buffer_dirty(leaf);
1830 out:
1831         btrfs_free_path(path);
1832         return err;
1833 }
1834
1835 static int run_delayed_tree_ref(struct btrfs_trans_handle *trans,
1836                                 struct btrfs_root *root,
1837                                 struct btrfs_delayed_ref_node *node,
1838                                 struct btrfs_delayed_extent_op *extent_op,
1839                                 int insert_reserved)
1840 {
1841         int ret = 0;
1842         struct btrfs_delayed_tree_ref *ref;
1843         struct btrfs_key ins;
1844         u64 parent = 0;
1845         u64 ref_root = 0;
1846
1847         ins.objectid = node->bytenr;
1848         ins.offset = node->num_bytes;
1849         ins.type = BTRFS_EXTENT_ITEM_KEY;
1850
1851         ref = btrfs_delayed_node_to_tree_ref(node);
1852         if (node->type == BTRFS_SHARED_BLOCK_REF_KEY)
1853                 parent = ref->parent;
1854         else
1855                 ref_root = ref->root;
1856
1857         BUG_ON(node->ref_mod != 1);
1858         if (node->action == BTRFS_ADD_DELAYED_REF && insert_reserved) {
1859                 BUG_ON(!extent_op || !extent_op->update_flags ||
1860                        !extent_op->update_key);
1861                 ret = alloc_reserved_tree_block(trans, root,
1862                                                 parent, ref_root,
1863                                                 extent_op->flags_to_set,
1864                                                 &extent_op->key,
1865                                                 ref->level, &ins);
1866         } else if (node->action == BTRFS_ADD_DELAYED_REF) {
1867                 ret = __btrfs_inc_extent_ref(trans, root, node->bytenr,
1868                                              node->num_bytes, parent, ref_root,
1869                                              ref->level, 0, 1, extent_op);
1870         } else if (node->action == BTRFS_DROP_DELAYED_REF) {
1871                 ret = __btrfs_free_extent(trans, root, node->bytenr,
1872                                           node->num_bytes, parent, ref_root,
1873                                           ref->level, 0, 1, extent_op);
1874         } else {
1875                 BUG();
1876         }
1877         return ret;
1878 }
1879
1880 /* helper function to actually process a single delayed ref entry */
1881 static int run_one_delayed_ref(struct btrfs_trans_handle *trans,
1882                                struct btrfs_root *root,
1883                                struct btrfs_delayed_ref_node *node,
1884                                struct btrfs_delayed_extent_op *extent_op,
1885                                int insert_reserved)
1886 {
1887         int ret;
1888         if (btrfs_delayed_ref_is_head(node)) {
1889                 struct btrfs_delayed_ref_head *head;
1890                 /*
1891                  * we've hit the end of the chain and we were supposed
1892                  * to insert this extent into the tree.  But, it got
1893                  * deleted before we ever needed to insert it, so all
1894                  * we have to do is clean up the accounting
1895                  */
1896                 BUG_ON(extent_op);
1897                 head = btrfs_delayed_node_to_head(node);
1898                 if (insert_reserved) {
1899                         btrfs_pin_extent(root, node->bytenr,
1900                                          node->num_bytes, 1);
1901                         if (head->is_data) {
1902                                 ret = btrfs_del_csums(trans, root,
1903                                                       node->bytenr,
1904                                                       node->num_bytes);
1905                                 BUG_ON(ret);
1906                         }
1907                 }
1908                 mutex_unlock(&head->mutex);
1909                 return 0;
1910         }
1911
1912         if (node->type == BTRFS_TREE_BLOCK_REF_KEY ||
1913             node->type == BTRFS_SHARED_BLOCK_REF_KEY)
1914                 ret = run_delayed_tree_ref(trans, root, node, extent_op,
1915                                            insert_reserved);
1916         else if (node->type == BTRFS_EXTENT_DATA_REF_KEY ||
1917                  node->type == BTRFS_SHARED_DATA_REF_KEY)
1918                 ret = run_delayed_data_ref(trans, root, node, extent_op,
1919                                            insert_reserved);
1920         else
1921                 BUG();
1922         return ret;
1923 }
1924
1925 static noinline struct btrfs_delayed_ref_node *
1926 select_delayed_ref(struct btrfs_delayed_ref_head *head)
1927 {
1928         struct rb_node *node;
1929         struct btrfs_delayed_ref_node *ref;
1930         int action = BTRFS_ADD_DELAYED_REF;
1931 again:
1932         /*
1933          * select delayed ref of type BTRFS_ADD_DELAYED_REF first.
1934          * this prevents the ref count from going down to zero while
1935          * there are still pending delayed refs.
1936          */
1937         node = rb_prev(&head->node.rb_node);
1938         while (1) {
1939                 if (!node)
1940                         break;
1941                 ref = rb_entry(node, struct btrfs_delayed_ref_node,
1942                                 rb_node);
1943                 if (ref->bytenr != head->node.bytenr)
1944                         break;
1945                 if (ref->action == action)
1946                         return ref;
1947                 node = rb_prev(node);
1948         }
1949         if (action == BTRFS_ADD_DELAYED_REF) {
1950                 action = BTRFS_DROP_DELAYED_REF;
1951                 goto again;
1952         }
1953         return NULL;
1954 }
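/*
 * Why ADD wins over DROP above (illustrative): with refs == 1 on disk
 * and a pending +1 and -1 queued on the same head, running the DROP
 * first would take the count 1 -> 0 and free the extent while the ADD
 * is still queued; running the ADD first gives 1 -> 2 -> 1 and the
 * extent never transiently disappears.
 */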
1955
1956 static noinline int run_clustered_refs(struct btrfs_trans_handle *trans,
1957                                        struct btrfs_root *root,
1958                                        struct list_head *cluster)
1959 {
1960         struct btrfs_delayed_ref_root *delayed_refs;
1961         struct btrfs_delayed_ref_node *ref;
1962         struct btrfs_delayed_ref_head *locked_ref = NULL;
1963         struct btrfs_delayed_extent_op *extent_op;
1964         int ret;
1965         int count = 0;
1966         int must_insert_reserved = 0;
1967
1968         delayed_refs = &trans->transaction->delayed_refs;
1969         while (1) {
1970                 if (!locked_ref) {
1971                         /* pick a new head ref from the cluster list */
1972                         if (list_empty(cluster))
1973                                 break;
1974
1975                         locked_ref = list_entry(cluster->next,
1976                                      struct btrfs_delayed_ref_head, cluster);
1977
1978                         /* grab the lock that says we are going to process
1979                          * all the refs for this head */
1980                         ret = btrfs_delayed_ref_lock(trans, locked_ref);
1981
1982                         /*
1983                          * we may have dropped the spin lock to get the head
1984                          * mutex lock, and that might have given someone else
1985                          * time to free the head.  If that's true, it has been
1986                          * removed from our list and we can move on.
1987                          */
1988                         if (ret == -EAGAIN) {
1989                                 locked_ref = NULL;
1990                                 count++;
1991                                 continue;
1992                         }
1993                 }
1994
1995                 /*
1996                  * record the must insert reserved flag before we
1997                  * drop the spin lock.
1998                  */
1999                 must_insert_reserved = locked_ref->must_insert_reserved;
2000                 locked_ref->must_insert_reserved = 0;
2001
2002                 extent_op = locked_ref->extent_op;
2003                 locked_ref->extent_op = NULL;
2004
2005                 /*
2006                  * locked_ref is the head node, so we have to go one
2007                  * node back for any delayed ref updates
2008                  */
2009                 ref = select_delayed_ref(locked_ref);
2010                 if (!ref) {
2011                         /* All delayed refs have been processed.  Go ahead
2012                          * and send the head node to run_one_delayed_ref,
2013                          * so that any accounting fixes can happen
2014                          */
2015                         ref = &locked_ref->node;
2016
2017                         if (extent_op && must_insert_reserved) {
2018                                 kfree(extent_op);
2019                                 extent_op = NULL;
2020                         }
2021
2022                         if (extent_op) {
2023                                 spin_unlock(&delayed_refs->lock);
2024
2025                                 ret = run_delayed_extent_op(trans, root,
2026                                                             ref, extent_op);
2027                                 BUG_ON(ret);
2028                                 kfree(extent_op);
2029
2030                                 cond_resched();
2031                                 spin_lock(&delayed_refs->lock);
2032                                 continue;
2033                         }
2034
2035                         list_del_init(&locked_ref->cluster);
2036                         locked_ref = NULL;
2037                 }
2038
2039                 ref->in_tree = 0;
2040                 rb_erase(&ref->rb_node, &delayed_refs->root);
2041                 delayed_refs->num_entries--;
2042
2043                 spin_unlock(&delayed_refs->lock);
2044
2045                 ret = run_one_delayed_ref(trans, root, ref, extent_op,
2046                                           must_insert_reserved);
2047                 BUG_ON(ret);
2048
2049                 btrfs_put_delayed_ref(ref);
2050                 kfree(extent_op);
2051                 count++;
2052
2053                 cond_resched();
2054                 spin_lock(&delayed_refs->lock);
2055         }
2056         return count;
2057 }
2058
2059 /*
2060  * this starts processing the delayed reference count updates and
2061  * extent insertions we have queued up so far.  count can be
2062  * 0, which means to process everything in the tree at the start
2063  * of the run (but not newly added entries), or it can be some target
2064  * number you'd like to process.
2065  */
2066 int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
2067                            struct btrfs_root *root, unsigned long count)
2068 {
2069         struct rb_node *node;
2070         struct btrfs_delayed_ref_root *delayed_refs;
2071         struct btrfs_delayed_ref_node *ref;
2072         struct list_head cluster;
2073         int ret;
2074         int run_all = count == (unsigned long)-1;
2075         int run_most = 0;
2076
2077         if (root == root->fs_info->extent_root)
2078                 root = root->fs_info->tree_root;
2079
2080         delayed_refs = &trans->transaction->delayed_refs;
2081         INIT_LIST_HEAD(&cluster);
2082 again:
2083         spin_lock(&delayed_refs->lock);
2084         if (count == 0) {
2085                 count = delayed_refs->num_entries * 2;
2086                 run_most = 1;
2087         }
2088         while (1) {
2089                 if (!(run_all || run_most) &&
2090                     delayed_refs->num_heads_ready < 64)
2091                         break;
2092
2093                 /*
2094                  * go find something we can process in the rbtree.  We start at
2095                  * the beginning of the tree, and then build a cluster
2096                  * of refs to process starting at the first one we are able to
2097                  * lock
2098                  */
2099                 ret = btrfs_find_ref_cluster(trans, &cluster,
2100                                              delayed_refs->run_delayed_start);
2101                 if (ret)
2102                         break;
2103
2104                 ret = run_clustered_refs(trans, root, &cluster);
2105                 BUG_ON(ret < 0);
2106
2107                 count -= min_t(unsigned long, ret, count);
2108
2109                 if (count == 0)
2110                         break;
2111         }
2112
2113         if (run_all) {
2114                 node = rb_first(&delayed_refs->root);
2115                 if (!node)
2116                         goto out;
2117                 count = (unsigned long)-1;
2118
2119                 while (node) {
2120                         ref = rb_entry(node, struct btrfs_delayed_ref_node,
2121                                        rb_node);
2122                         if (btrfs_delayed_ref_is_head(ref)) {
2123                                 struct btrfs_delayed_ref_head *head;
2124
2125                                 head = btrfs_delayed_node_to_head(ref);
2126                                 atomic_inc(&ref->refs);
2127
2128                                 spin_unlock(&delayed_refs->lock);
2129                                 mutex_lock(&head->mutex);
2130                                 mutex_unlock(&head->mutex);
2131
2132                                 btrfs_put_delayed_ref(ref);
2133                                 cond_resched();
2134                                 goto again;
2135                         }
2136                         node = rb_next(node);
2137                 }
2138                 spin_unlock(&delayed_refs->lock);
2139                 schedule_timeout(1);
2140                 goto again;
2141         }
2142 out:
2143         spin_unlock(&delayed_refs->lock);
2144         return 0;
2145 }
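/*
 * Usage sketch (callers and counts here are examples, not taken from
 * this file):
 *
 *      btrfs_run_delayed_refs(trans, root, (unsigned long)-1); // flush all
 *      btrfs_run_delayed_refs(trans, root, 64);                // bounded work
 *
 * count == 0 means "whatever was queued when the run started"; it is
 * resized from delayed_refs->num_entries above.
 */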
2146
2147 int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans,
2148                                 struct btrfs_root *root,
2149                                 u64 bytenr, u64 num_bytes, u64 flags,
2150                                 int is_data)
2151 {
2152         struct btrfs_delayed_extent_op *extent_op;
2153         int ret;
2154
2155         extent_op = kmalloc(sizeof(*extent_op), GFP_NOFS);
2156         if (!extent_op)
2157                 return -ENOMEM;
2158
2159         extent_op->flags_to_set = flags;
2160         extent_op->update_flags = 1;
2161         extent_op->update_key = 0;
2162         extent_op->is_data = is_data ? 1 : 0;
2163
2164         ret = btrfs_add_delayed_extent_op(trans, bytenr, num_bytes, extent_op);
2165         if (ret)
2166                 kfree(extent_op);
2167         return ret;
2168 }
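/*
 * Usage sketch (hypothetical caller): flip a tree block's extent to
 * full-backref mode via a delayed extent op instead of editing the
 * extent tree in place:
 *
 *      ret = btrfs_set_disk_extent_flags(trans, root, buf->start,
 *                                        buf->len,
 *                                        BTRFS_BLOCK_FLAG_FULL_BACKREF, 0);
 */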
2169
2170 static noinline int check_delayed_ref(struct btrfs_trans_handle *trans,
2171                                       struct btrfs_root *root,
2172                                       struct btrfs_path *path,
2173                                       u64 objectid, u64 offset, u64 bytenr)
2174 {
2175         struct btrfs_delayed_ref_head *head;
2176         struct btrfs_delayed_ref_node *ref;
2177         struct btrfs_delayed_data_ref *data_ref;
2178         struct btrfs_delayed_ref_root *delayed_refs;
2179         struct rb_node *node;
2180         int ret = 0;
2181
2182         ret = -ENOENT;
2183         delayed_refs = &trans->transaction->delayed_refs;
2184         spin_lock(&delayed_refs->lock);
2185         head = btrfs_find_delayed_ref_head(trans, bytenr);
2186         if (!head)
2187                 goto out;
2188
2189         if (!mutex_trylock(&head->mutex)) {
2190                 atomic_inc(&head->node.refs);
2191                 spin_unlock(&delayed_refs->lock);
2192
2193                 btrfs_release_path(root->fs_info->extent_root, path);
2194
2195                 mutex_lock(&head->mutex);
2196                 mutex_unlock(&head->mutex);
2197                 btrfs_put_delayed_ref(&head->node);
2198                 return -EAGAIN;
2199         }
2200
2201         node = rb_prev(&head->node.rb_node);
2202         if (!node)
2203                 goto out_unlock;
2204
2205         ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node);
2206
2207         if (ref->bytenr != bytenr)
2208                 goto out_unlock;
2209
2210         ret = 1;
2211         if (ref->type != BTRFS_EXTENT_DATA_REF_KEY)
2212                 goto out_unlock;
2213
2214         data_ref = btrfs_delayed_node_to_data_ref(ref);
2215
2216         node = rb_prev(node);
2217         if (node) {
2218                 ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node);
2219                 if (ref->bytenr == bytenr)
2220                         goto out_unlock;
2221         }
2222
2223         if (data_ref->root != root->root_key.objectid ||
2224             data_ref->objectid != objectid || data_ref->offset != offset)
2225                 goto out_unlock;
2226
2227         ret = 0;
2228 out_unlock:
2229         mutex_unlock(&head->mutex);
2230 out:
2231         spin_unlock(&delayed_refs->lock);
2232         return ret;
2233 }
2234
2235 static noinline int check_committed_ref(struct btrfs_trans_handle *trans,
2236                                         struct btrfs_root *root,
2237                                         struct btrfs_path *path,
2238                                         u64 objectid, u64 offset, u64 bytenr)
2239 {
2240         struct btrfs_root *extent_root = root->fs_info->extent_root;
2241         struct extent_buffer *leaf;
2242         struct btrfs_extent_data_ref *ref;
2243         struct btrfs_extent_inline_ref *iref;
2244         struct btrfs_extent_item *ei;
2245         struct btrfs_key key;
2246         u32 item_size;
2247         int ret;
2248
2249         key.objectid = bytenr;
2250         key.offset = (u64)-1;
2251         key.type = BTRFS_EXTENT_ITEM_KEY;
2252
2253         ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
2254         if (ret < 0)
2255                 goto out;
2256         BUG_ON(ret == 0);
2257
2258         ret = -ENOENT;
2259         if (path->slots[0] == 0)
2260                 goto out;
2261
2262         path->slots[0]--;
2263         leaf = path->nodes[0];
2264         btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
2265
2266         if (key.objectid != bytenr || key.type != BTRFS_EXTENT_ITEM_KEY)
2267                 goto out;
2268
2269         ret = 1;
2270         item_size = btrfs_item_size_nr(leaf, path->slots[0]);
2271 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
2272         if (item_size < sizeof(*ei)) {
2273                 WARN_ON(item_size != sizeof(struct btrfs_extent_item_v0));
2274                 goto out;
2275         }
2276 #endif
2277         ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
2278
2279         if (item_size != sizeof(*ei) +
2280             btrfs_extent_inline_ref_size(BTRFS_EXTENT_DATA_REF_KEY))
2281                 goto out;
2282
2283         if (btrfs_extent_generation(leaf, ei) <=
2284             btrfs_root_last_snapshot(&root->root_item))
2285                 goto out;
2286
2287         iref = (struct btrfs_extent_inline_ref *)(ei + 1);
2288         if (btrfs_extent_inline_ref_type(leaf, iref) !=
2289             BTRFS_EXTENT_DATA_REF_KEY)
2290                 goto out;
2291
2292         ref = (struct btrfs_extent_data_ref *)(&iref->offset);
2293         if (btrfs_extent_refs(leaf, ei) !=
2294             btrfs_extent_data_ref_count(leaf, ref) ||
2295             btrfs_extent_data_ref_root(leaf, ref) !=
2296             root->root_key.objectid ||
2297             btrfs_extent_data_ref_objectid(leaf, ref) != objectid ||
2298             btrfs_extent_data_ref_offset(leaf, ref) != offset)
2299                 goto out;
2300
2301         ret = 0;
2302 out:
2303         return ret;
2304 }
2305
2306 int btrfs_cross_ref_exist(struct btrfs_trans_handle *trans,
2307                           struct btrfs_root *root,
2308                           u64 objectid, u64 offset, u64 bytenr)
2309 {
2310         struct btrfs_path *path;
2311         int ret;
2312         int ret2;
2313
2314         path = btrfs_alloc_path();
2315         if (!path)
2316                 return -ENOMEM;
2317
2318         do {
2319                 ret = check_committed_ref(trans, root, path, objectid,
2320                                           offset, bytenr);
2321                 if (ret && ret != -ENOENT)
2322                         goto out;
2323
2324                 ret2 = check_delayed_ref(trans, root, path, objectid,
2325                                          offset, bytenr);
2326         } while (ret2 == -EAGAIN);
2327
2328         if (ret2 && ret2 != -ENOENT) {
2329                 ret = ret2;
2330                 goto out;
2331         }
2332
2333         if (ret != -ENOENT || ret2 != -ENOENT)
2334                 ret = 0;
2335 out:
2336         btrfs_free_path(path);
2337         if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID)
2338                 WARN_ON(ret > 0);
2339         return ret;
2340 }
2341
2342 #if 0
2343 int btrfs_cache_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
2344                     struct extent_buffer *buf, u32 nr_extents)
2345 {
2346         struct btrfs_key key;
2347         struct btrfs_file_extent_item *fi;
2348         u64 root_gen;
2349         u32 nritems;
2350         int i;
2351         int level;
2352         int ret = 0;
2353         int shared = 0;
2354
2355         if (!root->ref_cows)
2356                 return 0;
2357
2358         if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) {
2359                 shared = 0;
2360                 root_gen = root->root_key.offset;
2361         } else {
2362                 shared = 1;
2363                 root_gen = trans->transid - 1;
2364         }
2365
2366         level = btrfs_header_level(buf);
2367         nritems = btrfs_header_nritems(buf);
2368
2369         if (level == 0) {
2370                 struct btrfs_leaf_ref *ref;
2371                 struct btrfs_extent_info *info;
2372
2373                 ref = btrfs_alloc_leaf_ref(root, nr_extents);
2374                 if (!ref) {
2375                         ret = -ENOMEM;
2376                         goto out;
2377                 }
2378
2379                 ref->root_gen = root_gen;
2380                 ref->bytenr = buf->start;
2381                 ref->owner = btrfs_header_owner(buf);
2382                 ref->generation = btrfs_header_generation(buf);
2383                 ref->nritems = nr_extents;
2384                 info = ref->extents;
2385
2386                 for (i = 0; nr_extents > 0 && i < nritems; i++) {
2387                         u64 disk_bytenr;
2388                         btrfs_item_key_to_cpu(buf, &key, i);
2389                         if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY)
2390                                 continue;
2391                         fi = btrfs_item_ptr(buf, i,
2392                                             struct btrfs_file_extent_item);
2393                         if (btrfs_file_extent_type(buf, fi) ==
2394                             BTRFS_FILE_EXTENT_INLINE)
2395                                 continue;
2396                         disk_bytenr = btrfs_file_extent_disk_bytenr(buf, fi);
2397                         if (disk_bytenr == 0)
2398                                 continue;
2399
2400                         info->bytenr = disk_bytenr;
2401                         info->num_bytes =
2402                                 btrfs_file_extent_disk_num_bytes(buf, fi);
2403                         info->objectid = key.objectid;
2404                         info->offset = key.offset;
2405                         info++;
2406                 }
2407
2408                 ret = btrfs_add_leaf_ref(root, ref, shared);
2409                 if (ret == -EEXIST && shared) {
2410                         struct btrfs_leaf_ref *old;
2411                         old = btrfs_lookup_leaf_ref(root, ref->bytenr);
2412                         BUG_ON(!old);
2413                         btrfs_remove_leaf_ref(root, old);
2414                         btrfs_free_leaf_ref(root, old);
2415                         ret = btrfs_add_leaf_ref(root, ref, shared);
2416                 }
2417                 WARN_ON(ret);
2418                 btrfs_free_leaf_ref(root, ref);
2419         }
2420 out:
2421         return ret;
2422 }
2423
2424 /* when a block goes through cow, we update the reference counts of
2425  * everything that block points to.  The internal pointers of the block
2426  * can be in just about any order, and it is likely to have clusters of
2427  * things that are close together and clusters of things that are not.
2428  *
2429  * To help reduce the seeks that come with updating all of these reference
2430  * counts, sort them by byte number before actual updates are done.
2431  *
2432  * struct refsort is used to match byte number to slot in the btree block.
2433  * we sort based on the byte number and then use the slot to actually
2434  * find the item.
2435  *
2436  * struct refsort is smaller than struct btrfs_item and smaller than
2437  * struct btrfs_key_ptr.  Since we're currently limited to the page size
2438  * for a btree block, there's no way for a kmalloc of refsorts for a
2439  * single node to be bigger than a page.
2440  */
2441 struct refsort {
2442         u64 bytenr;
2443         u32 slot;
2444 };
2445
2446 /*
2447  * for passing into sort()
2448  */
2449 static int refsort_cmp(const void *a_void, const void *b_void)
2450 {
2451         const struct refsort *a = a_void;
2452         const struct refsort *b = b_void;
2453
2454         if (a->bytenr < b->bytenr)
2455                 return -1;
2456         if (a->bytenr > b->bytenr)
2457                 return 1;
2458         return 0;
2459 }
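/*
 * Example pairing with sort() (a sketch; this whole region is compiled
 * out, and 'nritems' is assumed to be the block's item count):
 *
 *      struct refsort *sorted;
 *      sorted = kmalloc(sizeof(*sorted) * nritems, GFP_NOFS);
 *      ...fill sorted[i].bytenr and sorted[i].slot from the block...
 *      sort(sorted, nritems, sizeof(struct refsort), refsort_cmp, NULL);
 */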
2460 #endif
2461
2462 static int __btrfs_mod_ref(struct btrfs_trans_handle *trans,
2463                            struct btrfs_root *root,
2464                            struct extent_buffer *buf,
2465                            int full_backref, int inc)
2466 {
2467         u64 bytenr;
2468         u64 num_bytes;
2469         u64 parent;
2470         u64 ref_root;
2471         u32 nritems;
2472         struct btrfs_key key;
2473         struct btrfs_file_extent_item *fi;
2474         int i;
2475         int level;
2476         int ret = 0;
2477         int (*process_func)(struct btrfs_trans_handle *, struct btrfs_root *,
2478                             u64, u64, u64, u64, u64, u64);
2479
2480         ref_root = btrfs_header_owner(buf);
2481         nritems = btrfs_header_nritems(buf);
2482         level = btrfs_header_level(buf);
2483
2484         if (!root->ref_cows && level == 0)
2485                 return 0;
2486
2487         if (inc)
2488                 process_func = btrfs_inc_extent_ref;
2489         else
2490                 process_func = btrfs_free_extent;
2491
2492         if (full_backref)
2493                 parent = buf->start;
2494         else
2495                 parent = 0;
2496
2497         for (i = 0; i < nritems; i++) {
2498                 if (level == 0) {
2499                         btrfs_item_key_to_cpu(buf, &key, i);
2500                         if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY)
2501                                 continue;
2502                         fi = btrfs_item_ptr(buf, i,
2503                                             struct btrfs_file_extent_item);
2504                         if (btrfs_file_extent_type(buf, fi) ==
2505                             BTRFS_FILE_EXTENT_INLINE)
2506                                 continue;
2507                         bytenr = btrfs_file_extent_disk_bytenr(buf, fi);
2508                         if (bytenr == 0)
2509                                 continue;
2510
2511                         num_bytes = btrfs_file_extent_disk_num_bytes(buf, fi);
2512                         key.offset -= btrfs_file_extent_offset(buf, fi);
2513                         ret = process_func(trans, root, bytenr, num_bytes,
2514                                            parent, ref_root, key.objectid,
2515                                            key.offset);
2516                         if (ret)
2517                                 goto fail;
2518                 } else {
2519                         bytenr = btrfs_node_blockptr(buf, i);
2520                         num_bytes = btrfs_level_size(root, level - 1);
2521                         ret = process_func(trans, root, bytenr, num_bytes,
2522                                            parent, ref_root, level - 1, 0);
2523                         if (ret)
2524                                 goto fail;
2525                 }
2526         }
2527         return 0;
2528 fail:
2529         BUG();
2530         return ret;
2531 }
2532
2533 int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
2534                   struct extent_buffer *buf, int full_backref)
2535 {
2536         return __btrfs_mod_ref(trans, root, buf, full_backref, 1);
2537 }
2538
2539 int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
2540                   struct extent_buffer *buf, int full_backref)
2541 {
2542         return __btrfs_mod_ref(trans, root, buf, full_backref, 0);
2543 }
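/*
 * Sketch of the COW pattern these wrappers serve (hypothetical caller,
 * modeled loosely on block copy-on-write): every pointer in the new
 * copy gains a ref, and the pointers in the old shared block drop one:
 *
 *      ret = btrfs_inc_ref(trans, root, cow, 0);   // keyed refs
 *      ret = btrfs_dec_ref(trans, root, buf, 1);   // shared/full backref
 */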
2544
2545 static int write_one_cache_group(struct btrfs_trans_handle *trans,
2546                                  struct btrfs_root *root,
2547                                  struct btrfs_path *path,
2548                                  struct btrfs_block_group_cache *cache)
2549 {
2550         int ret;
2551         struct btrfs_root *extent_root = root->fs_info->extent_root;
2552         unsigned long bi;
2553         struct extent_buffer *leaf;
2554
2555         ret = btrfs_search_slot(trans, extent_root, &cache->key, path, 0, 1);
2556         if (ret < 0)
2557                 goto fail;
2558         BUG_ON(ret);
2559
2560         leaf = path->nodes[0];
2561         bi = btrfs_item_ptr_offset(leaf, path->slots[0]);
2562         write_extent_buffer(leaf, &cache->item, bi, sizeof(cache->item));
2563         btrfs_mark_buffer_dirty(leaf);
2564         btrfs_release_path(extent_root, path);
2565 fail:
2566         return ret;
2567 }
2571
2572 static struct btrfs_block_group_cache *
2573 next_block_group(struct btrfs_root *root,
2574                  struct btrfs_block_group_cache *cache)
2575 {
2576         struct rb_node *node;
2577         spin_lock(&root->fs_info->block_group_cache_lock);
2578         node = rb_next(&cache->cache_node);
2579         btrfs_put_block_group(cache);
2580         if (node) {
2581                 cache = rb_entry(node, struct btrfs_block_group_cache,
2582                                  cache_node);
2583                 btrfs_get_block_group(cache);
2584         } else
2585                 cache = NULL;
2586         spin_unlock(&root->fs_info->block_group_cache_lock);
2587         return cache;
2588 }
2589
2590 int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
2591                                    struct btrfs_root *root)
2592 {
2593         struct btrfs_block_group_cache *cache;
2594         int err = 0;
2595         struct btrfs_path *path;
2596         u64 last = 0;
2597
2598         path = btrfs_alloc_path();
2599         if (!path)
2600                 return -ENOMEM;
2601
2602         while (1) {
2603                 if (last == 0) {
2604                         err = btrfs_run_delayed_refs(trans, root,
2605                                                      (unsigned long)-1);
2606                         BUG_ON(err);
2607                 }
2608
2609                 cache = btrfs_lookup_first_block_group(root->fs_info, last);
2610                 while (cache) {
2611                         if (cache->dirty)
2612                                 break;
2613                         cache = next_block_group(root, cache);
2614                 }
2615                 if (!cache) {
2616                         if (last == 0)
2617                                 break;
2618                         last = 0;
2619                         continue;
2620                 }
2621
2622                 cache->dirty = 0;
2623                 last = cache->key.objectid + cache->key.offset;
2624
2625                 err = write_one_cache_group(trans, root, path, cache);
2626                 BUG_ON(err);
2627                 btrfs_put_block_group(cache);
2628         }
2629
2630         btrfs_free_path(path);
2631         return 0;
2632 }
2633
2634 int btrfs_extent_readonly(struct btrfs_root *root, u64 bytenr)
2635 {
2636         struct btrfs_block_group_cache *block_group;
2637         int readonly = 0;
2638
2639         block_group = btrfs_lookup_block_group(root->fs_info, bytenr);
2640         if (!block_group || block_group->ro)
2641                 readonly = 1;
2642         if (block_group)
2643                 btrfs_put_block_group(block_group);
2644         return readonly;
2645 }
2646
2647 static int update_space_info(struct btrfs_fs_info *info, u64 flags,
2648                              u64 total_bytes, u64 bytes_used,
2649                              struct btrfs_space_info **space_info)
2650 {
2651         struct btrfs_space_info *found;
2652         int i;
2653         int factor;
2654
2655         if (flags & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1 |
2656                      BTRFS_BLOCK_GROUP_RAID10))
2657                 factor = 2;
2658         else
2659                 factor = 1;
2660
2661         found = __find_space_info(info, flags);
2662         if (found) {
2663                 spin_lock(&found->lock);
2664                 found->total_bytes += total_bytes;
2665                 found->bytes_used += bytes_used;
2666                 found->disk_used += bytes_used * factor;
2667                 found->full = 0;
2668                 spin_unlock(&found->lock);
2669                 *space_info = found;
2670                 return 0;
2671         }
2672         found = kzalloc(sizeof(*found), GFP_NOFS);
2673         if (!found)
2674                 return -ENOMEM;
2675
2676         for (i = 0; i < BTRFS_NR_RAID_TYPES; i++)
2677                 INIT_LIST_HEAD(&found->block_groups[i]);
2678         init_rwsem(&found->groups_sem);
2679         spin_lock_init(&found->lock);
2680         found->flags = flags & (BTRFS_BLOCK_GROUP_DATA |
2681                                 BTRFS_BLOCK_GROUP_SYSTEM |
2682                                 BTRFS_BLOCK_GROUP_METADATA);
2683         found->total_bytes = total_bytes;
2684         found->bytes_used = bytes_used;
2685         found->disk_used = bytes_used * factor;
2686         found->bytes_pinned = 0;
2687         found->bytes_reserved = 0;
2688         found->bytes_readonly = 0;
2689         found->bytes_may_use = 0;
2690         found->full = 0;
2691         found->force_alloc = 0;
2692         *space_info = found;
2693         list_add_rcu(&found->list, &info->space_info);
2694         atomic_set(&found->caching_threads, 0);
2695         return 0;
2696 }
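/*
 * Example of the 'factor' accounting above (illustrative numbers):
 * adding a 1GiB RAID1 chunk with 256MiB already used records
 *
 *      total_bytes += 1GiB
 *      bytes_used  += 256MiB
 *      disk_used   += 512MiB   (factor == 2: each byte stored twice)
 */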
2697
2698 static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
2699 {
2700         u64 extra_flags = flags & (BTRFS_BLOCK_GROUP_RAID0 |
2701                                    BTRFS_BLOCK_GROUP_RAID1 |
2702                                    BTRFS_BLOCK_GROUP_RAID10 |
2703                                    BTRFS_BLOCK_GROUP_DUP);
2704         if (extra_flags) {
2705                 if (flags & BTRFS_BLOCK_GROUP_DATA)
2706                         fs_info->avail_data_alloc_bits |= extra_flags;
2707                 if (flags & BTRFS_BLOCK_GROUP_METADATA)
2708                         fs_info->avail_metadata_alloc_bits |= extra_flags;
2709                 if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
2710                         fs_info->avail_system_alloc_bits |= extra_flags;
2711         }
2712 }
2713
2714 u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags)
2715 {
2716         u64 num_devices = root->fs_info->fs_devices->rw_devices;
2717
2718         if (num_devices == 1)
2719                 flags &= ~(BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID0);
2720         if (num_devices < 4)
2721                 flags &= ~BTRFS_BLOCK_GROUP_RAID10;
2722
2723         if ((flags & BTRFS_BLOCK_GROUP_DUP) &&
2724             (flags & (BTRFS_BLOCK_GROUP_RAID1 |
2725                       BTRFS_BLOCK_GROUP_RAID10))) {
2726                 flags &= ~BTRFS_BLOCK_GROUP_DUP;
2727         }
2728
2729         if ((flags & BTRFS_BLOCK_GROUP_RAID1) &&
2730             (flags & BTRFS_BLOCK_GROUP_RAID10)) {
2731                 flags &= ~BTRFS_BLOCK_GROUP_RAID1;
2732         }
2733
2734         if ((flags & BTRFS_BLOCK_GROUP_RAID0) &&
2735             ((flags & BTRFS_BLOCK_GROUP_RAID1) |
2736              (flags & BTRFS_BLOCK_GROUP_RAID10) |
2737              (flags & BTRFS_BLOCK_GROUP_DUP)))
2738                 flags &= ~BTRFS_BLOCK_GROUP_RAID0;
2739         return flags;
2740 }
2741
2742 static u64 get_alloc_profile(struct btrfs_root *root, u64 flags)
2743 {
2744         if (flags & BTRFS_BLOCK_GROUP_DATA)
2745                 flags |= root->fs_info->avail_data_alloc_bits &
2746                          root->fs_info->data_alloc_profile;
2747         else if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
2748                 flags |= root->fs_info->avail_system_alloc_bits &
2749                          root->fs_info->system_alloc_profile;
2750         else if (flags & BTRFS_BLOCK_GROUP_METADATA)
2751                 flags |= root->fs_info->avail_metadata_alloc_bits &
2752                          root->fs_info->metadata_alloc_profile;
2753         return btrfs_reduce_alloc_profile(root, flags);
2754 }
2755
2756 static u64 btrfs_get_alloc_profile(struct btrfs_root *root, int data)
2757 {
2758         u64 flags;
2759
2760         if (data)
2761                 flags = BTRFS_BLOCK_GROUP_DATA;
2762         else if (root == root->fs_info->chunk_root)
2763                 flags = BTRFS_BLOCK_GROUP_SYSTEM;
2764         else
2765                 flags = BTRFS_BLOCK_GROUP_METADATA;
2766
2767         return get_alloc_profile(root, flags);
2768 }
2769
2770 void btrfs_set_inode_space_info(struct btrfs_root *root, struct inode *inode)
2771 {
2772         BTRFS_I(inode)->space_info = __find_space_info(root->fs_info,
2773                                                        BTRFS_BLOCK_GROUP_DATA);
2774 }
2775
2776 static u64 calculate_bytes_needed(struct btrfs_root *root, int num_items)
2777 {
2778         u64 num_bytes;
2779         int level;
2780
2781         level = BTRFS_MAX_LEVEL - 2;
2782         /*
2783          * NOTE: these calculations are absolutely the worst possible case.
2784          * This assumes that _every_ item we insert will require a new leaf, and
2785          * that the tree has grown to its maximum height.
2786          */
2787
2788         /*
2789          * for every item we insert we could insert both an extent item and an
2790          * extent ref item.  Then for every item we insert, we will need to cow
2791          * the original leaf, plus the leaves to the left and right of it.
2792          *
2793          * Unless we are talking about the extent root, then we just want the
2794          * number of items * 2, since we just need the extent item plus its ref.
2795          */
2796         if (root == root->fs_info->extent_root)
2797                 num_bytes = num_items * 2;
2798         else
2799                 num_bytes = (num_items + (2 * num_items)) * 3;
2800
2801         /*
2802          * num_bytes is the total number of leaves we could need times the leaf
2803          * size, and then for every leaf we could end up cow'ing 2 nodes per
2804          * level, down to the leaf level.
2805          */
2806         num_bytes = (num_bytes * root->leafsize) +
2807                 (num_bytes * (level * 2)) * root->nodesize;
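        /*
         * Worked example (assuming the common 4K leaf and node size): with
         * BTRFS_MAX_LEVEL == 8, level is 6.  For one item on a non-extent
         * root, num_bytes starts as 9 leaves, so the final reservation is
         * 9 * 4096 + 9 * (6 * 2) * 4096 = 479232 bytes.
         */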
2808
2809         return num_bytes;
2810 }
2811
2812 /*
2813  * Unreserve metadata space for delalloc.  If we have no more reserved
2814  * extents than outstanding extents, this function does nothing.
2815  */
2816 int btrfs_unreserve_metadata_for_delalloc(struct btrfs_root *root,
2817                                           struct inode *inode, int num_items)
2818 {
2819         struct btrfs_fs_info *info = root->fs_info;
2820         struct btrfs_space_info *meta_sinfo;
2821         u64 num_bytes;
2822         u64 alloc_target;
2823         bool bug = false;
2824
2825         /* get the space info for where the metadata will live */
2826         alloc_target = btrfs_get_alloc_profile(root, 0);
2827         meta_sinfo = __find_space_info(info, alloc_target);
2828
2829         num_bytes = calculate_bytes_needed(root->fs_info->extent_root,
2830                                            num_items);
2831
2832         spin_lock(&meta_sinfo->lock);
2833         spin_lock(&BTRFS_I(inode)->accounting_lock);
2834         if (BTRFS_I(inode)->reserved_extents <=
2835             BTRFS_I(inode)->outstanding_extents) {
2836                 spin_unlock(&BTRFS_I(inode)->accounting_lock);
2837                 spin_unlock(&meta_sinfo->lock);
2838                 return 0;
2839         }
2840         spin_unlock(&BTRFS_I(inode)->accounting_lock);
2841
2842         BTRFS_I(inode)->reserved_extents -= num_items;
2843         BUG_ON(BTRFS_I(inode)->reserved_extents < 0);
2844
2845         if (meta_sinfo->bytes_delalloc < num_bytes) {
2846                 bug = true;
2847                 meta_sinfo->bytes_delalloc = 0;
2848         } else {
2849                 meta_sinfo->bytes_delalloc -= num_bytes;
2850         }
2851         spin_unlock(&meta_sinfo->lock);
2852
2853         BUG_ON(bug);
2854
2855         return 0;
2856 }
2857
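/*
 * Set force_delalloc when delalloc reservations have consumed at least 80%
 * of the metadata space that is not already used, pinned, reserved or
 * otherwise spoken for, so writers start flushing before space runs out.
 */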
2858 static void check_force_delalloc(struct btrfs_space_info *meta_sinfo)
2859 {
2860         u64 thresh;
2861
2862         thresh = meta_sinfo->bytes_used + meta_sinfo->bytes_reserved +
2863                 meta_sinfo->bytes_pinned + meta_sinfo->bytes_readonly +
2864                 meta_sinfo->bytes_super + meta_sinfo->bytes_root +
2865                 meta_sinfo->bytes_may_use;
2866
2867         thresh = meta_sinfo->total_bytes - thresh;
2868         thresh *= 80;
2869         do_div(thresh, 100);
2870         if (thresh <= meta_sinfo->bytes_delalloc)
2871                 meta_sinfo->force_delalloc = 1;
2872         else
2873                 meta_sinfo->force_delalloc = 0;
2874 }
2875
2876 /*
2877  * Reserve metadata space for delalloc.
2878  */
2879 int btrfs_reserve_metadata_for_delalloc(struct btrfs_root *root,
2880                                         struct inode *inode, int num_items)
2881 {
2882         struct btrfs_fs_info *info = root->fs_info;
2883         struct btrfs_space_info *meta_sinfo;
2884         u64 num_bytes;
2885         u64 used;
2886         u64 alloc_target;
2887         int flushed = 0;
2888         int force_delalloc;
2889
2890         /* get the space info for where the metadata will live */
2891         alloc_target = btrfs_get_alloc_profile(root, 0);
2892         meta_sinfo = __find_space_info(info, alloc_target);
2893
2894         num_bytes = calculate_bytes_needed(root->fs_info->extent_root,
2895                                            num_items);
2896 again:
2897         spin_lock(&meta_sinfo->lock);
2898
2899         force_delalloc = meta_sinfo->force_delalloc;
2900
2901         if (unlikely(!meta_sinfo->bytes_root))
2902                 meta_sinfo->bytes_root = calculate_bytes_needed(root, 6);
2903
2904         if (!flushed)
2905                 meta_sinfo->bytes_delalloc += num_bytes;
2906
2907         used = meta_sinfo->bytes_used + meta_sinfo->bytes_reserved +
2908                 meta_sinfo->bytes_pinned + meta_sinfo->bytes_readonly +
2909                 meta_sinfo->bytes_super + meta_sinfo->bytes_root +
2910                 meta_sinfo->bytes_may_use + meta_sinfo->bytes_delalloc;
2911
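        /*
         * If the reservation does not fit, escalate in stages: first try to
         * allocate a new chunk, then flush this inode's dirty pages, then
         * shrink outstanding delalloc reservations.  Only when all of that
         * fails do we back out the reservation and return -ENOSPC.
         */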
2912         if (used > meta_sinfo->total_bytes) {
2913                 flushed++;
2914
2915                 if (flushed == 1) {
2916                         if (maybe_allocate_chunk(NULL, root, meta_sinfo,
2917                                                  num_bytes))
2918                                 goto again;
2919                         flushed++;
2920                 } else {
2921                         spin_unlock(&meta_sinfo->lock);
2922                 }
2923
2924                 if (flushed == 2) {
2925                         filemap_flush(inode->i_mapping);
2926                         goto again;
2927                 } else if (flushed == 3) {
2928                         shrink_delalloc(NULL, root, meta_sinfo, num_bytes);
2929                         goto again;
2930                 }
2931                 spin_lock(&meta_sinfo->lock);
2932                 meta_sinfo->bytes_delalloc -= num_bytes;
2933                 spin_unlock(&meta_sinfo->lock);
2934                 printk(KERN_ERR "enospc, has %d, reserved %d\n",
2935                        BTRFS_I(inode)->outstanding_extents,
2936                        BTRFS_I(inode)->reserved_extents);
2937                 dump_space_info(meta_sinfo, 0, 0);
2938                 return -ENOSPC;
2939         }
2940
2941         BTRFS_I(inode)->reserved_extents += num_items;
2942         check_force_delalloc(meta_sinfo);
2943         spin_unlock(&meta_sinfo->lock);
2944
2945         if (!flushed && force_delalloc)
2946                 filemap_flush(inode->i_mapping);
2947
2948         return 0;
2949 }
2950
2951 /*
2952  * unreserve the metadata space for num_items items.  This needs to
2953  * be paired with btrfs_reserve_metadata_space.
2954  *
2955  * NOTE: if you have the option, run this _AFTER_ you do a
2956  * btrfs_end_transaction, since btrfs_end_transaction will run delayed ref
2957  * operations which will result in more used metadata, so we want to make sure we
2958  * can do that without issue.
2959  */
2960 int btrfs_unreserve_metadata_space(struct btrfs_root *root, int num_items)
2961 {
2962         struct btrfs_fs_info *info = root->fs_info;
2963         struct btrfs_space_info *meta_sinfo;
2964         u64 num_bytes;
2965         u64 alloc_target;
2966         bool bug = false;
2967
2968         /* get the space info for where the metadata will live */
2969         alloc_target = btrfs_get_alloc_profile(root, 0);
2970         meta_sinfo = __find_space_info(info, alloc_target);
2971
2972         num_bytes = calculate_bytes_needed(root, num_items);
2973
2974         spin_lock(&meta_sinfo->lock);
2975         if (meta_sinfo->bytes_may_use < num_bytes) {
2976                 bug = true;
2977                 meta_sinfo->bytes_may_use = 0;
2978         } else {
2979                 meta_sinfo->bytes_may_use -= num_bytes;
2980         }
2981         spin_unlock(&meta_sinfo->lock);
2982
2983         BUG_ON(bug);
2984
2985         return 0;
2986 }
2987
2988 /*
2989  * Reserve some metadata space for use.  We'll calculate the worst case number
2990  * of bytes that would be needed to modify num_items items.  If we
2991  * have space, fantastic, if not, you get -ENOSPC.  Please call
2992  * btrfs_unreserve_metadata_space when you are done for the _SAME_ number of
2993  * items you reserved, since whatever metadata you needed should have already
2994  * been allocated.
2995  *
2996  * This will commit the transaction to make more space if we don't have enough
2997  * metadata space.  The only time we don't do this is if we're reserving space
2998  * inside of a transaction, in which case we will just return -ENOSPC and it is
2999  * the caller's responsibility to handle it properly.
3000  */
3001 int btrfs_reserve_metadata_space(struct btrfs_root *root, int num_items)
3002 {
3003         struct btrfs_fs_info *info = root->fs_info;
3004         struct btrfs_space_info *meta_sinfo;
3005         u64 num_bytes;
3006         u64 used;
3007         u64 alloc_target;
3008         int retries = 0;
3009
3010         /* get the space info for where the metadata will live */
3011         alloc_target = btrfs_get_alloc_profile(root, 0);
3012         meta_sinfo = __find_space_info(info, alloc_target);
3013
3014         num_bytes = calculate_bytes_needed(root, num_items);
3015 again:
3016         spin_lock(&meta_sinfo->lock);
3017
3018         if (unlikely(!meta_sinfo->bytes_root))
3019                 meta_sinfo->bytes_root = calculate_bytes_needed(root, 6);
3020
3021         if (!retries)
3022                 meta_sinfo->bytes_may_use += num_bytes;
3023
3024         used = meta_sinfo->bytes_used + meta_sinfo->bytes_reserved +
3025                 meta_sinfo->bytes_pinned + meta_sinfo->bytes_readonly +
3026                 meta_sinfo->bytes_super + meta_sinfo->bytes_root +
3027                 meta_sinfo->bytes_may_use + meta_sinfo->bytes_delalloc;
3028
3029         if (used > meta_sinfo->total_bytes) {
3030                 retries++;
3031                 if (retries == 1) {
3032                         if (maybe_allocate_chunk(NULL, root, meta_sinfo,
3033                                                  num_bytes))
3034                                 goto again;
3035                         retries++;
3036                 } else {
3037                         spin_unlock(&meta_sinfo->lock);
3038                 }
3039
3040                 if (retries == 2) {
3041                         shrink_delalloc(NULL, root, meta_sinfo, num_bytes);
3042                         goto again;
3043                 }
3044                 spin_lock(&meta_sinfo->lock);
3045                 meta_sinfo->bytes_may_use -= num_bytes;
3046                 spin_unlock(&meta_sinfo->lock);
3047
3048                 dump_space_info(meta_sinfo, 0, 0);
3049                 return -ENOSPC;
3050         }
3051
3052         check_force_delalloc(meta_sinfo);
3053         spin_unlock(&meta_sinfo->lock);
3054
3055         return 0;
3056 }
3057
3058 /*
3059  * This will check the space info that the inode allocates from to make sure
3060  * we have enough space for 'bytes'.
3061  */
3062 int btrfs_check_data_free_space(struct btrfs_root *root, struct inode *inode,
3063                                 u64 bytes)
3064 {
3065         struct btrfs_space_info *data_sinfo;
3066         u64 used;
3067         int ret = 0, committed = 0;
3068
3069         /* make sure bytes are sectorsize aligned */
3070         bytes = (bytes + root->sectorsize - 1) & ~((u64)root->sectorsize - 1);
3071
3072         data_sinfo = BTRFS_I(inode)->space_info;
3073         if (!data_sinfo)
3074                 goto alloc;
3075
3076 again:
3077         /* make sure we have enough space to handle the data first */
3078         spin_lock(&data_sinfo->lock);
3079         used = data_sinfo->bytes_used + data_sinfo->bytes_delalloc +
3080                 data_sinfo->bytes_reserved + data_sinfo->bytes_pinned +
3081                 data_sinfo->bytes_readonly + data_sinfo->bytes_may_use +
3082                 data_sinfo->bytes_super;
3083
3084         if (used + bytes > data_sinfo->total_bytes) {
3085                 struct btrfs_trans_handle *trans;
3086
3087                 /*
3088                  * if we don't have enough free bytes in this space then we need
3089                  * to alloc a new chunk.
3090                  */
3091                 if (!data_sinfo->full) {
3092                         u64 alloc_target;
3093
3094                         data_sinfo->force_alloc = 1;
3095                         spin_unlock(&data_sinfo->lock);
3096 alloc:
3097                         alloc_target = btrfs_get_alloc_profile(root, 1);
3098                         trans = btrfs_start_transaction(root, 1);
3099                         if (!trans)
3100                                 return -ENOMEM;
3101
3102                         ret = do_chunk_alloc(trans, root->fs_info->extent_root,
3103                                              bytes + 2 * 1024 * 1024,
3104                                              alloc_target, 0);
3105                         btrfs_end_transaction(trans, root);
3106                         if (ret)
3107                                 return ret;
3108
3109                         if (!data_sinfo) {
3110                                 btrfs_set_inode_space_info(root, inode);
3111                                 data_sinfo = BTRFS_I(inode)->space_info;
3112                         }
3113                         goto again;
3114                 }
3115                 spin_unlock(&data_sinfo->lock);
3116
3117                 /* commit the current transaction and try again */
3118                 if (!committed && !root->fs_info->open_ioctl_trans) {
3119                         committed = 1;
3120                         trans = btrfs_join_transaction(root, 1);
3121                         if (!trans)
3122                                 return -ENOMEM;
3123                         ret = btrfs_commit_transaction(trans, root);
3124                         if (ret)
3125                                 return ret;
3126                         goto again;
3127                 }
3128
3129                 printk(KERN_ERR "no space left, need %llu, %llu delalloc bytes"
3130                        ", %llu bytes_used, %llu bytes_reserved, "
3131                        "%llu bytes_pinned, %llu bytes_readonly, %llu may use "
3132                        "%llu total\n", (unsigned long long)bytes,
3133                        (unsigned long long)data_sinfo->bytes_delalloc,
3134                        (unsigned long long)data_sinfo->bytes_used,
3135                        (unsigned long long)data_sinfo->bytes_reserved,
3136                        (unsigned long long)data_sinfo->bytes_pinned,
3137                        (unsigned long long)data_sinfo->bytes_readonly,
3138                        (unsigned long long)data_sinfo->bytes_may_use,
3139                        (unsigned long long)data_sinfo->total_bytes);
3140                 return -ENOSPC;
3141         }
3142         data_sinfo->bytes_may_use += bytes;
3143         BTRFS_I(inode)->reserved_bytes += bytes;
3144         spin_unlock(&data_sinfo->lock);
3145
3146         return 0;
3147 }
3148
3149 /*
3150  * if there was an error for whatever reason after calling
3151  * btrfs_check_data_free_space, call this so we can cleanup the counters.
3152  */
3153 void btrfs_free_reserved_data_space(struct btrfs_root *root,
3154                                     struct inode *inode, u64 bytes)
3155 {
3156         struct btrfs_space_info *data_sinfo;
3157
3158         /* make sure bytes are sectorsize aligned */
3159         bytes = (bytes + root->sectorsize - 1) & ~((u64)root->sectorsize - 1);
3160
3161         data_sinfo = BTRFS_I(inode)->space_info;
3162         spin_lock(&data_sinfo->lock);
3163         data_sinfo->bytes_may_use -= bytes;
3164         BTRFS_I(inode)->reserved_bytes -= bytes;
3165         spin_unlock(&data_sinfo->lock);
3166 }
3167
3168 /* called when we are adding a delalloc extent to the inode's io_tree */
3169 void btrfs_delalloc_reserve_space(struct btrfs_root *root, struct inode *inode,
3170                                   u64 bytes)
3171 {
3172         struct btrfs_space_info *data_sinfo;
3173
3174         /* get the space info for where this inode will be storing its data */
3175         data_sinfo = BTRFS_I(inode)->space_info;
3176
3177         /* make sure we have enough space to handle the data first */
3178         spin_lock(&data_sinfo->lock);
3179         data_sinfo->bytes_delalloc += bytes;
3180
3181         /*
3182          * we are adding a delalloc extent without calling
3183          * btrfs_check_data_free_space first.  This happens on a weird
3184          * writepage condition, but shouldn't hurt our accounting
3185          */
3186         if (unlikely(bytes > BTRFS_I(inode)->reserved_bytes)) {
3187                 data_sinfo->bytes_may_use -= BTRFS_I(inode)->reserved_bytes;
3188                 BTRFS_I(inode)->reserved_bytes = 0;
3189         } else {
3190                 data_sinfo->bytes_may_use -= bytes;
3191                 BTRFS_I(inode)->reserved_bytes -= bytes;
3192         }
3193
3194         spin_unlock(&data_sinfo->lock);
3195 }
3196
3197 /* called when we are clearing a delalloc extent from the inode's io_tree */
3198 void btrfs_delalloc_free_space(struct btrfs_root *root, struct inode *inode,
3199                               u64 bytes)
3200 {
3201         struct btrfs_space_info *info;
3202
3203         info = BTRFS_I(inode)->space_info;
3204
3205         spin_lock(&info->lock);
3206         info->bytes_delalloc -= bytes;
3207         spin_unlock(&info->lock);
3208 }
3209
3210 static void force_metadata_allocation(struct btrfs_fs_info *info)
3211 {
3212         struct list_head *head = &info->space_info;
3213         struct btrfs_space_info *found;
3214
3215         rcu_read_lock();
3216         list_for_each_entry_rcu(found, head, list) {
3217                 if (found->flags & BTRFS_BLOCK_GROUP_METADATA)
3218                         found->force_alloc = 1;
3219         }
3220         rcu_read_unlock();
3221 }
3222
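/*
 * Decide whether allocating a new chunk is warranted: only say yes once the
 * space is both within 256MB of full and at least 80% committed
 * (div_factor(x, 8) is 80% of x), so mostly-empty space never grows chunks.
 */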
3223 static int should_alloc_chunk(struct btrfs_space_info *sinfo,
3224                               u64 alloc_bytes)
3225 {
3226         u64 num_bytes = sinfo->total_bytes - sinfo->bytes_readonly;
3227
3228         if (sinfo->bytes_used + sinfo->bytes_reserved +
3229             alloc_bytes + 256 * 1024 * 1024 < num_bytes)
3230                 return 0;
3231
3232         if (sinfo->bytes_used + sinfo->bytes_reserved +
3233             alloc_bytes < div_factor(num_bytes, 8))
3234                 return 0;
3235
3236         return 1;
3237 }
3238
3239 static int do_chunk_alloc(struct btrfs_trans_handle *trans,
3240                           struct btrfs_root *extent_root, u64 alloc_bytes,
3241                           u64 flags, int force)
3242 {
3243         struct btrfs_space_info *space_info;
3244         struct btrfs_fs_info *fs_info = extent_root->fs_info;
3245         int ret = 0;
3246
3247         mutex_lock(&fs_info->chunk_mutex);
3248
3249         flags = btrfs_reduce_alloc_profile(extent_root, flags);
3250
3251         space_info = __find_space_info(extent_root->fs_info, flags);
3252         if (!space_info) {
3253                 ret = update_space_info(extent_root->fs_info, flags,
3254                                         0, 0, &space_info);
3255                 BUG_ON(ret);
3256         }
3257         BUG_ON(!space_info);
3258
3259         spin_lock(&space_info->lock);
3260         if (space_info->force_alloc)
3261                 force = 1;
3262         if (space_info->full) {
3263                 spin_unlock(&space_info->lock);
3264                 goto out;
3265         }
3266
3267         if (!force && !should_alloc_chunk(space_info, alloc_bytes)) {
3268                 spin_unlock(&space_info->lock);
3269                 goto out;
3270         }
3271         spin_unlock(&space_info->lock);
3272
3273         /*
3274          * if we're doing a data chunk, go ahead and make sure that
3275          * we keep a reasonable number of metadata chunks allocated in the
3276          * FS as well.
3277          */
3278         if (flags & BTRFS_BLOCK_GROUP_DATA && fs_info->metadata_ratio) {
3279                 fs_info->data_chunk_allocations++;
3280                 if (!(fs_info->data_chunk_allocations %
3281                       fs_info->metadata_ratio))
3282                         force_metadata_allocation(fs_info);
3283         }
3284
3285         ret = btrfs_alloc_chunk(trans, extent_root, flags);
3286         spin_lock(&space_info->lock);
3287         if (ret)
3288                 space_info->full = 1;
3289         else
3290                 ret = 1;
3291         space_info->force_alloc = 0;
3292         spin_unlock(&space_info->lock);
3293 out:
3294         mutex_unlock(&extent_root->fs_info->chunk_mutex);
3295         return ret;
3296 }
3297
3298 static int maybe_allocate_chunk(struct btrfs_trans_handle *trans,
3299                                 struct btrfs_root *root,
3300                                 struct btrfs_space_info *sinfo, u64 num_bytes)
3301 {
3302         int ret;
3303         int end_trans = 0;
3304
3305         if (sinfo->full)
3306                 return 0;
3307
3308         spin_lock(&sinfo->lock);
3309         ret = should_alloc_chunk(sinfo, num_bytes + 2 * 1024 * 1024);
3310         spin_unlock(&sinfo->lock);
3311         if (!ret)
3312                 return 0;
3313
3314         if (!trans) {
3315                 trans = btrfs_join_transaction(root, 1);
3316                 BUG_ON(IS_ERR(trans));
3317                 end_trans = 1;
3318         }
3319
3320         ret = do_chunk_alloc(trans, root->fs_info->extent_root,
3321                              num_bytes + 2 * 1024 * 1024,
3322                              get_alloc_profile(root, sinfo->flags), 0);
3323
3324         if (end_trans)
3325                 btrfs_end_transaction(trans, root);
3326
3327         return ret == 1 ? 1 : 0;
3328 }
3329
3330 /*
3331  * shrink metadata reservation for delalloc
3332  */
3333 static int shrink_delalloc(struct btrfs_trans_handle *trans,
3334                            struct btrfs_root *root,
3335                            struct btrfs_space_info *sinfo, u64 to_reclaim)
3336 {
3337         u64 reserved;
3338         u64 max_reclaim;
3339         u64 reclaimed = 0;
3340         int pause = 1;
3341         int ret;
3342
3343         spin_lock(&sinfo->lock);
3344         reserved = sinfo->bytes_delalloc;
3345         spin_unlock(&sinfo->lock);
3346
3347         if (reserved == 0)
3348                 return 0;
3349
3350         max_reclaim = min(reserved, to_reclaim);
3351
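        /*
         * Kick writeback one delalloc inode at a time, backing off
         * exponentially (capped at HZ/10) whenever nothing could be started,
         * until the reservation shrinks by max_reclaim or no delalloc bytes
         * remain.
         */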
3352         while (1) {
3353                 ret = btrfs_start_one_delalloc_inode(root, trans ? 1 : 0);
3354                 if (!ret) {
3355                         __set_current_state(TASK_INTERRUPTIBLE);
3356                         schedule_timeout(pause);
3357                         pause <<= 1;
3358                         if (pause > HZ / 10)
3359                                 pause = HZ / 10;
3360                 } else {
3361                         pause = 1;
3362                 }
3363
3364                 spin_lock(&sinfo->lock);
3365                 if (reserved > sinfo->bytes_delalloc)
3366                         reclaimed = reserved - sinfo->bytes_delalloc;
3367                 reserved = sinfo->bytes_delalloc;
3368                 spin_unlock(&sinfo->lock);
3369
3370                 if (reserved == 0 || reclaimed >= max_reclaim)
3371                         break;
3372
3373                 if (trans && trans->transaction->blocked)
3374                         return -EAGAIN;
3375         }
3376         return reclaimed >= to_reclaim;
3377 }
3378
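/*
 * Decide whether a failed metadata reservation is worth retrying: try to
 * grow the space_info with a new chunk, then shrink delalloc reservations,
 * and as a last resort commit the transaction so pinned bytes are freed.
 */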
3379 static int should_retry_reserve(struct btrfs_trans_handle *trans,
3380                                 struct btrfs_root *root,
3381                                 struct btrfs_block_rsv *block_rsv,
3382                                 u64 num_bytes, int *retries)
3383 {
3384         struct btrfs_space_info *space_info = block_rsv->space_info;
3385         int ret;
3386
3387         if ((*retries) > 2)
3388                 return -ENOSPC;
3389
3390         ret = maybe_allocate_chunk(trans, root, space_info, num_bytes);
3391         if (ret)
3392                 return 1;
3393
3394         if (trans && trans->transaction->in_commit)
3395                 return -ENOSPC;
3396
3397         ret = shrink_delalloc(trans, root, space_info, num_bytes);
3398         if (ret)
3399                 return ret;
3400
3401         spin_lock(&space_info->lock);
3402         if (space_info->bytes_pinned < num_bytes)
3403                 ret = 1;
3404         spin_unlock(&space_info->lock);
3405         if (ret)
3406                 return -ENOSPC;
3407
3408         (*retries)++;
3409
3410         if (trans)
3411                 return -EAGAIN;
3412
3413         trans = btrfs_join_transaction(root, 1);
3414         BUG_ON(IS_ERR(trans));
3415         ret = btrfs_commit_transaction(trans, root);
3416         BUG_ON(ret);
3417
3418         return 1;
3419 }
3420
3421 static int reserve_metadata_bytes(struct btrfs_block_rsv *block_rsv,
3422                                   u64 num_bytes)
3423 {
3424         struct btrfs_space_info *space_info = block_rsv->space_info;
3425         u64 unused;
3426         int ret = -ENOSPC;
3427
3428         spin_lock(&space_info->lock);
3429         unused = space_info->bytes_used + space_info->bytes_reserved +
3430                  space_info->bytes_pinned + space_info->bytes_readonly;
3431
3432         if (unused < space_info->total_bytes)
3433                 unused = space_info->total_bytes - unused;
3434         else
3435                 unused = 0;
3436
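        /*
         * A reservation with priority >= 10 may take any free space.  Lower
         * priorities are scaled back: the free space plus what the rsv
         * already holds must outweigh the grown reservation by a factor of
         * 10 / priority before the request is granted.
         */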
3437         if (unused >= num_bytes) {
3438                 if (block_rsv->priority >= 10) {
3439                         space_info->bytes_reserved += num_bytes;
3440                         ret = 0;
3441                 } else {
3442                         if ((unused + block_rsv->reserved) *
3443                             block_rsv->priority >=
3444                             (num_bytes + block_rsv->reserved) * 10) {
3445                                 space_info->bytes_reserved += num_bytes;
3446                                 ret = 0;
3447                         }
3448                 }
3449         }
3450         spin_unlock(&space_info->lock);
3451
3452         return ret;
3453 }
3454
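/*
 * Pick the rsv an operation should charge: roots with ref_cows set use the
 * transaction's rsv, other roots use their own, and if neither is set we
 * fall back to the fs-wide empty_block_rsv.
 */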
3455 static struct btrfs_block_rsv *get_block_rsv(struct btrfs_trans_handle *trans,
3456                                              struct btrfs_root *root)
3457 {
3458         struct btrfs_block_rsv *block_rsv;
3459         if (root->ref_cows)
3460                 block_rsv = trans->block_rsv;
3461         else
3462                 block_rsv = root->block_rsv;
3463
3464         if (!block_rsv)
3465                 block_rsv = &root->fs_info->empty_block_rsv;
3466
3467         return block_rsv;
3468 }
3469
3470 static int block_rsv_use_bytes(struct btrfs_block_rsv *block_rsv,
3471                                u64 num_bytes)
3472 {
3473         int ret = -ENOSPC;
3474         spin_lock(&block_rsv->lock);
3475         if (block_rsv->reserved >= num_bytes) {
3476                 block_rsv->reserved -= num_bytes;
3477                 if (block_rsv->reserved < block_rsv->size)
3478                         block_rsv->full = 0;
3479                 ret = 0;
3480         }
3481         spin_unlock(&block_rsv->lock);
3482         return ret;
3483 }
3484
3485 static void block_rsv_add_bytes(struct btrfs_block_rsv *block_rsv,
3486                                 u64 num_bytes, int update_size)
3487 {
3488         spin_lock(&block_rsv->lock);
3489         block_rsv->reserved += num_bytes;
3490         if (update_size)
3491                 block_rsv->size += num_bytes;
3492         else if (block_rsv->reserved >= block_rsv->size)
3493                 block_rsv->full = 1;
3494         spin_unlock(&block_rsv->lock);
3495 }
3496
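/*
 * Shrink block_rsv by num_bytes ((u64)-1 releases everything).  Whatever
 * the rsv holds beyond its new size is handed to 'dest' if one is given,
 * otherwise it goes back to the owning space_info's reserved counter.
 */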
3497 void block_rsv_release_bytes(struct btrfs_block_rsv *block_rsv,
3498                              struct btrfs_block_rsv *dest, u64 num_bytes)
3499 {
3500         struct btrfs_space_info *space_info = block_rsv->space_info;
3501
3502         spin_lock(&block_rsv->lock);
3503         if (num_bytes == (u64)-1)
3504                 num_bytes = block_rsv->size;
3505         block_rsv->size -= num_bytes;
3506         if (block_rsv->reserved >= block_rsv->size) {
3507                 num_bytes = block_rsv->reserved - block_rsv->size;
3508                 block_rsv->reserved = block_rsv->size;
3509                 block_rsv->full = 1;
3510         } else {
3511                 num_bytes = 0;
3512         }
3513         spin_unlock(&block_rsv->lock);
3514
3515         if (num_bytes > 0) {
3516                 if (dest) {
3517                         block_rsv_add_bytes(dest, num_bytes, 0);
3518                 } else {
3519                         spin_lock(&space_info->lock);
3520                         space_info->bytes_reserved -= num_bytes;
3521                         spin_unlock(&space_info->lock);
3522                 }
3523         }
3524 }
3525
3526 static int block_rsv_migrate_bytes(struct btrfs_block_rsv *src,
3527                                    struct btrfs_block_rsv *dst, u64 num_bytes)
3528 {
3529         int ret;
3530
3531         ret = block_rsv_use_bytes(src, num_bytes);
3532         if (ret)
3533                 return ret;
3534
3535         block_rsv_add_bytes(dst, num_bytes, 1);
3536         return 0;
3537 }
3538
3539 void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv)
3540 {
3541         memset(rsv, 0, sizeof(*rsv));
3542         spin_lock_init(&rsv->lock);
3543         atomic_set(&rsv->usage, 1);
3544         rsv->priority = 6;
3545         INIT_LIST_HEAD(&rsv->list);
3546 }
3547
3548 struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root)
3549 {
3550         struct btrfs_block_rsv *block_rsv;
3551         struct btrfs_fs_info *fs_info = root->fs_info;
3552         u64 alloc_target;
3553
3554         block_rsv = kmalloc(sizeof(*block_rsv), GFP_NOFS);
3555         if (!block_rsv)
3556                 return NULL;
3557
3558         btrfs_init_block_rsv(block_rsv);
3559
3560         alloc_target = btrfs_get_alloc_profile(root, 0);
3561         block_rsv->space_info = __find_space_info(fs_info,
3562                                                   BTRFS_BLOCK_GROUP_METADATA);
3563
3564         return block_rsv;
3565 }
3566
3567 void btrfs_free_block_rsv(struct btrfs_root *root,
3568                           struct btrfs_block_rsv *rsv)
3569 {
3570         if (rsv && atomic_dec_and_test(&rsv->usage)) {
3571                 btrfs_block_rsv_release(root, rsv, (u64)-1);
3572                 if (!rsv->durable)
3573                         kfree(rsv);
3574         }
3575 }
3576
3577 /*
3578  * make the block_rsv struct able to capture freed space.
3579  * the captured space will be re-added to the block_rsv struct
3580  * after the transaction commits
3581  */
3582 void btrfs_add_durable_block_rsv(struct btrfs_fs_info *fs_info,
3583                                  struct btrfs_block_rsv *block_rsv)
3584 {
3585         block_rsv->durable = 1;
3586         mutex_lock(&fs_info->durable_block_rsv_mutex);
3587         list_add_tail(&block_rsv->list, &fs_info->durable_block_rsv_list);
3588         mutex_unlock(&fs_info->durable_block_rsv_mutex);
3589 }
3590
3591 int btrfs_block_rsv_add(struct btrfs_trans_handle *trans,
3592                         struct btrfs_root *root,
3593                         struct btrfs_block_rsv *block_rsv,
3594                         u64 num_bytes, int *retries)
3595 {
3596         int ret;
3597
3598         if (num_bytes == 0)
3599                 return 0;
3600 again:
3601         ret = reserve_metadata_bytes(block_rsv, num_bytes);
3602         if (!ret) {
3603                 block_rsv_add_bytes(block_rsv, num_bytes, 1);
3604                 return 0;
3605         }
3606
3607         ret = should_retry_reserve(trans, root, block_rsv, num_bytes, retries);
3608         if (ret > 0)
3609                 goto again;
3610
3611         return ret;
3612 }
3613
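/*
 * Make sure block_rsv holds at least min_reserved bytes, or min_factor
 * tenths of its target size.  Refill from the space_info when the rsv
 * allows it; if enough freed space is waiting on a durable rsv, fall back
 * to committing the transaction (or ask the caller to with -EAGAIN).
 */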
3614 int btrfs_block_rsv_check(struct btrfs_trans_handle *trans,
3615                           struct btrfs_root *root,
3616                           struct btrfs_block_rsv *block_rsv,
3617                           u64 min_reserved, int min_factor)
3618 {
3619         u64 num_bytes = 0;
3620         int commit_trans = 0;
3621         int ret = -ENOSPC;
3622
3623         if (!block_rsv)
3624                 return 0;
3625
3626         spin_lock(&block_rsv->lock);
3627         if (min_factor > 0)
3628                 num_bytes = div_factor(block_rsv->size, min_factor);
3629         if (min_reserved > num_bytes)
3630                 num_bytes = min_reserved;
3631
3632         if (block_rsv->reserved >= num_bytes) {
3633                 ret = 0;
3634         } else {
3635                 num_bytes -= block_rsv->reserved;
3636                 if (block_rsv->durable &&
3637                     block_rsv->freed[0] + block_rsv->freed[1] >= num_bytes)
3638                         commit_trans = 1;
3639         }
3640         spin_unlock(&block_rsv->lock);
3641         if (!ret)
3642                 return 0;
3643
3644         if (block_rsv->refill_used) {
3645                 ret = reserve_metadata_bytes(block_rsv, num_bytes);
3646                 if (!ret) {
3647                         block_rsv_add_bytes(block_rsv, num_bytes, 0);
3648                         return 0;
3649                 }
3650         }
3651
3652         if (commit_trans) {
3653                 if (trans)
3654                         return -EAGAIN;
3655
3656                 trans = btrfs_join_transaction(root, 1);
3657                 BUG_ON(IS_ERR(trans));
3658                 ret = btrfs_commit_transaction(trans, root);
3659                 return 0;
3660         }
3661
3662         WARN_ON(1);
3663         printk(KERN_INFO "block_rsv size %llu reserved %llu freed %llu %llu\n",
3664                 block_rsv->size, block_rsv->reserved,
3665                 block_rsv->freed[0], block_rsv->freed[1]);
3666
3667         return -ENOSPC;
3668 }
3669
3670 int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src_rsv,
3671                             struct btrfs_block_rsv *dst_rsv,
3672                             u64 num_bytes)
3673 {
3674         return block_rsv_migrate_bytes(src_rsv, dst_rsv, num_bytes);
3675 }
3676
3677 void btrfs_block_rsv_release(struct btrfs_root *root,
3678                              struct btrfs_block_rsv *block_rsv,
3679                              u64 num_bytes)
3680 {
3681         struct btrfs_block_rsv *global_rsv = &root->fs_info->global_block_rsv;
3682         if (global_rsv->full || global_rsv == block_rsv ||
3683             block_rsv->space_info != global_rsv->space_info)
3684                 global_rsv = NULL;
3685         block_rsv_release_bytes(block_rsv, global_rsv, num_bytes);
3686 }
3687
3688 static void init_global_block_rsv(struct btrfs_fs_info *fs_info)
3689 {
3690         struct btrfs_space_info *space_info;
3691
3692         space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_SYSTEM);
3693         fs_info->chunk_block_rsv.space_info = space_info;
3694         fs_info->chunk_block_rsv.priority = 10;
3695
3696         space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);
3697         fs_info->trans_block_rsv.space_info = space_info;
3698         fs_info->empty_block_rsv.space_info = space_info;
3699         fs_info->empty_block_rsv.priority = 10;
3700
3701         fs_info->chunk_root->block_rsv = &fs_info->chunk_block_rsv;
3702 }
3703
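/*
 * Account an allocation or free of [bytenr, bytenr + num_bytes) against the
 * super block totals and every block group the range touches.  Freed ranges
 * are pinned (and marked dirty in pinned_extents) rather than returned to
 * the free space cache directly.
 */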
3704 static int update_block_group(struct btrfs_trans_handle *trans,
3705                               struct btrfs_root *root,
3706                               u64 bytenr, u64 num_bytes, int alloc)
3707 {
3708         struct btrfs_block_group_cache *cache;
3709         struct btrfs_fs_info *info = root->fs_info;
3710         int factor;
3711         u64 total = num_bytes;
3712         u64 old_val;
3713         u64 byte_in_group;
3714
3715         /* block accounting for super block */
3716         spin_lock(&info->delalloc_lock);
3717         old_val = btrfs_super_bytes_used(&info->super_copy);
3718         if (alloc)
3719                 old_val += num_bytes;
3720         else
3721                 old_val -= num_bytes;
3722         btrfs_set_super_bytes_used(&info->super_copy, old_val);
3723         spin_unlock(&info->delalloc_lock);
3724
3725         while (total) {
3726                 cache = btrfs_lookup_block_group(info, bytenr);
3727                 if (!cache)
3728                         return -1;
3729                 if (cache->flags & (BTRFS_BLOCK_GROUP_DUP |
3730                                     BTRFS_BLOCK_GROUP_RAID1 |
3731                                     BTRFS_BLOCK_GROUP_RAID10))
3732                         factor = 2;
3733                 else
3734                         factor = 1;
3735                 byte_in_group = bytenr - cache->key.objectid;
3736                 WARN_ON(byte_in_group > cache->key.offset);
3737
3738                 spin_lock(&cache->space_info->lock);
3739                 spin_lock(&cache->lock);
3740                 cache->dirty = 1;
3741                 old_val = btrfs_block_group_used(&cache->item);
3742                 num_bytes = min(total, cache->key.offset - byte_in_group);
3743                 if (alloc) {
3744                         old_val += num_bytes;
3745                         btrfs_set_block_group_used(&cache->item, old_val);
3746                         cache->reserved -= num_bytes;
3747                         cache->space_info->bytes_reserved -= num_bytes;
3748                         cache->space_info->bytes_used += num_bytes;
3749                         cache->space_info->disk_used += num_bytes * factor;
3750                         spin_unlock(&cache->lock);
3751                         spin_unlock(&cache->space_info->lock);
3752                 } else {
3753                         old_val -= num_bytes;
3754                         btrfs_set_block_group_used(&cache->item, old_val);
3755                         cache->pinned += num_bytes;
3756                         cache->space_info->bytes_pinned += num_bytes;
3757                         cache->space_info->bytes_used -= num_bytes;
3758                         cache->space_info->disk_used -= num_bytes * factor;
3759                         spin_unlock(&cache->lock);
3760                         spin_unlock(&cache->space_info->lock);
3761
3762                         set_extent_dirty(info->pinned_extents,
3763                                          bytenr, bytenr + num_bytes - 1,
3764                                          GFP_NOFS | __GFP_NOFAIL);
3765                 }
3766                 btrfs_put_block_group(cache);
3767                 total -= num_bytes;
3768                 bytenr += num_bytes;
3769         }
3770         return 0;
3771 }
3772
3773 static u64 first_logical_byte(struct btrfs_root *root, u64 search_start)
3774 {
3775         struct btrfs_block_group_cache *cache;
3776         u64 bytenr;
3777
3778         cache = btrfs_lookup_first_block_group(root->fs_info, search_start);
3779         if (!cache)
3780                 return 0;
3781
3782         bytenr = cache->key.objectid;
3783         btrfs_put_block_group(cache);
3784
3785         return bytenr;
3786 }
3787
3788 static int pin_down_extent(struct btrfs_root *root,
3789                            struct btrfs_block_group_cache *cache,
3790                            u64 bytenr, u64 num_bytes, int reserved)
3791 {
3792         spin_lock(&cache->space_info->lock);
3793         spin_lock(&cache->lock);
3794         cache->pinned += num_bytes;
3795         cache->space_info->bytes_pinned += num_bytes;
3796         if (reserved) {
3797                 cache->reserved -= num_bytes;
3798                 cache->space_info->bytes_reserved -= num_bytes;
3799         }
3800         spin_unlock(&cache->lock);
3801         spin_unlock(&cache->space_info->lock);
3802
3803         set_extent_dirty(root->fs_info->pinned_extents, bytenr,
3804                          bytenr + num_bytes - 1, GFP_NOFS | __GFP_NOFAIL);
3805         return 0;
3806 }
3807
3808 /*
3809  * this function must be called within a transaction
3810  */
3811 int btrfs_pin_extent(struct btrfs_root *root,
3812                      u64 bytenr, u64 num_bytes, int reserved)
3813 {
3814         struct btrfs_block_group_cache *cache;
3815
3816         cache = btrfs_lookup_block_group(root->fs_info, bytenr);
3817         BUG_ON(!cache);
3818
3819         pin_down_extent(root, cache, bytenr, num_bytes, reserved);
3820
3821         btrfs_put_block_group(cache);
3822         return 0;
3823 }
3824
3825 /*
3826  * update the size of reserved extents.  This function may return -EAGAIN
3827  * if the block group is read-only and 'reserve' is true or 'sinfo' is false.
3828  */
3829 static int update_reserved_bytes(struct btrfs_block_group_cache *cache,
3830                                  u64 num_bytes, int reserve, int sinfo)
3831 {
3832         int ret = 0;
3833         if (sinfo) {
3834                 struct btrfs_space_info *space_info = cache->space_info;
3835                 spin_lock(&space_info->lock);
3836                 spin_lock(&cache->lock);
3837                 if (reserve) {
3838                         if (cache->ro) {
3839                                 ret = -EAGAIN;
3840                         } else {
3841                                 cache->reserved += num_bytes;
3842                                 space_info->bytes_reserved += num_bytes;
3843                         }
3844                 } else {
3845                         if (cache->ro)
3846                                 space_info->bytes_readonly += num_bytes;
3847                         cache->reserved -= num_bytes;
3848                         space_info->bytes_reserved -= num_bytes;
3849                 }
3850                 spin_unlock(&cache->lock);
3851                 spin_unlock(&space_info->lock);
3852         } else {
3853                 spin_lock(&cache->lock);
3854                 if (cache->ro) {
3855                         ret = -EAGAIN;
3856                 } else {
3857                         if (reserve)
3858                                 cache->reserved += num_bytes;
3859                         else
3860                                 cache->reserved -= num_bytes;
3861                 }
3862                 spin_unlock(&cache->lock);
3863         }
3864         return ret;
3865 }
3866
3867 int btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans,
3868                                 struct btrfs_root *root)
3869 {
3870         struct btrfs_fs_info *fs_info = root->fs_info;
3871         struct btrfs_caching_control *next;
3872         struct btrfs_caching_control *caching_ctl;
3873         struct btrfs_block_group_cache *cache;
3874
3875         down_write(&fs_info->extent_commit_sem);
3876
3877         list_for_each_entry_safe(caching_ctl, next,
3878                                  &fs_info->caching_block_groups, list) {
3879                 cache = caching_ctl->block_group;
3880                 if (block_group_cache_done(cache)) {
3881                         cache->last_byte_to_unpin = (u64)-1;
3882                         list_del_init(&caching_ctl->list);
3883                         put_caching_control(caching_ctl);
3884                 } else {
3885                         cache->last_byte_to_unpin = caching_ctl->progress;
3886                 }
3887         }
3888
3889         if (fs_info->pinned_extents == &fs_info->freed_extents[0])
3890                 fs_info->pinned_extents = &fs_info->freed_extents[1];
3891         else
3892                 fs_info->pinned_extents = &fs_info->freed_extents[0];
3893
3894         up_write(&fs_info->extent_commit_sem);
3895         return 0;
3896 }
3897
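/*
 * Walk [start, end] one block group at a time, returning unpinned space to
 * the free space cache (up to each group's last_byte_to_unpin) and moving
 * the bytes from pinned to the readonly or reserved counters as needed.
 */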
3898 static int unpin_extent_range(struct btrfs_root *root, u64 start, u64 end)
3899 {
3900         struct btrfs_fs_info *fs_info = root->fs_info;
3901         struct btrfs_block_group_cache *cache = NULL;
3902         u64 len;
3903
3904         while (start <= end) {
3905                 if (!cache ||
3906                     start >= cache->key.objectid + cache->key.offset) {
3907                         if (cache)
3908                                 btrfs_put_block_group(cache);
3909                         cache = btrfs_lookup_block_group(fs_info, start);
3910                         BUG_ON(!cache);
3911                 }
3912
3913                 len = cache->key.objectid + cache->key.offset - start;
3914                 len = min(len, end + 1 - start);
3915
3916                 if (start < cache->last_byte_to_unpin) {
3917                         len = min(len, cache->last_byte_to_unpin - start);
3918                         btrfs_add_free_space(cache, start, len);
3919                 }
3920
3921                 start += len;
3922
3923                 spin_lock(&cache->space_info->lock);
3924                 spin_lock(&cache->lock);
3925                 cache->pinned -= len;
3926                 cache->space_info->bytes_pinned -= len;
3927                 if (cache->ro) {
3928                         cache->space_info->bytes_readonly += len;
3929                 } else if (cache->reserved_pinned > 0) {
3930                         len = min(len, cache->reserved_pinned);
3931                         cache->reserved_pinned -= len;
3932                         cache->space_info->bytes_reserved += len;
3933                 }
3934                 spin_unlock(&cache->lock);
3935                 spin_unlock(&cache->space_info->lock);
3936         }
3937
3938         if (cache)
3939                 btrfs_put_block_group(cache);
3940         return 0;
3941 }
3942
3943 int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
3944                                struct btrfs_root *root)
3945 {
3946         struct btrfs_fs_info *fs_info = root->fs_info;
3947         struct extent_io_tree *unpin;
3948         struct btrfs_block_rsv *block_rsv;
3949         struct btrfs_block_rsv *next_rsv;
3950         u64 start;
3951         u64 end;
3952         int idx;
3953         int ret;
3954
3955         if (fs_info->pinned_extents == &fs_info->freed_extents[0])
3956                 unpin = &fs_info->freed_extents[1];
3957         else
3958                 unpin = &fs_info->freed_extents[0];
3959
3960         while (1) {
3961                 ret = find_first_extent_bit(unpin, 0, &start, &end,
3962                                             EXTENT_DIRTY);
3963                 if (ret)
3964                         break;
3965
3966                 ret = btrfs_discard_extent(root, start, end + 1 - start);
3967
3968                 clear_extent_dirty(unpin, start, end, GFP_NOFS);
3969                 unpin_extent_range(root, start, end);
3970                 cond_resched();
3971         }
3972
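        /*
         * Space freed during this transaction was recorded per-transaction
         * in freed[transid & 1]; now that the commit is done, hand it back
         * to each durable rsv and drop any rsv whose last user is gone.
         */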
3973         mutex_lock(&fs_info->durable_block_rsv_mutex);
3974         list_for_each_entry_safe(block_rsv, next_rsv,
3975                                  &fs_info->durable_block_rsv_list, list) {
3976
3977                 idx = trans->transid & 0x1;
3978                 if (block_rsv->freed[idx] > 0) {
3979                         block_rsv_add_bytes(block_rsv,
3980                                             block_rsv->freed[idx], 0);
3981                         block_rsv->freed[idx] = 0;
3982                 }
3983                 if (atomic_read(&block_rsv->usage) == 0) {
3984                         btrfs_block_rsv_release(root, block_rsv, (u64)-1);
3985
3986                         if (block_rsv->freed[0] == 0 &&
3987                             block_rsv->freed[1] == 0) {
3988                                 list_del_init(&block_rsv->list);
3989                                 kfree(block_rsv);
3990                         }
3991                 } else {
3992                         btrfs_block_rsv_release(root, block_rsv, 0);
3993                 }
3994         }
3995         mutex_unlock(&fs_info->durable_block_rsv_mutex);
3996
3997         return 0;
3998 }
3999
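/*
 * Drop refs_to_drop references to the extent at bytenr.  When the last
 * reference goes away, the extent item (and any separate backref item) is
 * deleted, csums are removed for data extents, and the owning block
 * group's accounting is updated.
 */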
4000 static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
4001                                 struct btrfs_root *root,
4002                                 u64 bytenr, u64 num_bytes, u64 parent,
4003                                 u64 root_objectid, u64 owner_objectid,
4004                                 u64 owner_offset, int refs_to_drop,
4005                                 struct btrfs_delayed_extent_op *extent_op)
4006 {
4007         struct btrfs_key key;
4008         struct btrfs_path *path;
4009         struct btrfs_fs_info *info = root->fs_info;
4010         struct btrfs_root *extent_root = info->extent_root;
4011         struct extent_buffer *leaf;
4012         struct btrfs_extent_item *ei;
4013         struct btrfs_extent_inline_ref *iref;
4014         int ret;
4015         int is_data;
4016         int extent_slot = 0;
4017         int found_extent = 0;
4018         int num_to_del = 1;
4019         u32 item_size;
4020         u64 refs;
4021
4022         path = btrfs_alloc_path();
4023         if (!path)
4024                 return -ENOMEM;
4025
4026         path->reada = 1;
4027         path->leave_spinning = 1;
4028
4029         is_data = owner_objectid >= BTRFS_FIRST_FREE_OBJECTID;
4030         BUG_ON(!is_data && refs_to_drop != 1);
4031
4032         ret = lookup_extent_backref(trans, extent_root, path, &iref,
4033                                     bytenr, num_bytes, parent,
4034                                     root_objectid, owner_objectid,
4035                                     owner_offset);
4036         if (ret == 0) {
4037                 extent_slot = path->slots[0];
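                /*
                 * lookup_extent_backref left us on the backref item; the
                 * extent item itself lives in the same leaf at or before
                 * that slot, so scan back a few slots to find it.
                 */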
4038                 while (extent_slot >= 0) {
4039                         btrfs_item_key_to_cpu(path->nodes[0], &key,
4040                                               extent_slot);
4041                         if (key.objectid != bytenr)
4042                                 break;
4043                         if (key.type == BTRFS_EXTENT_ITEM_KEY &&
4044                             key.offset == num_bytes) {
4045                                 found_extent = 1;
4046                                 break;
4047                         }
4048                         if (path->slots[0] - extent_slot > 5)
4049                                 break;
4050                         extent_slot--;
4051                 }
4052 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
4053                 item_size = btrfs_item_size_nr(path->nodes[0], extent_slot);
4054                 if (found_extent && item_size < sizeof(*ei))
4055                         found_extent = 0;
4056 #endif
4057                 if (!found_extent) {
4058                         BUG_ON(iref);
4059                         ret = remove_extent_backref(trans, extent_root, path,
4060                                                     NULL, refs_to_drop,
4061                                                     is_data);
4062                         BUG_ON(ret);
4063                         btrfs_release_path(extent_root, path);
4064                         path->leave_spinning = 1;
4065
4066                         key.objectid = bytenr;
4067                         key.type = BTRFS_EXTENT_ITEM_KEY;
4068                         key.offset = num_bytes;
4069
4070                         ret = btrfs_search_slot(trans, extent_root,
4071                                                 &key, path, -1, 1);
4072                         if (ret) {
4073                                 printk(KERN_ERR "umm, got %d back from search"
4074                                        ", was looking for %llu\n", ret,
4075                                        (unsigned long long)bytenr);
4076                                 btrfs_print_leaf(extent_root, path->nodes[0]);
4077                         }
4078                         BUG_ON(ret);
4079                         extent_slot = path->slots[0];
4080                 }
4081         } else {
4082                 btrfs_print_leaf(extent_root, path->nodes[0]);
4083                 WARN_ON(1);
4084                 printk(KERN_ERR "btrfs unable to find ref byte nr %llu "
4085                        "parent %llu root %llu  owner %llu offset %llu\n",
4086                        (unsigned long long)bytenr,
4087                        (unsigned long long)parent,
4088                        (unsigned long long)root_objectid,
4089                        (unsigned long long)owner_objectid,
4090                        (unsigned long long)owner_offset);
4091         }
4092
4093         leaf = path->nodes[0];
4094         item_size = btrfs_item_size_nr(leaf, extent_slot);
4095 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
4096         if (item_size < sizeof(*ei)) {
4097                 BUG_ON(found_extent || extent_slot != path->slots[0]);
4098                 ret = convert_extent_item_v0(trans, extent_root, path,
4099                                              owner_objectid, 0);
4100                 BUG_ON(ret < 0);
4101
4102                 btrfs_release_path(extent_root, path);
4103                 path->leave_spinning = 1;
4104
4105                 key.objectid = bytenr;
4106                 key.type = BTRFS_EXTENT_ITEM_KEY;
4107                 key.offset = num_bytes;
4108
4109                 ret = btrfs_search_slot(trans, extent_root, &key, path,
4110                                         -1, 1);
4111                 if (ret) {
4112                         printk(KERN_ERR "umm, got %d back from search"
4113                                ", was looking for %llu\n", ret,
4114                                (unsigned long long)bytenr);
4115                         btrfs_print_leaf(extent_root, path->nodes[0]);
4116                 }
4117                 BUG_ON(ret);
4118                 extent_slot = path->slots[0];
4119                 leaf = path->nodes[0];
4120                 item_size = btrfs_item_size_nr(leaf, extent_slot);
4121         }
4122 #endif
4123         BUG_ON(item_size < sizeof(*ei));
4124         ei = btrfs_item_ptr(leaf, extent_slot,
4125                             struct btrfs_extent_item);
4126         if (owner_objectid < BTRFS_FIRST_FREE_OBJECTID) {
4127                 struct btrfs_tree_block_info *bi;
4128                 BUG_ON(item_size < sizeof(*ei) + sizeof(*bi));
4129                 bi = (struct btrfs_tree_block_info *)(ei + 1);
4130                 WARN_ON(owner_objectid != btrfs_tree_block_level(leaf, bi));
4131         }
4132
4133         refs = btrfs_extent_refs(leaf, ei);
4134         BUG_ON(refs < refs_to_drop);
4135         refs -= refs_to_drop;
4136
4137         if (refs > 0) {
4138                 if (extent_op)
4139                         __run_delayed_extent_op(extent_op, leaf, ei);
4140                 /*
4141                  * In the case of an inline back ref, the reference count will
4142                  * be updated by remove_extent_backref
4143                  */
4144                 if (iref) {
4145                         BUG_ON(!found_extent);
4146                 } else {
4147                         btrfs_set_extent_refs(leaf, ei, refs);
4148                         btrfs_mark_buffer_dirty(leaf);
4149                 }
4150                 if (found_extent) {
4151                         ret = remove_extent_backref(trans, extent_root, path,
4152                                                     iref, refs_to_drop,
4153                                                     is_data);
4154                         BUG_ON(ret);
4155                 }
4156         } else {
4157                 if (found_extent) {
4158                         BUG_ON(is_data && refs_to_drop !=
4159                                extent_data_ref_count(root, path, iref));
4160                         if (iref) {
4161                                 BUG_ON(path->slots[0] != extent_slot);
4162                         } else {
4163                                 BUG_ON(path->slots[0] != extent_slot + 1);
4164                                 path->slots[0] = extent_slot;
4165                                 num_to_del = 2;
4166                         }
4167                 }
4168
4169                 ret = btrfs_del_items(trans, extent_root, path, path->slots[0],
4170                                       num_to_del);
4171                 BUG_ON(ret);
4172                 btrfs_release_path(extent_root, path);
4173
4174                 if (is_data) {
4175                         ret = btrfs_del_csums(trans, root, bytenr, num_bytes);
4176                         BUG_ON(ret);
4177                 } else {
4178                         invalidate_mapping_pages(info->btree_inode->i_mapping,
4179                              bytenr >> PAGE_CACHE_SHIFT,
4180                              (bytenr + num_bytes - 1) >> PAGE_CACHE_SHIFT);
4181                 }
4182
4183                 ret = update_block_group(trans, root, bytenr, num_bytes, 0);
4184                 BUG_ON(ret);
4185         }
4186         btrfs_free_path(path);
4187         return ret;
4188 }
4189
4190 /*
4191  * when we free a block, it is possible (and likely) that we free the last
4192  * delayed ref for that extent as well.  This searches the delayed ref tree for
4193  * a given extent, and if there are no other delayed refs to be processed, it
4194  * removes it from the tree.
4195  */
4196 static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans,
4197                                       struct btrfs_root *root, u64 bytenr)
4198 {
4199         struct btrfs_delayed_ref_head *head;
4200         struct btrfs_delayed_ref_root *delayed_refs;
4201         struct btrfs_delayed_ref_node *ref;
4202         struct rb_node *node;
4203         int ret = 0;
4204
4205         delayed_refs = &trans->transaction->delayed_refs;
4206         spin_lock(&delayed_refs->lock);
4207         head = btrfs_find_delayed_ref_head(trans, bytenr);
4208         if (!head)
4209                 goto out;
4210
4211         node = rb_prev(&head->node.rb_node);
4212         if (!node)
4213                 goto out;
4214
4215         ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node);
4216
4217         /* there are still entries for this ref, we can't drop it */
4218         if (ref->bytenr == bytenr)
4219                 goto out;
4220
4221         if (head->extent_op) {
4222                 if (!head->must_insert_reserved)
4223                         goto out;
4224                 kfree(head->extent_op);
4225                 head->extent_op = NULL;
4226         }
4227
4228         /*
4229          * waiting for the lock here would deadlock.  If someone else has it
4230          * locked, they are already in the process of dropping it anyway
4231          */
4232         if (!mutex_trylock(&head->mutex))
4233                 goto out;
4234
4235         /*
4236          * at this point we have a head with no other entries.  Go
4237          * ahead and process it.
4238          */
4239         head->node.in_tree = 0;
4240         rb_erase(&head->node.rb_node, &delayed_refs->root);
4241
4242         delayed_refs->num_entries--;
4243
4244         /*
4245          * we don't take a ref on the node because we're removing it from the
4246          * tree, so we just steal the ref the tree was holding.
4247          */
4248         delayed_refs->num_heads--;
4249         if (list_empty(&head->cluster))
4250                 delayed_refs->num_heads_ready--;
4251
4252         list_del_init(&head->cluster);
4253         spin_unlock(&delayed_refs->lock);
4254
4255         BUG_ON(head->extent_op);
4256         if (head->must_insert_reserved)
4257                 ret = 1;
4258
4259         mutex_unlock(&head->mutex);
4260         btrfs_put_delayed_ref(&head->node);
4261         return ret;
4262 out:
4263         spin_unlock(&delayed_refs->lock);
4264         return 0;
4265 }
4266
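/*
 * free a tree block.  A delayed ref dropping this root's reference is
 * queued first (except for tree log blocks); when last_ref is set, the
 * block is either handed straight back to the free space cache (it was
 * allocated in this transaction and never written to disk) or left to
 * be pinned until the transaction commits.
 */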
4267 void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
4268                            struct btrfs_root *root,
4269                            struct extent_buffer *buf,
4270                            u64 parent, int last_ref)
4271 {
4272         struct btrfs_block_rsv *block_rsv;
4273         struct btrfs_block_group_cache *cache = NULL;
4274         int ret;
4275
4276         if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) {
4277                 ret = btrfs_add_delayed_tree_ref(trans, buf->start, buf->len,
4278                                                 parent, root->root_key.objectid,
4279                                                 btrfs_header_level(buf),
4280                                                 BTRFS_DROP_DELAYED_REF, NULL);
4281                 BUG_ON(ret);
4282         }
4283
4284         if (!last_ref)
4285                 return;
4286
4287         block_rsv = get_block_rsv(trans, root);
4288         cache = btrfs_lookup_block_group(root->fs_info, buf->start);
4289         BUG_ON(block_rsv->space_info != cache->space_info);
4290
4291         if (btrfs_header_generation(buf) == trans->transid) {
4292                 if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) {
4293                         ret = check_ref_cleanup(trans, root, buf->start);
4294                         if (!ret)
4295                                 goto pin;
4296                 }
4297
4298                 if (btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) {
4299                         pin_down_extent(root, cache, buf->start, buf->len, 1);
4300                         goto pin;
4301                 }
4302
4303                 WARN_ON(test_bit(EXTENT_BUFFER_DIRTY, &buf->bflags));
4304
4305                 btrfs_add_free_space(cache, buf->start, buf->len);
4306                 ret = update_reserved_bytes(cache, buf->len, 0, 0);
4307                 if (ret == -EAGAIN) {
4308                         /* block group became read-only */
4309                         update_reserved_bytes(cache, buf->len, 0, 1);
4310                         goto out;
4311                 }
4312
4313                 ret = 1;
4314                 spin_lock(&block_rsv->lock);
4315                 if (block_rsv->reserved < block_rsv->size) {
4316                         block_rsv->reserved += buf->len;
4317                         ret = 0;
4318                 }
4319                 spin_unlock(&block_rsv->lock);
4320
4321                 if (ret) {
4322                         spin_lock(&cache->space_info->lock);
4323                         cache->space_info->bytes_reserved -= buf->len;
4324                         spin_unlock(&cache->space_info->lock);
4325                 }
4326                 goto out;
4327         }
4328 pin:
4329         if (block_rsv->durable && !cache->ro) {
4330                 ret = 0;
4331                 spin_lock(&cache->lock);
4332                 if (!cache->ro) {
4333                         cache->reserved_pinned += buf->len;
4334                         ret = 1;
4335                 }
4336                 spin_unlock(&cache->lock);
4337
4338                 if (ret) {
4339                         spin_lock(&block_rsv->lock);
4340                         block_rsv->freed[trans->transid & 0x1] += buf->len;
4341                         spin_unlock(&block_rsv->lock);
4342                 }
4343         }
4344 out:
4345         btrfs_put_block_group(cache);
4346 }
4347
4348 int btrfs_free_extent(struct btrfs_trans_handle *trans,
4349                       struct btrfs_root *root,
4350                       u64 bytenr, u64 num_bytes, u64 parent,
4351                       u64 root_objectid, u64 owner, u64 offset)
4352 {
4353         int ret;
4354
4355         /*
4356          * tree log blocks never actually go into the extent allocation
4357          * tree, just update pinning info and exit early.
4358          */
4359         if (root_objectid == BTRFS_TREE_LOG_OBJECTID) {
4360                 WARN_ON(owner >= BTRFS_FIRST_FREE_OBJECTID);
4361                 /* unlocks the pinned mutex */
4362                 btrfs_pin_extent(root, bytenr, num_bytes, 1);
4363                 ret = 0;
4364         } else if (owner < BTRFS_FIRST_FREE_OBJECTID) {
4365                 ret = btrfs_add_delayed_tree_ref(trans, bytenr, num_bytes,
4366                                         parent, root_objectid, (int)owner,
4367                                         BTRFS_DROP_DELAYED_REF, NULL);
4368                 BUG_ON(ret);
4369         } else {
4370                 ret = btrfs_add_delayed_data_ref(trans, bytenr, num_bytes,
4371                                         parent, root_objectid, owner,
4372                                         offset, BTRFS_DROP_DELAYED_REF, NULL);
4373                 BUG_ON(ret);
4374         }
4375         return ret;
4376 }
4377
4378 static u64 stripe_align(struct btrfs_root *root, u64 val)
4379 {
4380         u64 mask = ((u64)root->stripesize - 1);
4381         u64 ret = (val + mask) & ~mask;
4382         return ret;
4383 }
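/*
 * Illustrative example (not from the original source): with a stripesize
 * of 64K the mask is 0xffff, so stripe_align() rounds an offset up to
 * the next 64K boundary:
 *
 *   stripe_align(root, 0x12345) == 0x20000
 *   stripe_align(root, 0x20000) == 0x20000
 */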
4384
4385 /*
4386  * when we wait for progress in the block group caching, it's because
4387  * our allocation attempt failed at least once.  So, we must sleep
4388  * and let some progress happen before we try again.
4389  *
4390  * This function will sleep at least once waiting for new free space to
4391  * show up, and then it will check the block group free space numbers
4392  * for our min num_bytes.  Another option is to have it go ahead
4393  * and look in the rbtree for a free extent of a given size, but this
4394  * is a good start.
4395  */
4396 static noinline int
4397 wait_block_group_cache_progress(struct btrfs_block_group_cache *cache,
4398                                 u64 num_bytes)
4399 {
4400         struct btrfs_caching_control *caching_ctl;
4401         DEFINE_WAIT(wait);
4402
4403         caching_ctl = get_caching_control(cache);
4404         if (!caching_ctl)
4405                 return 0;
4406
4407         wait_event(caching_ctl->wait, block_group_cache_done(cache) ||
4408                    (cache->free_space >= num_bytes));
4409
4410         put_caching_control(caching_ctl);
4411         return 0;
4412 }
4413
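/*
 * same as above, but waits until the block group is fully cached instead
 * of just waiting for some progress to be made.
 */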
4414 static noinline int
4415 wait_block_group_cache_done(struct btrfs_block_group_cache *cache)
4416 {
4417         struct btrfs_caching_control *caching_ctl;
4418         DEFINE_WAIT(wait);
4419
4420         caching_ctl = get_caching_control(cache);
4421         if (!caching_ctl)
4422                 return 0;
4423
4424         wait_event(caching_ctl->wait, block_group_cache_done(cache));
4425
4426         put_caching_control(caching_ctl);
4427         return 0;
4428 }
4429
4430 static int get_block_group_index(struct btrfs_block_group_cache *cache)
4431 {
4432         int index;
4433         if (cache->flags & BTRFS_BLOCK_GROUP_RAID10)
4434                 index = 0;
4435         else if (cache->flags & BTRFS_BLOCK_GROUP_RAID1)
4436                 index = 1;
4437         else if (cache->flags & BTRFS_BLOCK_GROUP_DUP)
4438                 index = 2;
4439         else if (cache->flags & BTRFS_BLOCK_GROUP_RAID0)
4440                 index = 3;
4441         else
4442                 index = 4;
4443         return index;
4444 }
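/*
 * The index returned above selects one of the per-RAID-level lists in
 * space_info->block_groups[]; find_free_extent() walks the lists in this
 * order (RAID10, RAID1, DUP, RAID0, single) and moves on to the next
 * index when an allocation attempt comes up empty.
 */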
4445
4446 enum btrfs_loop_type {
4447         LOOP_FIND_IDEAL = 0,
4448         LOOP_CACHING_NOWAIT = 1,
4449         LOOP_CACHING_WAIT = 2,
4450         LOOP_ALLOC_CHUNK = 3,
4451         LOOP_NO_EMPTY_SIZE = 4,
4452 };
4453
4454 /*
4455  * walks the btree of allocated extents and finds a hole of a given size.
4456  * The key ins is changed to record the hole:
4457  * ins->objectid == block start
4458  * ins->flags == BTRFS_EXTENT_ITEM_KEY
4459  * ins->offset == number of blocks
4460  * Any available blocks before search_start are skipped.
4461  */
4462 static noinline int find_free_extent(struct btrfs_trans_handle *trans,
4463                                      struct btrfs_root *orig_root,
4464                                      u64 num_bytes, u64 empty_size,
4465                                      u64 search_start, u64 search_end,
4466                                      u64 hint_byte, struct btrfs_key *ins,
4467                                      int data)
4468 {
4469         int ret = 0;
4470         struct btrfs_root *root = orig_root->fs_info->extent_root;
4471         struct btrfs_free_cluster *last_ptr = NULL;
4472         struct btrfs_block_group_cache *block_group = NULL;
4473         int empty_cluster = 2 * 1024 * 1024;
4474         int allowed_chunk_alloc = 0;
4475         int done_chunk_alloc = 0;
4476         struct btrfs_space_info *space_info;
4477         int last_ptr_loop = 0;
4478         int loop = 0;
4479         int index = 0;
4480         bool found_uncached_bg = false;
4481         bool failed_cluster_refill = false;
4482         bool failed_alloc = false;
4483         u64 ideal_cache_percent = 0;
4484         u64 ideal_cache_offset = 0;
4485
4486         WARN_ON(num_bytes < root->sectorsize);
4487         btrfs_set_key_type(ins, BTRFS_EXTENT_ITEM_KEY);
4488         ins->objectid = 0;
4489         ins->offset = 0;
4490
4491         space_info = __find_space_info(root->fs_info, data);
4492         if (!space_info) {
4493                 printk(KERN_ERR "No space info for %d\n", data);
4494                 return -ENOSPC;
4495         }
4496
4497         if (orig_root->ref_cows || empty_size)
4498                 allowed_chunk_alloc = 1;
4499
4500         if (data & BTRFS_BLOCK_GROUP_METADATA) {
4501                 last_ptr = &root->fs_info->meta_alloc_cluster;
4502                 if (!btrfs_test_opt(root, SSD))
4503                         empty_cluster = 64 * 1024;
4504         }
4505
4506         if ((data & BTRFS_BLOCK_GROUP_DATA) && btrfs_test_opt(root, SSD)) {
4507                 last_ptr = &root->fs_info->data_alloc_cluster;
4508         }
4509
4510         if (last_ptr) {
4511                 spin_lock(&last_ptr->lock);
4512                 if (last_ptr->block_group)
4513                         hint_byte = last_ptr->window_start;
4514                 spin_unlock(&last_ptr->lock);
4515         }
4516
4517         search_start = max(search_start, first_logical_byte(root, 0));
4518         search_start = max(search_start, hint_byte);
4519
4520         if (!last_ptr)
4521                 empty_cluster = 0;
4522
4523         if (search_start == hint_byte) {
4524 ideal_cache:
4525                 block_group = btrfs_lookup_block_group(root->fs_info,
4526                                                        search_start);
4527                 /*
4528                  * we don't want to use the block group if it doesn't match our
4529                  * allocation bits, or if it's not cached.
4530                  *
4531                  * However, if we are re-searching with an ideal block group
4532                  * picked out, then we don't care that the block group is cached.
4533                  */
4534                 if (block_group && block_group_bits(block_group, data) &&
4535                     (block_group->cached != BTRFS_CACHE_NO ||
4536                      search_start == ideal_cache_offset)) {
4537                         down_read(&space_info->groups_sem);
4538                         if (list_empty(&block_group->list) ||
4539                             block_group->ro) {
4540                                 /*
4541                                  * someone is removing this block group,
4542                                  * we can't jump into the have_block_group
4543                                  * target because our list pointers are not
4544                                  * valid
4545                                  */
4546                                 btrfs_put_block_group(block_group);
4547                                 up_read(&space_info->groups_sem);
4548                         } else {
4549                                 index = get_block_group_index(block_group);
4550                                 goto have_block_group;
4551                         }
4552                 } else if (block_group) {
4553                         btrfs_put_block_group(block_group);
4554                 }
4555         }
4556 search:
4557         down_read(&space_info->groups_sem);
4558         list_for_each_entry(block_group, &space_info->block_groups[index],
4559                             list) {
4560                 u64 offset;
4561                 int cached;
4562
4563                 btrfs_get_block_group(block_group);
4564                 search_start = block_group->key.objectid;
4565
4566 have_block_group:
4567                 if (unlikely(block_group->cached == BTRFS_CACHE_NO)) {
4568                         u64 free_percent;
4569
4570                         free_percent = btrfs_block_group_used(&block_group->item);
4571                         free_percent *= 100;
4572                         free_percent = div64_u64(free_percent,
4573                                                  block_group->key.offset);
4574                         free_percent = 100 - free_percent;
4575                         if (free_percent > ideal_cache_percent &&
4576                             likely(!block_group->ro)) {
4577                                 ideal_cache_offset = block_group->key.objectid;
4578                                 ideal_cache_percent = free_percent;
4579                         }
4580
4581                         /*
4582                          * We only want to start kthread caching if we are at
4583                          * the point where we will wait for caching to make
4584                          * progress, or if our ideal search is over and we've
4585                          * found somebody to start caching.
4586                          */
4587                         if (loop > LOOP_CACHING_NOWAIT ||
4588                             (loop > LOOP_FIND_IDEAL &&
4589                              atomic_read(&space_info->caching_threads) < 2)) {
4590                                 ret = cache_block_group(block_group);
4591                                 BUG_ON(ret);
4592                         }
4593                         found_uncached_bg = true;
4594
4595                         /*
4596                          * If loop is set for cached only, try the next block
4597                          * group.
4598                          */
4599                         if (loop == LOOP_FIND_IDEAL)
4600                                 goto loop;
4601                 }
4602
4603                 cached = block_group_cache_done(block_group);
4604                 if (unlikely(!cached))
4605                         found_uncached_bg = true;
4606
4607                 if (unlikely(block_group->ro))
4608                         goto loop;
4609
4610                 /*
4611                  * Ok, we want to try and use the cluster allocator, so let's
4612                  * look there, unless we are on LOOP_NO_EMPTY_SIZE.  By that
4613                  * point we will have tried the cluster allocator plenty of
4614                  * times and found nothing, so we are likely way too
4615                  * fragmented for the clustering code to find anything; just
4616                  * skip it and let the allocator find whatever block it can
4617                  * find
4618                  */
4619                 if (last_ptr && loop < LOOP_NO_EMPTY_SIZE) {
4620                         /*
4621                          * the refill lock keeps out other
4622                          * people trying to start a new cluster
4623                          */
4624                         spin_lock(&last_ptr->refill_lock);
4625                         if (last_ptr->block_group &&
4626                             (last_ptr->block_group->ro ||
4627                             !block_group_bits(last_ptr->block_group, data))) {
4628                                 offset = 0;
4629                                 goto refill_cluster;
4630                         }
4631
4632                         offset = btrfs_alloc_from_cluster(block_group, last_ptr,
4633                                                  num_bytes, search_start);
4634                         if (offset) {
4635                                 /* we have a block, we're done */
4636                                 spin_unlock(&last_ptr->refill_lock);
4637                                 goto checks;
4638                         }
4639
4640                         spin_lock(&last_ptr->lock);
4641                         /*
4642                          * whoops, this cluster doesn't actually point to
4643                          * this block group.  Get a ref on the block
4644                          * group it does point to and try again
4645                          */
4646                         if (!last_ptr_loop && last_ptr->block_group &&
4647                             last_ptr->block_group != block_group) {
4648
4649                                 btrfs_put_block_group(block_group);
4650                                 block_group = last_ptr->block_group;
4651                                 btrfs_get_block_group(block_group);
4652                                 spin_unlock(&last_ptr->lock);
4653                                 spin_unlock(&last_ptr->refill_lock);
4654
4655                                 last_ptr_loop = 1;
4656                                 search_start = block_group->key.objectid;
4657                                 /*
4658                                  * we know this block group is properly
4659                                  * in the list because
4660                                  * btrfs_remove_block_group drops the
4661                                  * cluster before it removes the block
4662                                  * group from the list
4663                                  */
4664                                 goto have_block_group;
4665                         }
4666                         spin_unlock(&last_ptr->lock);
4667 refill_cluster:
4668                         /*
4669                          * this cluster didn't work out, free it and
4670                          * start over
4671                          */
4672                         btrfs_return_cluster_to_free_space(NULL, last_ptr);
4673
4674                         last_ptr_loop = 0;
4675
4676                         /* allocate a cluster in this block group */
4677                         ret = btrfs_find_space_cluster(trans, root,
4678                                                block_group, last_ptr,
4679                                                offset, num_bytes,
4680                                                empty_cluster + empty_size);
4681                         if (ret == 0) {
4682                                 /*
4683                                  * now pull our allocation out of this
4684                                  * cluster
4685                                  */
4686                                 offset = btrfs_alloc_from_cluster(block_group,
4687                                                   last_ptr, num_bytes,
4688                                                   search_start);
4689                                 if (offset) {
4690                                         /* we found one, proceed */
4691                                         spin_unlock(&last_ptr->refill_lock);
4692                                         goto checks;
4693                                 }
4694                         } else if (!cached && loop > LOOP_CACHING_NOWAIT
4695                                    && !failed_cluster_refill) {
4696                                 spin_unlock(&last_ptr->refill_lock);
4697
4698                                 failed_cluster_refill = true;
4699                                 wait_block_group_cache_progress(block_group,
4700                                        num_bytes + empty_cluster + empty_size);
4701                                 goto have_block_group;
4702                         }
4703
4704                         /*
4705                          * at this point we either didn't find a cluster
4706                          * or we weren't able to allocate a block from our
4707                          * cluster.  Free the cluster we've been trying
4708                          * to use, and go to the next block group
4709                          */
4710                         btrfs_return_cluster_to_free_space(NULL, last_ptr);
4711                         spin_unlock(&last_ptr->refill_lock);
4712                         goto loop;
4713                 }
4714
4715                 offset = btrfs_find_space_for_alloc(block_group, search_start,
4716                                                     num_bytes, empty_size);
4717                 /*
4718                  * If we didn't find a chunk, and we haven't failed on this
4719                  * block group before, and this block group is in the middle of
4720                  * caching and we are ok with waiting, then go ahead and wait
4721                  * for progress to be made, and set failed_alloc to true.
4722                  *
4723                  * If failed_alloc is true then we've already waited on this
4724                  * block group once and should move on to the next block group.
4725                  */
4726                 if (!offset && !failed_alloc && !cached &&
4727                     loop > LOOP_CACHING_NOWAIT) {
4728                         wait_block_group_cache_progress(block_group,
4729                                                 num_bytes + empty_size);
4730                         failed_alloc = true;
4731                         goto have_block_group;
4732                 } else if (!offset) {
4733                         goto loop;
4734                 }
4735 checks:
4736                 search_start = stripe_align(root, offset);
4737                 /* move on to the next group */
4738                 if (search_start + num_bytes >= search_end) {
4739                         btrfs_add_free_space(block_group, offset, num_bytes);
4740                         goto loop;
4741                 }
4742
4743                 /* move on to the next group */
4744                 if (search_start + num_bytes >
4745                     block_group->key.objectid + block_group->key.offset) {
4746                         btrfs_add_free_space(block_group, offset, num_bytes);
4747                         goto loop;
4748                 }
4749
4750                 ins->objectid = search_start;
4751                 ins->offset = num_bytes;
4752
4753                 if (offset < search_start)
4754                         btrfs_add_free_space(block_group, offset,
4755                                              search_start - offset);
4756                 BUG_ON(offset > search_start);
4757
4758                 ret = update_reserved_bytes(block_group, num_bytes, 1,
4759                                             (data & BTRFS_BLOCK_GROUP_DATA));
4760                 if (ret == -EAGAIN) {
4761                         btrfs_add_free_space(block_group, offset, num_bytes);
4762                         goto loop;
4763                 }
4764
4765                 /* we are all good, lets return */
4766                 ins->objectid = search_start;
4767                 ins->offset = num_bytes;
4768
4769                 if (offset < search_start)
4770                         btrfs_add_free_space(block_group, offset,
4771                                              search_start - offset);
4772                 BUG_ON(offset > search_start);
4773                 break;
4774 loop:
4775                 failed_cluster_refill = false;
4776                 failed_alloc = false;
4777                 BUG_ON(index != get_block_group_index(block_group));
4778                 btrfs_put_block_group(block_group);
4779         }
4780         up_read(&space_info->groups_sem);
4781
4782         if (!ins->objectid && ++index < BTRFS_NR_RAID_TYPES)
4783                 goto search;
4784
4785         /* LOOP_FIND_IDEAL, only search caching/cached bg's, and don't wait
4786          *                      for them to make caching progress.  Also
4787          *                      determine the best possible bg to cache
4788          * LOOP_CACHING_NOWAIT, search partially cached block groups, kicking
4789          *                      caching kthreads as we move along
4790          * LOOP_CACHING_WAIT, search everything, and wait if our bg is caching
4791          * LOOP_ALLOC_CHUNK, force a chunk allocation and try again
4792          * LOOP_NO_EMPTY_SIZE, set empty_size and empty_cluster to 0 and try
4793          *                      again
4794          */
4795         if (!ins->objectid && loop < LOOP_NO_EMPTY_SIZE &&
4796             (found_uncached_bg || empty_size || empty_cluster ||
4797              allowed_chunk_alloc)) {
4798                 index = 0;
4799                 if (loop == LOOP_FIND_IDEAL && found_uncached_bg) {
4800                         found_uncached_bg = false;
4801                         loop++;
4802                         if (!ideal_cache_percent &&
4803                             atomic_read(&space_info->caching_threads))
4804                                 goto search;
4805
4806                         /*
4807                          * One of the following two things has happened so far
4808                          *
4809                          * 1) We found an ideal block group for caching that
4810                          * is mostly full and will cache quickly, so we might
4811                          * as well wait for it.
4812                          *
4813                          * 2) We searched for cached only and we didn't find
4814                          * anything, and we didn't start any caching kthreads
4815                          * either, so chances are we will loop through and
4816                          * start a couple caching kthreads, and then come back
4817                          * around and just wait for them.  This will be slower
4818                          * because we will have 2 caching kthreads reading at
4819                          * the same time when we could have just started one
4820                          * and waited for it to get far enough to give us an
4821                          * allocation, so go ahead and go to the wait caching
4822                          * loop.
4823                          */
4824                         loop = LOOP_CACHING_WAIT;
4825                         search_start = ideal_cache_offset;
4826                         ideal_cache_percent = 0;
4827                         goto ideal_cache;
4828                 } else if (loop == LOOP_FIND_IDEAL) {
4829                         /*
4830                          * Didn't find an uncached bg, wait on anything we find
4831                          * next.
4832                          */
4833                         loop = LOOP_CACHING_WAIT;
4834                         goto search;
4835                 }
4836
4837                 if (loop < LOOP_CACHING_WAIT) {
4838                         loop++;
4839                         goto search;
4840                 }
4841
4842                 if (loop == LOOP_ALLOC_CHUNK) {
4843                         empty_size = 0;
4844                         empty_cluster = 0;
4845                 }
4846
4847                 if (allowed_chunk_alloc) {
4848                         ret = do_chunk_alloc(trans, root, num_bytes +
4849                                              2 * 1024 * 1024, data, 1);
4850                         allowed_chunk_alloc = 0;
4851                         done_chunk_alloc = 1;
4852                 } else if (!done_chunk_alloc) {
4853                         space_info->force_alloc = 1;
4854                 }
4855
4856                 if (loop < LOOP_NO_EMPTY_SIZE) {
4857                         loop++;
4858                         goto search;
4859                 }
4860                 ret = -ENOSPC;
4861         } else if (!ins->objectid) {
4862                 ret = -ENOSPC;
4863         }
4864
4865         /* we found what we needed */
4866         if (ins->objectid) {
4867                 if (!(data & BTRFS_BLOCK_GROUP_DATA))
4868                         trans->block_group = block_group->key.objectid;
4869
4870                 btrfs_put_block_group(block_group);
4871                 ret = 0;
4872         }
4873
4874         return ret;
4875 }
4876
4877 static void dump_space_info(struct btrfs_space_info *info, u64 bytes,
4878                             int dump_block_groups)
4879 {
4880         struct btrfs_block_group_cache *cache;
4881         int index = 0;
4882
4883         spin_lock(&info->lock);
4884         printk(KERN_INFO "space_info has %llu free, is %sfull\n",
4885                (unsigned long long)(info->total_bytes - info->bytes_used -
4886                                     info->bytes_pinned - info->bytes_reserved -
4887                                     info->bytes_super),
4888                (info->full) ? "" : "not ");
4889         printk(KERN_INFO "space_info total=%llu, pinned=%llu, delalloc=%llu,"
4890                " may_use=%llu, used=%llu, root=%llu, super=%llu, reserved=%llu"
4891                "\n",
4892                (unsigned long long)info->total_bytes,
4893                (unsigned long long)info->bytes_pinned,
4894                (unsigned long long)info->bytes_delalloc,
4895                (unsigned long long)info->bytes_may_use,
4896                (unsigned long long)info->bytes_used,
4897                (unsigned long long)info->bytes_root,
4898                (unsigned long long)info->bytes_super,
4899                (unsigned long long)info->bytes_reserved);
4900         spin_unlock(&info->lock);
4901
4902         if (!dump_block_groups)
4903                 return;
4904
4905         down_read(&info->groups_sem);
4906 again:
4907         list_for_each_entry(cache, &info->block_groups[index], list) {
4908                 spin_lock(&cache->lock);
4909                 printk(KERN_INFO "block group %llu has %llu bytes, %llu used "
4910                        "%llu pinned %llu reserved\n",
4911                        (unsigned long long)cache->key.objectid,
4912                        (unsigned long long)cache->key.offset,
4913                        (unsigned long long)btrfs_block_group_used(&cache->item),
4914                        (unsigned long long)cache->pinned,
4915                        (unsigned long long)cache->reserved);
4916                 btrfs_dump_free_space(cache, bytes);
4917                 spin_unlock(&cache->lock);
4918         }
4919         if (++index < BTRFS_NR_RAID_TYPES)
4920                 goto again;
4921         up_read(&info->groups_sem);
4922 }
4923
4924 int btrfs_reserve_extent(struct btrfs_trans_handle *trans,
4925                          struct btrfs_root *root,
4926                          u64 num_bytes, u64 min_alloc_size,
4927                          u64 empty_size, u64 hint_byte,
4928                          u64 search_end, struct btrfs_key *ins,
4929                          u64 data)
4930 {
4931         int ret;
4932         u64 search_start = 0;
4933
4934         data = btrfs_get_alloc_profile(root, data);
4935 again:
4936         /*
4937          * the only place that sets empty_size is btrfs_realloc_node, which
4938          * is not called recursively on allocations
4939          */
4940         if (empty_size || root->ref_cows)
4941                 ret = do_chunk_alloc(trans, root->fs_info->extent_root,
4942                                      num_bytes + 2 * 1024 * 1024, data, 0);
4943
4944         WARN_ON(num_bytes < root->sectorsize);
4945         ret = find_free_extent(trans, root, num_bytes, empty_size,
4946                                search_start, search_end, hint_byte,
4947                                ins, data);
4948
4949         if (ret == -ENOSPC && num_bytes > min_alloc_size) {
4950                 num_bytes = num_bytes >> 1;
4951                 num_bytes = num_bytes & ~(root->sectorsize - 1);
4952                 num_bytes = max(num_bytes, min_alloc_size);
4953                 do_chunk_alloc(trans, root->fs_info->extent_root,
4954                                num_bytes, data, 1);
4955                 goto again;
4956         }
4957         if (ret == -ENOSPC) {
4958                 struct btrfs_space_info *sinfo;
4959
4960                 sinfo = __find_space_info(root->fs_info, data);
4961                 printk(KERN_ERR "btrfs allocation failed flags %llu, "
4962                        "wanted %llu\n", (unsigned long long)data,
4963                        (unsigned long long)num_bytes);
4964                 dump_space_info(sinfo, num_bytes, 1);
4965         }
4966
4967         return ret;
4968 }
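/*
 * Illustrative sketch of a caller (hypothetical values, not part of the
 * original file): a data allocation asking for 16K, willing to fall back
 * as far as one sectorsize:
 *
 *      struct btrfs_key ins;
 *      int err;
 *
 *      err = btrfs_reserve_extent(trans, root, 16384, root->sectorsize,
 *                                 0, 0, (u64)-1, &ins, 1);
 *
 * On success, ins.objectid holds the start bytenr of the reservation and
 * ins.offset its length in bytes.
 */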
4969
4970 int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len)
4971 {
4972         struct btrfs_block_group_cache *cache;
4973         int ret = 0;
4974
4975         cache = btrfs_lookup_block_group(root->fs_info, start);
4976         if (!cache) {
4977                 printk(KERN_ERR "Unable to find block group for %llu\n",
4978                        (unsigned long long)start);
4979                 return -ENOSPC;
4980         }
4981
4982         ret = btrfs_discard_extent(root, start, len);
4983
4984         btrfs_add_free_space(cache, start, len);
4985         update_reserved_bytes(cache, len, 0, 1);
4986         btrfs_put_block_group(cache);
4987
4988         return ret;
4989 }
4990
4991 static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
4992                                       struct btrfs_root *root,
4993                                       u64 parent, u64 root_objectid,
4994                                       u64 flags, u64 owner, u64 offset,
4995                                       struct btrfs_key *ins, int ref_mod)
4996 {
4997         int ret;
4998         struct btrfs_fs_info *fs_info = root->fs_info;
4999         struct btrfs_extent_item *extent_item;
5000         struct btrfs_extent_inline_ref *iref;
5001         struct btrfs_path *path;
5002         struct extent_buffer *leaf;
5003         int type;
5004         u32 size;
5005
5006         if (parent > 0)
5007                 type = BTRFS_SHARED_DATA_REF_KEY;
5008         else
5009                 type = BTRFS_EXTENT_DATA_REF_KEY;
5010
5011         size = sizeof(*extent_item) + btrfs_extent_inline_ref_size(type);
5012
5013         path = btrfs_alloc_path();
5014         BUG_ON(!path);
5015
5016         path->leave_spinning = 1;
5017         ret = btrfs_insert_empty_item(trans, fs_info->extent_root, path,
5018                                       ins, size);
5019         BUG_ON(ret);
5020
5021         leaf = path->nodes[0];
5022         extent_item = btrfs_item_ptr(leaf, path->slots[0],
5023                                      struct btrfs_extent_item);
5024         btrfs_set_extent_refs(leaf, extent_item, ref_mod);
5025         btrfs_set_extent_generation(leaf, extent_item, trans->transid);
5026         btrfs_set_extent_flags(leaf, extent_item,
5027                                flags | BTRFS_EXTENT_FLAG_DATA);
5028
5029         iref = (struct btrfs_extent_inline_ref *)(extent_item + 1);
5030         btrfs_set_extent_inline_ref_type(leaf, iref, type);
5031         if (parent > 0) {
5032                 struct btrfs_shared_data_ref *ref;
5033                 ref = (struct btrfs_shared_data_ref *)(iref + 1);
5034                 btrfs_set_extent_inline_ref_offset(leaf, iref, parent);
5035                 btrfs_set_shared_data_ref_count(leaf, ref, ref_mod);
5036         } else {
5037                 struct btrfs_extent_data_ref *ref;
5038                 ref = (struct btrfs_extent_data_ref *)(&iref->offset);
5039                 btrfs_set_extent_data_ref_root(leaf, ref, root_objectid);
5040                 btrfs_set_extent_data_ref_objectid(leaf, ref, owner);
5041                 btrfs_set_extent_data_ref_offset(leaf, ref, offset);
5042                 btrfs_set_extent_data_ref_count(leaf, ref, ref_mod);
5043         }
5044
5045         btrfs_mark_buffer_dirty(path->nodes[0]);
5046         btrfs_free_path(path);
5047
5048         ret = update_block_group(trans, root, ins->objectid, ins->offset, 1);
5049         if (ret) {
5050                 printk(KERN_ERR "btrfs update block group failed for %llu "
5051                        "%llu\n", (unsigned long long)ins->objectid,
5052                        (unsigned long long)ins->offset);
5053                 BUG();
5054         }
5055         return ret;
5056 }
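/*
 * Rough sketch of the item laid out above for the single inline backref
 * case (illustrative, field widths not to scale):
 *
 *   +--------------------------+------+---------------------------+
 *   | struct btrfs_extent_item | ref  | shared_data_ref count or  |
 *   | refs, generation, flags  | type | extent_data_ref payload   |
 *   +--------------------------+------+---------------------------+
 */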
5057
5058 static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
5059                                      struct btrfs_root *root,
5060                                      u64 parent, u64 root_objectid,
5061                                      u64 flags, struct btrfs_disk_key *key,
5062                                      int level, struct btrfs_key *ins)
5063 {
5064         int ret;
5065         struct btrfs_fs_info *fs_info = root->fs_info;
5066         struct btrfs_extent_item *extent_item;
5067         struct btrfs_tree_block_info *block_info;
5068         struct btrfs_extent_inline_ref *iref;
5069         struct btrfs_path *path;
5070         struct extent_buffer *leaf;
5071         u32 size = sizeof(*extent_item) + sizeof(*block_info) + sizeof(*iref);
5072
5073         path = btrfs_alloc_path();
5074         BUG_ON(!path);
5075
5076         path->leave_spinning = 1;
5077         ret = btrfs_insert_empty_item(trans, fs_info->extent_root, path,
5078                                       ins, size);
5079         BUG_ON(ret);
5080
5081         leaf = path->nodes[0];
5082         extent_item = btrfs_item_ptr(leaf, path->slots[0],
5083                                      struct btrfs_extent_item);
5084         btrfs_set_extent_refs(leaf, extent_item, 1);
5085         btrfs_set_extent_generation(leaf, extent_item, trans->transid);
5086         btrfs_set_extent_flags(leaf, extent_item,
5087                                flags | BTRFS_EXTENT_FLAG_TREE_BLOCK);
5088         block_info = (struct btrfs_tree_block_info *)(extent_item + 1);
5089
5090         btrfs_set_tree_block_key(leaf, block_info, key);
5091         btrfs_set_tree_block_level(leaf, block_info, level);
5092
5093         iref = (struct btrfs_extent_inline_ref *)(block_info + 1);
5094         if (parent > 0) {
5095                 BUG_ON(!(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF));
5096                 btrfs_set_extent_inline_ref_type(leaf, iref,
5097                                                  BTRFS_SHARED_BLOCK_REF_KEY);
5098                 btrfs_set_extent_inline_ref_offset(leaf, iref, parent);
5099         } else {
5100                 btrfs_set_extent_inline_ref_type(leaf, iref,
5101                                                  BTRFS_TREE_BLOCK_REF_KEY);
5102                 btrfs_set_extent_inline_ref_offset(leaf, iref, root_objectid);
5103         }
5104
5105         btrfs_mark_buffer_dirty(leaf);
5106         btrfs_free_path(path);
5107
5108         ret = update_block_group(trans, root, ins->objectid, ins->offset, 1);
5109         if (ret) {
5110                 printk(KERN_ERR "btrfs update block group failed for %llu "
5111                        "%llu\n", (unsigned long long)ins->objectid,
5112                        (unsigned long long)ins->offset);
5113                 BUG();
5114         }
5115         return ret;
5116 }
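/*
 * The tree block layout above differs from the file extent case only in
 * the middle: a struct btrfs_tree_block_info (first key + level) sits
 * between the extent item and the inline backref:
 *
 *   | btrfs_extent_item | btrfs_tree_block_info | inline backref |
 */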
5117
5118 int btrfs_alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
5119                                      struct btrfs_root *root,
5120                                      u64 root_objectid, u64 owner,
5121                                      u64 offset, struct btrfs_key *ins)
5122 {
5123         int ret;
5124
5125         BUG_ON(root_objectid == BTRFS_TREE_LOG_OBJECTID);
5126
5127         ret = btrfs_add_delayed_data_ref(trans, ins->objectid, ins->offset,
5128                                          0, root_objectid, owner, offset,
5129                                          BTRFS_ADD_DELAYED_EXTENT, NULL);
5130         return ret;
5131 }
5132
5133 /*
5134  * this is used by the tree logging recovery code.  It records that
5135  * an extent has been allocated and makes sure to clear the free
5136  * space cache bits as well
5137  */
5138 int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans,
5139                                    struct btrfs_root *root,
5140                                    u64 root_objectid, u64 owner, u64 offset,
5141                                    struct btrfs_key *ins)
5142 {
5143         int ret;
5144         struct btrfs_block_group_cache *block_group;
5145         struct btrfs_caching_control *caching_ctl;
5146         u64 start = ins->objectid;
5147         u64 num_bytes = ins->offset;
5148
5149         block_group = btrfs_lookup_block_group(root->fs_info, ins->objectid);
5150         cache_block_group(block_group);
5151         caching_ctl = get_caching_control(block_group);
5152
5153         if (!caching_ctl) {
5154                 BUG_ON(!block_group_cache_done(block_group));
5155                 ret = btrfs_remove_free_space(block_group, start, num_bytes);
5156                 BUG_ON(ret);
5157         } else {
5158                 mutex_lock(&caching_ctl->mutex);
5159
5160                 if (start >= caching_ctl->progress) {
5161                         ret = add_excluded_extent(root, start, num_bytes);
5162                         BUG_ON(ret);
5163                 } else if (start + num_bytes <= caching_ctl->progress) {
5164                         ret = btrfs_remove_free_space(block_group,
5165                                                       start, num_bytes);
5166                         BUG_ON(ret);
5167                 } else {
5168                         num_bytes = caching_ctl->progress - start;
5169                         ret = btrfs_remove_free_space(block_group,
5170                                                       start, num_bytes);
5171                         BUG_ON(ret);
5172
5173                         start = caching_ctl->progress;
5174                         num_bytes = ins->objectid + ins->offset -
5175                                     caching_ctl->progress;
5176                         ret = add_excluded_extent(root, start, num_bytes);
5177                         BUG_ON(ret);
5178                 }
5179
5180                 mutex_unlock(&caching_ctl->mutex);
5181                 put_caching_control(caching_ctl);
5182         }
5183
5184         ret = update_reserved_bytes(block_group, ins->offset, 1, 1);
5185         BUG_ON(ret);
5186         btrfs_put_block_group(block_group);
5187         ret = alloc_reserved_file_extent(trans, root, 0, root_objectid,
5188                                          0, owner, offset, ins, 1);
5189         return ret;
5190 }
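/*
 * Worked example for the three caching_ctl->progress cases above, using
 * hypothetical byte offsets: if the caching thread has scanned up to
 * 1000 and the logged extent spans [900, 1100), then [900, 1000) is
 * already in the free space cache and gets removed from it, while
 * [1000, 1100) has not been scanned yet and is added as an excluded
 * extent so the caching thread will skip it.
 */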
5191
5192 struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans,
5193                                             struct btrfs_root *root,
5194                                             u64 bytenr, u32 blocksize,
5195                                             int level)
5196 {
5197         struct extent_buffer *buf;
5198
5199         buf = btrfs_find_create_tree_block(root, bytenr, blocksize);
5200         if (!buf)
5201                 return ERR_PTR(-ENOMEM);
5202         btrfs_set_header_generation(buf, trans->transid);
5203         btrfs_set_buffer_lockdep_class(buf, level);
5204         btrfs_tree_lock(buf);
5205         clean_tree_block(trans, root, buf);
5206
5207         btrfs_set_lock_blocking(buf);
5208         btrfs_set_buffer_uptodate(buf);
5209
5210         if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID) {
5211                 /*
5212                  * we allow two log transactions at a time, use different
5213                  * EXTENT bits to differentiate dirty pages.
5214                  */
5215                 if (root->log_transid % 2 == 0)
5216                         set_extent_dirty(&root->dirty_log_pages, buf->start,
5217                                         buf->start + buf->len - 1, GFP_NOFS);
5218                 else
5219                         set_extent_new(&root->dirty_log_pages, buf->start,
5220                                         buf->start + buf->len - 1, GFP_NOFS);
5221         } else {
5222                 set_extent_dirty(&trans->transaction->dirty_pages, buf->start,
5223                          buf->start + buf->len - 1, GFP_NOFS);
5224         }
5225         trans->blocks_used++;
5226         /* this returns a buffer locked for blocking */
5227         return buf;
5228 }
5229
5230 static struct btrfs_block_rsv *
5231 use_block_rsv(struct btrfs_trans_handle *trans,
5232               struct btrfs_root *root, u32 blocksize)
5233 {
5234         struct btrfs_block_rsv *block_rsv;
5235         int ret;
5236
5237         block_rsv = get_block_rsv(trans, root);
5238
5239         if (block_rsv->size == 0) {
5240                 ret = reserve_metadata_bytes(block_rsv, blocksize);
5241                 if (ret)
5242                         return ERR_PTR(ret);
5243                 return block_rsv;
5244         }
5245
5246         ret = block_rsv_use_bytes(block_rsv, blocksize);
5247         if (!ret)
5248                 return block_rsv;
5249
5250         WARN_ON(1);
5251         printk(KERN_INFO"block_rsv size %llu reserved %llu freed %llu %llu\n",
5252                 block_rsv->size, block_rsv->reserved,
5253                 block_rsv->freed[0], block_rsv->freed[1]);
5254
5255         return ERR_PTR(-ENOSPC);
5256 }
5257
5258 static void unuse_block_rsv(struct btrfs_block_rsv *block_rsv, u32 blocksize)
5259 {
5260         block_rsv_add_bytes(block_rsv, blocksize, 0);
5261         block_rsv_release_bytes(block_rsv, NULL, 0);
5262 }
5263
5264 /*
5265  * finds a free extent and does all the dirty work required for allocation.
5266  * It returns the key for the extent through ins, and a tree buffer for
5267  * the first block of the extent.
5268  *
5269  * returns the tree buffer or an error pointer on failure.
5270  */
5271 struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
5272                                         struct btrfs_root *root, u32 blocksize,
5273                                         u64 parent, u64 root_objectid,
5274                                         struct btrfs_disk_key *key, int level,
5275                                         u64 hint, u64 empty_size)
5276 {
5277         struct btrfs_key ins;
5278         struct btrfs_block_rsv *block_rsv;
5279         struct extent_buffer *buf;
5280         u64 flags = 0;
5281         int ret;
5282
5283
5284         block_rsv = use_block_rsv(trans, root, blocksize);
5285         if (IS_ERR(block_rsv))
5286                 return ERR_CAST(block_rsv);
5287
5288         ret = btrfs_reserve_extent(trans, root, blocksize, blocksize,
5289                                    empty_size, hint, (u64)-1, &ins, 0);
5290         if (ret) {
5291                 unuse_block_rsv(block_rsv, blocksize);
5292                 return ERR_PTR(ret);
5293         }
5294
5295         buf = btrfs_init_new_buffer(trans, root, ins.objectid,
5296                                     blocksize, level);
5297         BUG_ON(IS_ERR(buf));
5298
5299         if (root_objectid == BTRFS_TREE_RELOC_OBJECTID) {
5300                 if (parent == 0)
5301                         parent = ins.objectid;
5302                 flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF;
5303         } else
5304                 BUG_ON(parent > 0);
5305
5306         if (root_objectid != BTRFS_TREE_LOG_OBJECTID) {
5307                 struct btrfs_delayed_extent_op *extent_op;
5308                 extent_op = kmalloc(sizeof(*extent_op), GFP_NOFS);
5309                 BUG_ON(!extent_op);
5310                 if (key)
5311                         memcpy(&extent_op->key, key, sizeof(extent_op->key));
5312                 else
5313                         memset(&extent_op->key, 0, sizeof(extent_op->key));
5314                 extent_op->flags_to_set = flags;
5315                 extent_op->update_key = 1;
5316                 extent_op->update_flags = 1;
5317                 extent_op->is_data = 0;
5318
5319                 ret = btrfs_add_delayed_tree_ref(trans, ins.objectid,
5320                                         ins.offset, parent, root_objectid,
5321                                         level, BTRFS_ADD_DELAYED_EXTENT,
5322                                         extent_op);
5323                 BUG_ON(ret);
5324         }
5325         return buf;
5326 }
5327
5328 struct walk_control {
5329         u64 refs[BTRFS_MAX_LEVEL];
5330         u64 flags[BTRFS_MAX_LEVEL];
5331         struct btrfs_key update_progress;
5332         int stage;
5333         int level;
5334         int shared_level;
5335         int update_ref;
5336         int keep_locks;
5337         int reada_slot;
5338         int reada_count;
5339 };
5340
5341 #define DROP_REFERENCE  1
5342 #define UPDATE_BACKREF  2
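/*
 * Walk stages (explanatory note): DROP_REFERENCE is the normal stage,
 * dropping this root's references while walking down the tree.
 * UPDATE_BACKREF is used when update_ref is set and blocks shared with
 * other trees are found; walk_down_proc() below then updates the back
 * refs for pointers in those blocks before the drop continues.
 */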
5343
5344 static noinline void reada_walk_down(struct btrfs_trans_handle *trans,
5345                                      struct btrfs_root *root,
5346                                      struct walk_control *wc,
5347                                      struct btrfs_path *path)
5348 {
5349         u64 bytenr;
5350         u64 generation;
5351         u64 refs;
5352         u64 flags;
5353         u64 last = 0;
5354         u32 nritems;
5355         u32 blocksize;
5356         struct btrfs_key key;
5357         struct extent_buffer *eb;
5358         int ret;
5359         int slot;
5360         int nread = 0;
5361
5362         if (path->slots[wc->level] < wc->reada_slot) {
5363                 wc->reada_count = wc->reada_count * 2 / 3;
5364                 wc->reada_count = max(wc->reada_count, 2);
5365         } else {
5366                 wc->reada_count = wc->reada_count * 3 / 2;
5367                 wc->reada_count = min_t(int, wc->reada_count,
5368                                         BTRFS_NODEPTRS_PER_BLOCK(root));
5369         }
5370
5371         eb = path->nodes[wc->level];
5372         nritems = btrfs_header_nritems(eb);
5373         blocksize = btrfs_level_size(root, wc->level - 1);
5374
5375         for (slot = path->slots[wc->level]; slot < nritems; slot++) {
5376                 if (nread >= wc->reada_count)
5377                         break;
5378
5379                 cond_resched();
5380                 bytenr = btrfs_node_blockptr(eb, slot);
5381                 generation = btrfs_node_ptr_generation(eb, slot);
5382
5383                 if (slot == path->slots[wc->level])
5384                         goto reada;
5385
5386                 if (wc->stage == UPDATE_BACKREF &&
5387                     generation <= root->root_key.offset)
5388                         continue;
5389
5390                 /* We don't lock the tree block, it's OK to be racy here */
5391                 ret = btrfs_lookup_extent_info(trans, root, bytenr, blocksize,
5392                                                &refs, &flags);
5393                 BUG_ON(ret);
5394                 BUG_ON(refs == 0);
5395
5396                 if (wc->stage == DROP_REFERENCE) {
5397                         if (refs == 1)
5398                                 goto reada;
5399
5400                         if (wc->level == 1 &&
5401                             (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF))
5402                                 continue;
5403                         if (!wc->update_ref ||
5404                             generation <= root->root_key.offset)
5405                                 continue;
5406                         btrfs_node_key_to_cpu(eb, &key, slot);
5407                         ret = btrfs_comp_cpu_keys(&key,
5408                                                   &wc->update_progress);
5409                         if (ret < 0)
5410                                 continue;
5411                 } else {
5412                         if (wc->level == 1 &&
5413                             (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF))
5414                                 continue;
5415                 }
5416 reada:
5417                 ret = readahead_tree_block(root, bytenr, blocksize,
5418                                            generation);
5419                 if (ret)
5420                         break;
5421                 last = bytenr + blocksize;
5422                 nread++;
5423         }
5424         wc->reada_slot = slot;
5425 }
5426
5427 /*
5428  * helper to process a tree block while walking down the tree.
5429  *
5430  * when wc->stage == UPDATE_BACKREF, this function updates
5431  * back refs for pointers in the block.
5432  *
5433  * NOTE: return value 1 means we should stop walking down.
5434  */
5435 static noinline int walk_down_proc(struct btrfs_trans_handle *trans,
5436                                    struct btrfs_root *root,
5437                                    struct btrfs_path *path,
5438                                    struct walk_control *wc, int lookup_info)
5439 {
5440         int level = wc->level;
5441         struct extent_buffer *eb = path->nodes[level];
5442         u64 flag = BTRFS_BLOCK_FLAG_FULL_BACKREF;
5443         int ret;
5444
5445         if (wc->stage == UPDATE_BACKREF &&
5446             btrfs_header_owner(eb) != root->root_key.objectid)
5447                 return 1;
5448
5449         /*
5450          * when the reference count of a tree block is 1, it won't increase
5451          * again. once the full backref flag is set, we never clear it.
5452          */
5453         if (lookup_info &&
5454             ((wc->stage == DROP_REFERENCE && wc->refs[level] != 1) ||
5455              (wc->stage == UPDATE_BACKREF && !(wc->flags[level] & flag)))) {
5456                 BUG_ON(!path->locks[level]);
5457                 ret = btrfs_lookup_extent_info(trans, root,
5458                                                eb->start, eb->len,
5459                                                &wc->refs[level],
5460                                                &wc->flags[level]);
5461                 BUG_ON(ret);
5462                 BUG_ON(wc->refs[level] == 0);
5463         }
5464
5465         if (wc->stage == DROP_REFERENCE) {
5466                 if (wc->refs[level] > 1)
5467                         return 1;
5468
5469                 if (path->locks[level] && !wc->keep_locks) {
5470                         btrfs_tree_unlock(eb);
5471                         path->locks[level] = 0;
5472                 }
5473                 return 0;
5474         }
5475
5476         /* wc->stage == UPDATE_BACKREF */
5477         if (!(wc->flags[level] & flag)) {
5478                 BUG_ON(!path->locks[level]);
5479                 ret = btrfs_inc_ref(trans, root, eb, 1);
5480                 BUG_ON(ret);
5481                 ret = btrfs_dec_ref(trans, root, eb, 0);
5482                 BUG_ON(ret);
5483                 ret = btrfs_set_disk_extent_flags(trans, root, eb->start,
5484                                                   eb->len, flag, 0);
5485                 BUG_ON(ret);
5486                 wc->flags[level] |= flag;
5487         }
5488
5489         /*
5490          * the block is shared by multiple trees, so it's not good to
5491          * keep the tree lock
5492          */
5493         if (path->locks[level] && level > 0) {
5494                 btrfs_tree_unlock(eb);
5495                 path->locks[level] = 0;
5496         }
5497         return 0;
5498 }
5499
5500 /*
5501  * helper to process a tree block pointer.
5502  *
5503  * when wc->stage == DROP_REFERENCE, this function checks the
5504  * reference count of the block pointed to. if the block
5505  * is shared and we need to update back refs for the subtree
5506  * rooted at the block, this function changes wc->stage to
5507  * UPDATE_BACKREF. if the block is shared and there is no
5508  * need to update back refs, this function drops the reference
5509  * to the block.
5510  *
5511  * NOTE: return value 1 means we should stop walking down.
5512  */
5513 static noinline int do_walk_down(struct btrfs_trans_handle *trans,
5514                                  struct btrfs_root *root,
5515                                  struct btrfs_path *path,
5516                                  struct walk_control *wc, int *lookup_info)
5517 {
5518         u64 bytenr;
5519         u64 generation;
5520         u64 parent;
5521         u32 blocksize;
5522         struct btrfs_key key;
5523         struct extent_buffer *next;
5524         int level = wc->level;
5525         int reada = 0;
5526         int ret = 0;
5527
5528         generation = btrfs_node_ptr_generation(path->nodes[level],
5529                                                path->slots[level]);
5530         /*
5531          * if the lower level block was created before the snapshot
5532          * was created, we know there is no need to update back refs
5533          * for the subtree
5534          */
5535         if (wc->stage == UPDATE_BACKREF &&
5536             generation <= root->root_key.offset) {
5537                 *lookup_info = 1;
5538                 return 1;
5539         }
5540
5541         bytenr = btrfs_node_blockptr(path->nodes[level], path->slots[level]);
5542         blocksize = btrfs_level_size(root, level - 1);
5543
5544         next = btrfs_find_tree_block(root, bytenr, blocksize);
5545         if (!next) {
5546                 next = btrfs_find_create_tree_block(root, bytenr, blocksize);
5547                 if (!next)
5548                         return -ENOMEM;
5549                 reada = 1;
5550         }
5551         btrfs_tree_lock(next);
5552         btrfs_set_lock_blocking(next);
5553
5554         ret = btrfs_lookup_extent_info(trans, root, bytenr, blocksize,
5555                                        &wc->refs[level - 1],
5556                                        &wc->flags[level - 1]);
5557         BUG_ON(ret);
5558         BUG_ON(wc->refs[level - 1] == 0);
5559         *lookup_info = 0;
5560
5561         if (wc->stage == DROP_REFERENCE) {
5562                 if (wc->refs[level - 1] > 1) {
5563                         if (level == 1 &&
5564                             (wc->flags[0] & BTRFS_BLOCK_FLAG_FULL_BACKREF))
5565                                 goto skip;
5566
5567                         if (!wc->update_ref ||
5568                             generation <= root->root_key.offset)
5569                                 goto skip;
5570
5571                         btrfs_node_key_to_cpu(path->nodes[level], &key,
5572                                               path->slots[level]);
5573                         ret = btrfs_comp_cpu_keys(&key, &wc->update_progress);
5574                         if (ret < 0)
5575                                 goto skip;
5576
5577                         wc->stage = UPDATE_BACKREF;
5578                         wc->shared_level = level - 1;
5579                 }
5580         } else {
5581                 if (level == 1 &&
5582                     (wc->flags[0] & BTRFS_BLOCK_FLAG_FULL_BACKREF))
5583                         goto skip;
5584         }
5585
5586         if (!btrfs_buffer_uptodate(next, generation)) {
5587                 btrfs_tree_unlock(next);
5588                 free_extent_buffer(next);
5589                 next = NULL;
5590                 *lookup_info = 1;
5591         }
5592
5593         if (!next) {
5594                 if (reada && level == 1)
5595                         reada_walk_down(trans, root, wc, path);
5596                 next = read_tree_block(root, bytenr, blocksize, generation);
5597                 btrfs_tree_lock(next);
5598                 btrfs_set_lock_blocking(next);
5599         }
5600
5601         level--;
5602         BUG_ON(level != btrfs_header_level(next));
5603         path->nodes[level] = next;
5604         path->slots[level] = 0;
5605         path->locks[level] = 1;
5606         wc->level = level;
5607         if (wc->level == 1)
5608                 wc->reada_slot = 0;
5609         return 0;
5610 skip:
5611         wc->refs[level - 1] = 0;
5612         wc->flags[level - 1] = 0;
5613         if (wc->stage == DROP_REFERENCE) {
5614                 if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF) {
5615                         parent = path->nodes[level]->start;
5616                 } else {
5617                         BUG_ON(root->root_key.objectid !=
5618                                btrfs_header_owner(path->nodes[level]));
5619                         parent = 0;
5620                 }
5621
5622                 ret = btrfs_free_extent(trans, root, bytenr, blocksize, parent,
5623                                         root->root_key.objectid, level - 1, 0);
5624                 BUG_ON(ret);
5625         }
5626         btrfs_tree_unlock(next);
5627         free_extent_buffer(next);
5628         *lookup_info = 1;
5629         return 1;
5630 }
5631
5632 /*
5633  * helper to process a tree block while walking up the tree.
5634  *
5635  * when wc->stage == DROP_REFERENCE, this function drops
5636  * reference count on the block.
5637  *
5638  * when wc->stage == UPDATE_BACKREF, this function changes
5639  * wc->stage back to DROP_REFERENCE if we changed wc->stage
5640  * to UPDATE_BACKREF previously while processing the block.
5641  *
5642  * NOTE: return value 1 means we should stop walking up.
5643  */
5644 static noinline int walk_up_proc(struct btrfs_trans_handle *trans,
5645                                  struct btrfs_root *root,
5646                                  struct btrfs_path *path,
5647                                  struct walk_control *wc)
5648 {
5649         int ret;
5650         int level = wc->level;
5651         struct extent_buffer *eb = path->nodes[level];
5652         u64 parent = 0;
5653
5654         if (wc->stage == UPDATE_BACKREF) {
5655                 BUG_ON(wc->shared_level < level);
5656                 if (level < wc->shared_level)
5657                         goto out;
5658
5659                 ret = find_next_key(path, level + 1, &wc->update_progress);
5660                 if (ret > 0)
5661                         wc->update_ref = 0;
5662
5663                 wc->stage = DROP_REFERENCE;
5664                 wc->shared_level = -1;
5665                 path->slots[level] = 0;
5666
5667                 /*
5668                  * check reference count again if the block isn't locked.
5669                  * we should start walking down the tree again if reference
5670                  * count is one.
5671                  */
5672                 if (!path->locks[level]) {
5673                         BUG_ON(level == 0);
5674                         btrfs_tree_lock(eb);
5675                         btrfs_set_lock_blocking(eb);
5676                         path->locks[level] = 1;
5677
5678                         ret = btrfs_lookup_extent_info(trans, root,
5679                                                        eb->start, eb->len,
5680                                                        &wc->refs[level],
5681                                                        &wc->flags[level]);
5682                         BUG_ON(ret);
5683                         BUG_ON(wc->refs[level] == 0);
5684                         if (wc->refs[level] == 1) {
5685                                 btrfs_tree_unlock(eb);
5686                                 path->locks[level] = 0;
5687                                 return 1;
5688                         }
5689                 }
5690         }
5691
5692         /* wc->stage == DROP_REFERENCE */
5693         BUG_ON(wc->refs[level] > 1 && !path->locks[level]);
5694
5695         if (wc->refs[level] == 1) {
5696                 if (level == 0) {
5697                         if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF)
5698                                 ret = btrfs_dec_ref(trans, root, eb, 1);
5699                         else
5700                                 ret = btrfs_dec_ref(trans, root, eb, 0);
5701                         BUG_ON(ret);
5702                 }
5703                 /* make the block-locked assertion in clean_tree_block happy */
5704                 if (!path->locks[level] &&
5705                     btrfs_header_generation(eb) == trans->transid) {
5706                         btrfs_tree_lock(eb);
5707                         btrfs_set_lock_blocking(eb);
5708                         path->locks[level] = 1;
5709                 }
5710                 clean_tree_block(trans, root, eb);
5711         }
5712
5713         if (eb == root->node) {
5714                 if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF)
5715                         parent = eb->start;
5716                 else
5717                         BUG_ON(root->root_key.objectid !=
5718                                btrfs_header_owner(eb));
5719         } else {
5720                 if (wc->flags[level + 1] & BTRFS_BLOCK_FLAG_FULL_BACKREF)
5721                         parent = path->nodes[level + 1]->start;
5722                 else
5723                         BUG_ON(root->root_key.objectid !=
5724                                btrfs_header_owner(path->nodes[level + 1]));
5725         }
5726
5727         btrfs_free_tree_block(trans, root, eb, parent, wc->refs[level] == 1);
5728 out:
5729         wc->refs[level] = 0;
5730         wc->flags[level] = 0;
5731         return 0;
5732 }
5733
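/*
 * walk down from wc->level towards the leaves, processing each block
 * with walk_down_proc and each child pointer with do_walk_down.  the
 * descent stops at level 0 or when walk_down_proc decides the current
 * block must be kept; child pointers that do_walk_down refuses are
 * simply skipped.
 */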
5734 static noinline int walk_down_tree(struct btrfs_trans_handle *trans,
5735                                    struct btrfs_root *root,
5736                                    struct btrfs_path *path,
5737                                    struct walk_control *wc)
5738 {
5739         int level = wc->level;
5740         int lookup_info = 1;
5741         int ret;
5742
5743         while (level >= 0) {
5744                 ret = walk_down_proc(trans, root, path, wc, lookup_info);
5745                 if (ret > 0)
5746                         break;
5747
5748                 if (level == 0)
5749                         break;
5750
5751                 if (path->slots[level] >=
5752                     btrfs_header_nritems(path->nodes[level]))
5753                         break;
5754
5755                 ret = do_walk_down(trans, root, path, wc, &lookup_info);
5756                 if (ret > 0) {
5757                         path->slots[level]++;
5758                         continue;
5759                 } else if (ret < 0)
5760                         return ret;
5761                 level = wc->level;
5762         }
5763         return 0;
5764 }
5765
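/*
 * walk back up the tree, calling walk_up_proc to drop each block once
 * its last slot has been processed.  returns 0 when there are more
 * slots to walk down into, 1 once the walk up to max_level is done.
 */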
5766 static noinline int walk_up_tree(struct btrfs_trans_handle *trans,
5767                                  struct btrfs_root *root,
5768                                  struct btrfs_path *path,
5769                                  struct walk_control *wc, int max_level)
5770 {
5771         int level = wc->level;
5772         int ret;
5773
5774         path->slots[level] = btrfs_header_nritems(path->nodes[level]);
5775         while (level < max_level && path->nodes[level]) {
5776                 wc->level = level;
5777                 if (path->slots[level] + 1 <
5778                     btrfs_header_nritems(path->nodes[level])) {
5779                         path->slots[level]++;
5780                         return 0;
5781                 } else {
5782                         ret = walk_up_proc(trans, root, path, wc);
5783                         if (ret > 0)
5784                                 return 0;
5785
5786                         if (path->locks[level]) {
5787                                 btrfs_tree_unlock(path->nodes[level]);
5788                                 path->locks[level] = 0;
5789                         }
5790                         free_extent_buffer(path->nodes[level]);
5791                         path->nodes[level] = NULL;
5792                         level++;
5793                 }
5794         }
5795         return 1;
5796 }
5797
5798 /*
5799  * drop a subvolume tree.
5800  *
5801  * this function traverses the tree freeing any blocks that are only
5802  * referenced by the tree.
5803  *
5804  * when a shared tree block is found, this function decreases its
5805  * reference count by one. if update_ref is true, this function
5806  * also makes sure backrefs for the shared block and all lower level
5807  * blocks are properly updated.
5808  */
5809 int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref)
5810 {
5811         struct btrfs_path *path;
5812         struct btrfs_trans_handle *trans;
5813         struct btrfs_root *tree_root = root->fs_info->tree_root;
5814         struct btrfs_root_item *root_item = &root->root_item;
5815         struct walk_control *wc;
5816         struct btrfs_key key;
5817         int err = 0;
5818         int ret;
5819         int level;
5820
5821         path = btrfs_alloc_path();
5822         BUG_ON(!path);
5823
5824         wc = kzalloc(sizeof(*wc), GFP_NOFS);
5825         BUG_ON(!wc);
5826
5827         trans = btrfs_start_transaction(tree_root, 1);
5828
5829         if (btrfs_disk_key_objectid(&root_item->drop_progress) == 0) {
5830                 level = btrfs_header_level(root->node);
5831                 path->nodes[level] = btrfs_lock_root_node(root);
5832                 btrfs_set_lock_blocking(path->nodes[level]);
5833                 path->slots[level] = 0;
5834                 path->locks[level] = 1;
5835                 memset(&wc->update_progress, 0,
5836                        sizeof(wc->update_progress));
5837         } else {
5838                 btrfs_disk_key_to_cpu(&key, &root_item->drop_progress);
5839                 memcpy(&wc->update_progress, &key,
5840                        sizeof(wc->update_progress));
5841
5842                 level = root_item->drop_level;
5843                 BUG_ON(level == 0);
5844                 path->lowest_level = level;
5845                 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
5846                 path->lowest_level = 0;
5847                 if (ret < 0) {
5848                         err = ret;
5849                         goto out;
5850                 }
5851                 WARN_ON(ret > 0);
5852
5853                 /*
5854                  * unlock our path; this is safe because only this
5855                  * function is allowed to delete this snapshot
5856                  */
5857                 btrfs_unlock_up_safe(path, 0);
5858
5859                 level = btrfs_header_level(root->node);
5860                 while (1) {
5861                         btrfs_tree_lock(path->nodes[level]);
5862                         btrfs_set_lock_blocking(path->nodes[level]);
5863
5864                         ret = btrfs_lookup_extent_info(trans, root,
5865                                                 path->nodes[level]->start,
5866                                                 path->nodes[level]->len,
5867                                                 &wc->refs[level],
5868                                                 &wc->flags[level]);
5869                         BUG_ON(ret);
5870                         BUG_ON(wc->refs[level] == 0);
5871
5872                         if (level == root_item->drop_level)
5873                                 break;
5874
5875                         btrfs_tree_unlock(path->nodes[level]);
5876                         WARN_ON(wc->refs[level] != 1);
5877                         level--;
5878                 }
5879         }
5880
5881         wc->level = level;
5882         wc->shared_level = -1;
5883         wc->stage = DROP_REFERENCE;
5884         wc->update_ref = update_ref;
5885         wc->keep_locks = 0;
5886         wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(root);
5887
5888         while (1) {
5889                 ret = walk_down_tree(trans, root, path, wc);
5890                 if (ret < 0) {
5891                         err = ret;
5892                         break;
5893                 }
5894
5895                 ret = walk_up_tree(trans, root, path, wc, BTRFS_MAX_LEVEL);
5896                 if (ret < 0) {
5897                         err = ret;
5898                         break;
5899                 }
5900
5901                 if (ret > 0) {
5902                         BUG_ON(wc->stage != DROP_REFERENCE);
5903                         break;
5904                 }
5905
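                /*
                 * record how far the drop has progressed in the root item,
                 * so a drop interrupted by a transaction commit can be
                 * resumed from drop_progress later.
                 */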
5906                 if (wc->stage == DROP_REFERENCE) {
5907                         level = wc->level;
5908                         btrfs_node_key(path->nodes[level],
5909                                        &root_item->drop_progress,
5910                                        path->slots[level]);
5911                         root_item->drop_level = level;
5912                 }
5913
5914                 BUG_ON(wc->level == 0);
5915                 if (trans->transaction->in_commit ||
5916                     trans->transaction->delayed_refs.flushing) {
5917                         ret = btrfs_update_root(trans, tree_root,
5918                                                 &root->root_key,
5919                                                 root_item);
5920                         BUG_ON(ret);
5921
5922                         btrfs_end_transaction(trans, tree_root);
5923                         trans = btrfs_start_transaction(tree_root, 1);
5924                 } else {
5925                         unsigned long update;
5926                         update = trans->delayed_ref_updates;
5927                         trans->delayed_ref_updates = 0;
5928                         if (update)
5929                                 btrfs_run_delayed_refs(trans, tree_root,
5930                                                        update);
5931                 }
5932         }
5933         btrfs_release_path(root, path);
5934         BUG_ON(err);
5935
5936         ret = btrfs_del_root(trans, tree_root, &root->root_key);
5937         BUG_ON(ret);
5938
5939         if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) {
5940                 ret = btrfs_find_last_root(tree_root, root->root_key.objectid,
5941                                            NULL, NULL);
5942                 BUG_ON(ret < 0);
5943                 if (ret > 0) {
5944                         ret = btrfs_del_orphan_item(trans, tree_root,
5945                                                     root->root_key.objectid);
5946                         BUG_ON(ret);
5947                 }
5948         }
5949
5950         if (root->in_radix) {
5951                 btrfs_free_fs_root(tree_root->fs_info, root);
5952         } else {
5953                 free_extent_buffer(root->node);
5954                 free_extent_buffer(root->commit_root);
5955                 kfree(root);
5956         }
5957 out:
5958         btrfs_end_transaction(trans, tree_root);
5959         kfree(wc);
5960         btrfs_free_path(path);
5961         return err;
5962 }
5963
5964 /*
5965  * drop subtree rooted at tree block 'node'.
5966  *
5967  * NOTE: this function will unlock and release tree block 'node'
5968  */
5969 int btrfs_drop_subtree(struct btrfs_trans_handle *trans,
5970                         struct btrfs_root *root,
5971                         struct extent_buffer *node,
5972                         struct extent_buffer *parent)
5973 {
5974         struct btrfs_path *path;
5975         struct walk_control *wc;
5976         int level;
5977         int parent_level;
5978         int ret = 0;
5979         int wret;
5980
5981         BUG_ON(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID);
5982
5983         path = btrfs_alloc_path();
5984         BUG_ON(!path);
5985
5986         wc = kzalloc(sizeof(*wc), GFP_NOFS);
5987         BUG_ON(!wc);
5988
5989         btrfs_assert_tree_locked(parent);
5990         parent_level = btrfs_header_level(parent);
5991         extent_buffer_get(parent);
5992         path->nodes[parent_level] = parent;
5993         path->slots[parent_level] = btrfs_header_nritems(parent);
5994
5995         btrfs_assert_tree_locked(node);
5996         level = btrfs_header_level(node);
5997         path->nodes[level] = node;
5998         path->slots[level] = 0;
5999         path->locks[level] = 1;
6000
6001         wc->refs[parent_level] = 1;
6002         wc->flags[parent_level] = BTRFS_BLOCK_FLAG_FULL_BACKREF;
6003         wc->level = level;
6004         wc->shared_level = -1;
6005         wc->stage = DROP_REFERENCE;
6006         wc->update_ref = 0;
6007         wc->keep_locks = 1;
6008         wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(root);
6009
6010         while (1) {
6011                 wret = walk_down_tree(trans, root, path, wc);
6012                 if (wret < 0) {
6013                         ret = wret;
6014                         break;
6015                 }
6016
6017                 wret = walk_up_tree(trans, root, path, wc, parent_level);
6018                 if (wret < 0)
6019                         ret = wret;
6020                 if (wret != 0)
6021                         break;
6022         }
6023
6024         kfree(wc);
6025         btrfs_free_path(path);
6026         return ret;
6027 }
6028
6029 #if 0
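/*
 * dead code: the helpers below belong to the old extent relocation
 * implementation and are compiled out by the #if 0 above.
 */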
6030 static unsigned long calc_ra(unsigned long start, unsigned long last,
6031                              unsigned long nr)
6032 {
6033         return min(last, start + nr - 1);
6034 }
6035
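/*
 * pull the given range of the relocation inode into the page cache
 * and mark the pages dirty, so ordinary writeback rewrites the data
 * at its new location.
 */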
6036 static noinline int relocate_inode_pages(struct inode *inode, u64 start,
6037                                          u64 len)
6038 {
6039         u64 page_start;
6040         u64 page_end;
6041         unsigned long first_index;
6042         unsigned long last_index;
6043         unsigned long i;
6044         struct page *page;
6045         struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
6046         struct file_ra_state *ra;
6047         struct btrfs_ordered_extent *ordered;
6048         unsigned int total_read = 0;
6049         unsigned int total_dirty = 0;
6050         int ret = 0;
6051
6052         ra = kzalloc(sizeof(*ra), GFP_NOFS);
6053
6054         mutex_lock(&inode->i_mutex);
6055         first_index = start >> PAGE_CACHE_SHIFT;
6056         last_index = (start + len - 1) >> PAGE_CACHE_SHIFT;
6057
6058         /* make sure the dirty trick played by the caller works */
6059         ret = invalidate_inode_pages2_range(inode->i_mapping,
6060                                             first_index, last_index);
6061         if (ret)
6062                 goto out_unlock;
6063
6064         file_ra_state_init(ra, inode->i_mapping);
6065
6066         for (i = first_index ; i <= last_index; i++) {
6067                 if (total_read % ra->ra_pages == 0) {
6068                         btrfs_force_ra(inode->i_mapping, ra, NULL, i,
6069                                        calc_ra(i, last_index, ra->ra_pages));
6070                 }
6071                 total_read++;
6072 again:
6073                 if (((u64)i << PAGE_CACHE_SHIFT) > i_size_read(inode))
6074                         BUG_ON(1);
6075                 page = grab_cache_page(inode->i_mapping, i);
6076                 if (!page) {
6077                         ret = -ENOMEM;
6078                         goto out_unlock;
6079                 }
6080                 if (!PageUptodate(page)) {
6081                         btrfs_readpage(NULL, page);
6082                         lock_page(page);
6083                         if (!PageUptodate(page)) {
6084                                 unlock_page(page);
6085                                 page_cache_release(page);
6086                                 ret = -EIO;
6087                                 goto out_unlock;
6088                         }
6089                 }
6090                 wait_on_page_writeback(page);
6091
6092                 page_start = (u64)page->index << PAGE_CACHE_SHIFT;
6093                 page_end = page_start + PAGE_CACHE_SIZE - 1;
6094                 lock_extent(io_tree, page_start, page_end, GFP_NOFS);
6095
6096                 ordered = btrfs_lookup_ordered_extent(inode, page_start);
6097                 if (ordered) {
6098                         unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
6099                         unlock_page(page);
6100                         page_cache_release(page);
6101                         btrfs_start_ordered_extent(inode, ordered, 1);
6102                         btrfs_put_ordered_extent(ordered);
6103                         goto again;
6104                 }
6105                 set_page_extent_mapped(page);
6106
6107                 if (i == first_index)
6108                         set_extent_bits(io_tree, page_start, page_end,
6109                                         EXTENT_BOUNDARY, GFP_NOFS);
6110                 btrfs_set_extent_delalloc(inode, page_start, page_end);
6111
6112                 set_page_dirty(page);
6113                 total_dirty++;
6114
6115                 unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
6116                 unlock_page(page);
6117                 page_cache_release(page);
6118         }
6119
6120 out_unlock:
6121         kfree(ra);
6122         mutex_unlock(&inode->i_mutex);
6123         balance_dirty_pages_ratelimited_nr(inode->i_mapping, total_dirty);
6124         return ret;
6125 }
6126
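/*
 * insert a pinned extent map for the extent into the relocation inode
 * and then read in and dirty the pages that cover it.
 */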
6127 static noinline int relocate_data_extent(struct inode *reloc_inode,
6128                                          struct btrfs_key *extent_key,
6129                                          u64 offset)
6130 {
6131         struct btrfs_root *root = BTRFS_I(reloc_inode)->root;
6132         struct extent_map_tree *em_tree = &BTRFS_I(reloc_inode)->extent_tree;
6133         struct extent_map *em;
6134         u64 start = extent_key->objectid - offset;
6135         u64 end = start + extent_key->offset - 1;
6136
6137         em = alloc_extent_map(GFP_NOFS);
6138         BUG_ON(!em || IS_ERR(em));
6139
6140         em->start = start;
6141         em->len = extent_key->offset;
6142         em->block_len = extent_key->offset;
6143         em->block_start = extent_key->objectid;
6144         em->bdev = root->fs_info->fs_devices->latest_bdev;
6145         set_bit(EXTENT_FLAG_PINNED, &em->flags);
6146
6147         /* set up an extent map to cheat btrfs_readpage */
6148         lock_extent(&BTRFS_I(reloc_inode)->io_tree, start, end, GFP_NOFS);
6149         while (1) {
6150                 int ret;
6151                 write_lock(&em_tree->lock);
6152                 ret = add_extent_mapping(em_tree, em);
6153                 write_unlock(&em_tree->lock);
6154                 if (ret != -EEXIST) {
6155                         free_extent_map(em);
6156                         break;
6157                 }
6158                 btrfs_drop_extent_cache(reloc_inode, start, end, 0);
6159         }
6160         unlock_extent(&BTRFS_I(reloc_inode)->io_tree, start, end, GFP_NOFS);
6161
6162         return relocate_inode_pages(reloc_inode, start, extent_key->offset);
6163 }
6164
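/*
 * btrfs_ref_path describes one chain of backrefs from an extent up to
 * a tree root.  nodes[] holds the bytenr of the referencing tree
 * block at each level, and root_objectid/root_generation identify the
 * root the chain ends in.
 */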
6165 struct btrfs_ref_path {
6166         u64 extent_start;
6167         u64 nodes[BTRFS_MAX_LEVEL];
6168         u64 root_objectid;
6169         u64 root_generation;
6170         u64 owner_objectid;
6171         u32 num_refs;
6172         int lowest_level;
6173         int current_level;
6174         int shared_level;
6175
6176         struct btrfs_key node_keys[BTRFS_MAX_LEVEL];
6177         u64 new_nodes[BTRFS_MAX_LEVEL];
6178 };
6179
6180 struct disk_extent {
6181         u64 ram_bytes;
6182         u64 disk_bytenr;
6183         u64 disk_num_bytes;
6184         u64 offset;
6185         u64 num_bytes;
6186         u8 compression;
6187         u8 encryption;
6188         u16 other_encoding;
6189 };
6190
6191 static int is_cowonly_root(u64 root_objectid)
6192 {
6193         if (root_objectid == BTRFS_ROOT_TREE_OBJECTID ||
6194             root_objectid == BTRFS_EXTENT_TREE_OBJECTID ||
6195             root_objectid == BTRFS_CHUNK_TREE_OBJECTID ||
6196             root_objectid == BTRFS_DEV_TREE_OBJECTID ||
6197             root_objectid == BTRFS_TREE_LOG_OBJECTID ||
6198             root_objectid == BTRFS_CSUM_TREE_OBJECTID)
6199                 return 1;
6200         return 0;
6201 }
6202
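/*
 * advance ref_path to the next chain of backrefs for the extent.  the
 * walk_up loop climbs from the current block towards a tree root via
 * BTRFS_EXTENT_REF_KEY items, while walk_down backtracks to the next
 * unvisited reference.  returns 0 when a root is found and 1 when all
 * chains are exhausted.
 */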
6203 static noinline int __next_ref_path(struct btrfs_trans_handle *trans,
6204                                     struct btrfs_root *extent_root,
6205                                     struct btrfs_ref_path *ref_path,
6206                                     int first_time)
6207 {
6208         struct extent_buffer *leaf;
6209         struct btrfs_path *path;
6210         struct btrfs_extent_ref *ref;
6211         struct btrfs_key key;
6212         struct btrfs_key found_key;
6213         u64 bytenr;
6214         u32 nritems;
6215         int level;
6216         int ret = 1;
6217
6218         path = btrfs_alloc_path();
6219         if (!path)
6220                 return -ENOMEM;
6221
6222         if (first_time) {
6223                 ref_path->lowest_level = -1;
6224                 ref_path->current_level = -1;
6225                 ref_path->shared_level = -1;
6226                 goto walk_up;
6227         }
6228 walk_down:
6229         level = ref_path->current_level - 1;
6230         while (level >= -1) {
6231                 u64 parent;
6232                 if (level < ref_path->lowest_level)
6233                         break;
6234
6235                 if (level >= 0)
6236                         bytenr = ref_path->nodes[level];
6237                 else
6238                         bytenr = ref_path->extent_start;
6239                 BUG_ON(bytenr == 0);
6240
6241                 parent = ref_path->nodes[level + 1];
6242                 ref_path->nodes[level + 1] = 0;
6243                 ref_path->current_level = level;
6244                 BUG_ON(parent == 0);
6245
6246                 key.objectid = bytenr;
6247                 key.offset = parent + 1;
6248                 key.type = BTRFS_EXTENT_REF_KEY;
6249
6250                 ret = btrfs_search_slot(trans, extent_root, &key, path, 0, 0);
6251                 if (ret < 0)
6252                         goto out;
6253                 BUG_ON(ret == 0);
6254
6255                 leaf = path->nodes[0];
6256                 nritems = btrfs_header_nritems(leaf);
6257                 if (path->slots[0] >= nritems) {
6258                         ret = btrfs_next_leaf(extent_root, path);
6259                         if (ret < 0)
6260                                 goto out;
6261                         if (ret > 0)
6262                                 goto next;
6263                         leaf = path->nodes[0];
6264                 }
6265
6266                 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
6267                 if (found_key.objectid == bytenr &&
6268                     found_key.type == BTRFS_EXTENT_REF_KEY) {
6269                         if (level < ref_path->shared_level)
6270                                 ref_path->shared_level = level;
6271                         goto found;
6272                 }
6273 next:
6274                 level--;
6275                 btrfs_release_path(extent_root, path);
6276                 cond_resched();
6277         }
6278         /* reached lowest level */
6279         ret = 1;
6280         goto out;
6281 walk_up:
6282         level = ref_path->current_level;
6283         while (level < BTRFS_MAX_LEVEL - 1) {
6284                 u64 ref_objectid;
6285
6286                 if (level >= 0)
6287                         bytenr = ref_path->nodes[level];
6288                 else
6289                         bytenr = ref_path->extent_start;
6290
6291                 BUG_ON(bytenr == 0);
6292
6293                 key.objectid = bytenr;
6294                 key.offset = 0;
6295                 key.type = BTRFS_EXTENT_REF_KEY;
6296
6297                 ret = btrfs_search_slot(trans, extent_root, &key, path, 0, 0);
6298                 if (ret < 0)
6299                         goto out;
6300
6301                 leaf = path->nodes[0];
6302                 nritems = btrfs_header_nritems(leaf);
6303                 if (path->slots[0] >= nritems) {
6304                         ret = btrfs_next_leaf(extent_root, path);
6305                         if (ret < 0)
6306                                 goto out;
6307                         if (ret > 0) {
6308                                 /* the extent was freed by someone */
6309                                 if (ref_path->lowest_level == level)
6310                                         goto out;
6311                                 btrfs_release_path(extent_root, path);
6312                                 goto walk_down;
6313                         }
6314                         leaf = path->nodes[0];
6315                 }
6316
6317                 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
6318                 if (found_key.objectid != bytenr ||
6319                                 found_key.type != BTRFS_EXTENT_REF_KEY) {
6320                         /* the extent was freed by someone */
6321                         if (ref_path->lowest_level == level) {
6322                                 ret = 1;
6323                                 goto out;
6324                         }
6325                         btrfs_release_path(extent_root, path);
6326                         goto walk_down;
6327                 }
6328 found:
6329                 ref = btrfs_item_ptr(leaf, path->slots[0],
6330                                 struct btrfs_extent_ref);
6331                 ref_objectid = btrfs_ref_objectid(leaf, ref);
6332                 if (ref_objectid < BTRFS_FIRST_FREE_OBJECTID) {
6333                         if (first_time) {
6334                                 level = (int)ref_objectid;
6335                                 BUG_ON(level >= BTRFS_MAX_LEVEL);
6336                                 ref_path->lowest_level = level;
6337                                 ref_path->current_level = level;
6338                                 ref_path->nodes[level] = bytenr;
6339                         } else {
6340                                 WARN_ON(ref_objectid != level);
6341                         }
6342                 } else {
6343                         WARN_ON(level != -1);
6344                 }
6345                 first_time = 0;
6346
6347                 if (ref_path->lowest_level == level) {
6348                         ref_path->owner_objectid = ref_objectid;
6349                         ref_path->num_refs = btrfs_ref_num_refs(leaf, ref);
6350                 }
6351
6352                 /*
6353                  * the block is a tree root or the block isn't in a
6354                  * reference counted tree.
6355                  */
6356                 if (found_key.objectid == found_key.offset ||
6357                     is_cowonly_root(btrfs_ref_root(leaf, ref))) {
6358                         ref_path->root_objectid = btrfs_ref_root(leaf, ref);
6359                         ref_path->root_generation =
6360                                 btrfs_ref_generation(leaf, ref);
6361                         if (level < 0) {
6362                                 /* special reference from the tree log */
6363                                 ref_path->nodes[0] = found_key.offset;
6364                                 ref_path->current_level = 0;
6365                         }
6366                         ret = 0;
6367                         goto out;
6368                 }
6369
6370                 level++;
6371                 BUG_ON(ref_path->nodes[level] != 0);
6372                 ref_path->nodes[level] = found_key.offset;
6373                 ref_path->current_level = level;
6374
6375                 /*
6376                  * the reference was created in the running transaction,
6377                  * no need to continue walking up.
6378                  */
6379                 if (btrfs_ref_generation(leaf, ref) == trans->transid) {
6380                         ref_path->root_objectid = btrfs_ref_root(leaf, ref);
6381                         ref_path->root_generation =
6382                                 btrfs_ref_generation(leaf, ref);
6383                         ret = 0;
6384                         goto out;
6385                 }
6386
6387                 btrfs_release_path(extent_root, path);
6388                 cond_resched();
6389         }
6390         /* reached max tree level, but no tree root found. */
6391         BUG();
6392 out:
6393         btrfs_free_path(path);
6394         return ret;
6395 }
6396
6397 static int btrfs_first_ref_path(struct btrfs_trans_handle *trans,
6398                                 struct btrfs_root *extent_root,
6399                                 struct btrfs_ref_path *ref_path,
6400                                 u64 extent_start)
6401 {
6402         memset(ref_path, 0, sizeof(*ref_path));
6403         ref_path->extent_start = extent_start;
6404
6405         return __next_ref_path(trans, extent_root, ref_path, 1);
6406 }
6407
6408 static int btrfs_next_ref_path(struct btrfs_trans_handle *trans,
6409                                struct btrfs_root *extent_root,
6410                                struct btrfs_ref_path *ref_path)
6411 {
6412         return __next_ref_path(trans, extent_root, ref_path, 0);
6413 }
6414
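/*
 * collect the file extents of the relocation inode that cover the
 * range of extent_key, growing the *extents array as needed.  when
 * no_fragment is set and the range is not covered by a single extent,
 * 1 is returned instead.
 */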
6415 static noinline int get_new_locations(struct inode *reloc_inode,
6416                                       struct btrfs_key *extent_key,
6417                                       u64 offset, int no_fragment,
6418                                       struct disk_extent **extents,
6419                                       int *nr_extents)
6420 {
6421         struct btrfs_root *root = BTRFS_I(reloc_inode)->root;
6422         struct btrfs_path *path;
6423         struct btrfs_file_extent_item *fi;
6424         struct extent_buffer *leaf;
6425         struct disk_extent *exts = *extents;
6426         struct btrfs_key found_key;
6427         u64 cur_pos;
6428         u64 last_byte;
6429         u32 nritems;
6430         int nr = 0;
6431         int max = *nr_extents;
6432         int ret;
6433
6434         WARN_ON(!no_fragment && *extents);
6435         if (!exts) {
6436                 max = 1;
6437                 exts = kmalloc(sizeof(*exts) * max, GFP_NOFS);
6438                 if (!exts)
6439                         return -ENOMEM;
6440         }
6441
6442         path = btrfs_alloc_path();
6443         BUG_ON(!path);
6444
6445         cur_pos = extent_key->objectid - offset;
6446         last_byte = extent_key->objectid + extent_key->offset;
6447         ret = btrfs_lookup_file_extent(NULL, root, path, reloc_inode->i_ino,
6448                                        cur_pos, 0);
6449         if (ret < 0)
6450                 goto out;
6451         if (ret > 0) {
6452                 ret = -ENOENT;
6453                 goto out;
6454         }
6455
6456         while (1) {
6457                 leaf = path->nodes[0];
6458                 nritems = btrfs_header_nritems(leaf);
6459                 if (path->slots[0] >= nritems) {
6460                         ret = btrfs_next_leaf(root, path);
6461                         if (ret < 0)
6462                                 goto out;
6463                         if (ret > 0)
6464                                 break;
6465                         leaf = path->nodes[0];
6466                 }
6467
6468                 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
6469                 if (found_key.offset != cur_pos ||
6470                     found_key.type != BTRFS_EXTENT_DATA_KEY ||
6471                     found_key.objectid != reloc_inode->i_ino)
6472                         break;
6473
6474                 fi = btrfs_item_ptr(leaf, path->slots[0],
6475                                     struct btrfs_file_extent_item);
6476                 if (btrfs_file_extent_type(leaf, fi) !=
6477                     BTRFS_FILE_EXTENT_REG ||
6478                     btrfs_file_extent_disk_bytenr(leaf, fi) == 0)
6479                         break;
6480
6481                 if (nr == max) {
6482                         struct disk_extent *old = exts;
6483                         max *= 2;
6484                         exts = kzalloc(sizeof(*exts) * max, GFP_NOFS);
6485                         memcpy(exts, old, sizeof(*exts) * nr);
6486                         if (old != *extents)
6487                                 kfree(old);
6488                 }
6489
6490                 exts[nr].disk_bytenr =
6491                         btrfs_file_extent_disk_bytenr(leaf, fi);
6492                 exts[nr].disk_num_bytes =
6493                         btrfs_file_extent_disk_num_bytes(leaf, fi);
6494                 exts[nr].offset = btrfs_file_extent_offset(leaf, fi);
6495                 exts[nr].num_bytes = btrfs_file_extent_num_bytes(leaf, fi);
6496                 exts[nr].ram_bytes = btrfs_file_extent_ram_bytes(leaf, fi);
6497                 exts[nr].compression = btrfs_file_extent_compression(leaf, fi);
6498                 exts[nr].encryption = btrfs_file_extent_encryption(leaf, fi);
6499                 exts[nr].other_encoding = btrfs_file_extent_other_encoding(leaf,
6500                                                                            fi);
6501                 BUG_ON(exts[nr].offset > 0);
6502                 BUG_ON(exts[nr].compression || exts[nr].encryption);
6503                 BUG_ON(exts[nr].num_bytes != exts[nr].disk_num_bytes);
6504
6505                 cur_pos += exts[nr].num_bytes;
6506                 nr++;
6507
6508                 if (cur_pos + offset >= last_byte)
6509                         break;
6510
6511                 if (no_fragment) {
6512                         ret = 1;
6513                         goto out;
6514                 }
6515                 path->slots[0]++;
6516         }
6517
6518         BUG_ON(cur_pos + offset > last_byte);
6519         if (cur_pos + offset < last_byte) {
6520                 ret = -ENOENT;
6521                 goto out;
6522         }
6523         ret = 0;
6524 out:
6525         btrfs_free_path(path);
6526         if (ret) {
6527                 if (exts != *extents)
6528                         kfree(exts);
6529         } else {
6530                 *extents = exts;
6531                 *nr_extents = nr;
6532         }
6533         return ret;
6534 }
6535
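/*
 * rewrite the file extent items that point at extent_key so they
 * point at new_extents instead, locking the affected file range and
 * updating the extent reference counts along the way.
 */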
6536 static noinline int replace_one_extent(struct btrfs_trans_handle *trans,
6537                                         struct btrfs_root *root,
6538                                         struct btrfs_path *path,
6539                                         struct btrfs_key *extent_key,
6540                                         struct btrfs_key *leaf_key,
6541                                         struct btrfs_ref_path *ref_path,
6542                                         struct disk_extent *new_extents,
6543                                         int nr_extents)
6544 {
6545         struct extent_buffer *leaf;
6546         struct btrfs_file_extent_item *fi;
6547         struct inode *inode = NULL;
6548         struct btrfs_key key;
6549         u64 lock_start = 0;
6550         u64 lock_end = 0;
6551         u64 num_bytes;
6552         u64 ext_offset;
6553         u64 search_end = (u64)-1;
6554         u32 nritems;
6555         int nr_scaned = 0;
6556         int extent_locked = 0;
6557         int extent_type;
6558         int ret;
6559
6560         memcpy(&key, leaf_key, sizeof(key));
6561         if (ref_path->owner_objectid != BTRFS_MULTIPLE_OBJECTIDS) {
6562                 if (key.objectid < ref_path->owner_objectid ||
6563                     (key.objectid == ref_path->owner_objectid &&
6564                      key.type < BTRFS_EXTENT_DATA_KEY)) {
6565                         key.objectid = ref_path->owner_objectid;
6566                         key.type = BTRFS_EXTENT_DATA_KEY;
6567                         key.offset = 0;
6568                 }
6569         }
6570
6571         while (1) {
6572                 ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
6573                 if (ret < 0)
6574                         goto out;
6575
6576                 leaf = path->nodes[0];
6577                 nritems = btrfs_header_nritems(leaf);
6578 next:
6579                 if (extent_locked && ret > 0) {
6580                         /*
6581                          * the file extent item was modified by someone
6582                          * before the extent got locked.
6583                          */
6584                         unlock_extent(&BTRFS_I(inode)->io_tree, lock_start,
6585                                       lock_end, GFP_NOFS);
6586                         extent_locked = 0;
6587                 }
6588
6589                 if (path->slots[0] >= nritems) {
6590                         if (++nr_scaned > 2)
6591                                 break;
6592
6593                         BUG_ON(extent_locked);
6594                         ret = btrfs_next_leaf(root, path);
6595                         if (ret < 0)
6596                                 goto out;
6597                         if (ret > 0)
6598                                 break;
6599                         leaf = path->nodes[0];
6600                         nritems = btrfs_header_nritems(leaf);
6601                 }
6602
6603                 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
6604
6605                 if (ref_path->owner_objectid != BTRFS_MULTIPLE_OBJECTIDS) {
6606                         if ((key.objectid > ref_path->owner_objectid) ||
6607                             (key.objectid == ref_path->owner_objectid &&
6608                              key.type > BTRFS_EXTENT_DATA_KEY) ||
6609                             key.offset >= search_end)
6610                                 break;
6611                 }
6612
6613                 if (inode && key.objectid != inode->i_ino) {
6614                         BUG_ON(extent_locked);
6615                         btrfs_release_path(root, path);
6616                         mutex_unlock(&inode->i_mutex);
6617                         iput(inode);
6618                         inode = NULL;
6619                         continue;
6620                 }
6621
6622                 if (key.type != BTRFS_EXTENT_DATA_KEY) {
6623                         path->slots[0]++;
6624                         ret = 1;
6625                         goto next;
6626                 }
6627                 fi = btrfs_item_ptr(leaf, path->slots[0],
6628                                     struct btrfs_file_extent_item);
6629                 extent_type = btrfs_file_extent_type(leaf, fi);
6630                 if ((extent_type != BTRFS_FILE_EXTENT_REG &&
6631                      extent_type != BTRFS_FILE_EXTENT_PREALLOC) ||
6632                     (btrfs_file_extent_disk_bytenr(leaf, fi) !=
6633                      extent_key->objectid)) {
6634                         path->slots[0]++;
6635                         ret = 1;
6636                         goto next;
6637                 }
6638
6639                 num_bytes = btrfs_file_extent_num_bytes(leaf, fi);
6640                 ext_offset = btrfs_file_extent_offset(leaf, fi);
6641
6642                 if (search_end == (u64)-1) {
6643                         search_end = key.offset - ext_offset +
6644                                 btrfs_file_extent_ram_bytes(leaf, fi);
6645                 }
6646
6647                 if (!extent_locked) {
6648                         lock_start = key.offset;
6649                         lock_end = lock_start + num_bytes - 1;
6650                 } else {
6651                         if (lock_start > key.offset ||
6652                             lock_end + 1 < key.offset + num_bytes) {
6653                                 unlock_extent(&BTRFS_I(inode)->io_tree,
6654                                               lock_start, lock_end, GFP_NOFS);
6655                                 extent_locked = 0;
6656                         }
6657                 }
6658
6659                 if (!inode) {
6660                         btrfs_release_path(root, path);
6661
6662                         inode = btrfs_iget_locked(root->fs_info->sb,
6663                                                   key.objectid, root);
6664                         if (inode->i_state & I_NEW) {
6665                                 BTRFS_I(inode)->root = root;
6666                                 BTRFS_I(inode)->location.objectid =
6667                                         key.objectid;
6668                                 BTRFS_I(inode)->location.type =
6669                                         BTRFS_INODE_ITEM_KEY;
6670                                 BTRFS_I(inode)->location.offset = 0;
6671                                 btrfs_read_locked_inode(inode);
6672                                 unlock_new_inode(inode);
6673                         }
6674                         /*
6675                          * some code calls btrfs_commit_transaction while
6676                          * holding the i_mutex, so we can't use mutex_lock
6677                          * here.
6678                          */
6679                         if (is_bad_inode(inode) ||
6680                             !mutex_trylock(&inode->i_mutex)) {
6681                                 iput(inode);
6682                                 inode = NULL;
6683                                 key.offset = (u64)-1;
6684                                 goto skip;
6685                         }
6686                 }
6687
6688                 if (!extent_locked) {
6689                         struct btrfs_ordered_extent *ordered;
6690
6691                         btrfs_release_path(root, path);
6692
6693                         lock_extent(&BTRFS_I(inode)->io_tree, lock_start,
6694                                     lock_end, GFP_NOFS);
6695                         ordered = btrfs_lookup_first_ordered_extent(inode,
6696                                                                     lock_end);
6697                         if (ordered &&
6698                             ordered->file_offset <= lock_end &&
6699                             ordered->file_offset + ordered->len > lock_start) {
6700                                 unlock_extent(&BTRFS_I(inode)->io_tree,
6701                                               lock_start, lock_end, GFP_NOFS);
6702                                 btrfs_start_ordered_extent(inode, ordered, 1);
6703                                 btrfs_put_ordered_extent(ordered);
6704                                 key.offset += num_bytes;
6705                                 goto skip;
6706                         }
6707                         if (ordered)
6708                                 btrfs_put_ordered_extent(ordered);
6709
6710                         extent_locked = 1;
6711                         continue;
6712                 }
6713
6714                 if (nr_extents == 1) {
6715                         /* update extent pointer in place */
6716                         btrfs_set_file_extent_disk_bytenr(leaf, fi,
6717                                                 new_extents[0].disk_bytenr);
6718                         btrfs_set_file_extent_disk_num_bytes(leaf, fi,
6719                                                 new_extents[0].disk_num_bytes);
6720                         btrfs_mark_buffer_dirty(leaf);
6721
6722                         btrfs_drop_extent_cache(inode, key.offset,
6723                                                 key.offset + num_bytes - 1, 0);
6724
6725                         ret = btrfs_inc_extent_ref(trans, root,
6726                                                 new_extents[0].disk_bytenr,
6727                                                 new_extents[0].disk_num_bytes,
6728                                                 leaf->start,
6729                                                 root->root_key.objectid,
6730                                                 trans->transid,
6731                                                 key.objectid);
6732                         BUG_ON(ret);
6733
6734                         ret = btrfs_free_extent(trans, root,
6735                                                 extent_key->objectid,
6736                                                 extent_key->offset,
6737                                                 leaf->start,
6738                                                 btrfs_header_owner(leaf),
6739                                                 btrfs_header_generation(leaf),
6740                                                 key.objectid, 0);
6741                         BUG_ON(ret);
6742
6743                         btrfs_release_path(root, path);
6744                         key.offset += num_bytes;
6745                 } else {
6746                         BUG_ON(1);
6747 #if 0
6748                         u64 alloc_hint;
6749                         u64 extent_len;
6750                         int i;
6751                         /*
6752                          * drop the old extent pointer first, then insert the
6753                          * new pointers one by one
6754                          */
6755                         btrfs_release_path(root, path);
6756                         ret = btrfs_drop_extents(trans, root, inode, key.offset,
6757                                                  key.offset + num_bytes,
6758                                                  key.offset, &alloc_hint);
6759                         BUG_ON(ret);
6760
6761                         for (i = 0; i < nr_extents; i++) {
6762                                 if (ext_offset >= new_extents[i].num_bytes) {
6763                                         ext_offset -= new_extents[i].num_bytes;
6764                                         continue;
6765                                 }
6766                                 extent_len = min(new_extents[i].num_bytes -
6767                                                  ext_offset, num_bytes);
6768
6769                                 ret = btrfs_insert_empty_item(trans, root,
6770                                                               path, &key,
6771                                                               sizeof(*fi));
6772                                 BUG_ON(ret);
6773
6774                                 leaf = path->nodes[0];
6775                                 fi = btrfs_item_ptr(leaf, path->slots[0],
6776                                                 struct btrfs_file_extent_item);
6777                                 btrfs_set_file_extent_generation(leaf, fi,
6778                                                         trans->transid);
6779                                 btrfs_set_file_extent_type(leaf, fi,
6780                                                         BTRFS_FILE_EXTENT_REG);
6781                                 btrfs_set_file_extent_disk_bytenr(leaf, fi,
6782                                                 new_extents[i].disk_bytenr);
6783                                 btrfs_set_file_extent_disk_num_bytes(leaf, fi,
6784                                                 new_extents[i].disk_num_bytes);
6785                                 btrfs_set_file_extent_ram_bytes(leaf, fi,
6786                                                 new_extents[i].ram_bytes);
6787
6788                                 btrfs_set_file_extent_compression(leaf, fi,
6789                                                 new_extents[i].compression);
6790                                 btrfs_set_file_extent_encryption(leaf, fi,
6791                                                 new_extents[i].encryption);
6792                                 btrfs_set_file_extent_other_encoding(leaf, fi,
6793                                                 new_extents[i].other_encoding);
6794
6795                                 btrfs_set_file_extent_num_bytes(leaf, fi,
6796                                                         extent_len);
6797                                 ext_offset += new_extents[i].offset;
6798                                 btrfs_set_file_extent_offset(leaf, fi,
6799                                                         ext_offset);
6800                                 btrfs_mark_buffer_dirty(leaf);
6801
6802                                 btrfs_drop_extent_cache(inode, key.offset,
6803                                                 key.offset + extent_len - 1, 0);
6804
6805                                 ret = btrfs_inc_extent_ref(trans, root,
6806                                                 new_extents[i].disk_bytenr,
6807                                                 new_extents[i].disk_num_bytes,
6808                                                 leaf->start,
6809                                                 root->root_key.objectid,
6810                                                 trans->transid, key.objectid);
6811                                 BUG_ON(ret);
6812                                 btrfs_release_path(root, path);
6813
6814                                 inode_add_bytes(inode, extent_len);
6815
6816                                 ext_offset = 0;
6817                                 num_bytes -= extent_len;
6818                                 key.offset += extent_len;
6819
6820                                 if (num_bytes == 0)
6821                                         break;
6822                         }
6823                         BUG_ON(i >= nr_extents);
6824 #endif
6825                 }
6826
6827                 if (extent_locked) {
6828                         unlock_extent(&BTRFS_I(inode)->io_tree, lock_start,
6829                                       lock_end, GFP_NOFS);
6830                         extent_locked = 0;
6831                 }
6832 skip:
6833                 if (ref_path->owner_objectid != BTRFS_MULTIPLE_OBJECTIDS &&
6834                     key.offset >= search_end)
6835                         break;
6836
6837                 cond_resched();
6838         }
6839         ret = 0;
6840 out:
6841         btrfs_release_path(root, path);
6842         if (inode) {
6843                 mutex_unlock(&inode->i_mutex);
6844                 if (extent_locked) {
6845                         unlock_extent(&BTRFS_I(inode)->io_tree, lock_start,
6846                                       lock_end, GFP_NOFS);
6847                 }
6848                 iput(inode);
6849         }
6850         return ret;
6851 }
6852
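/*
 * a new block in the tree reloc root starts life as a COW copy of an
 * existing leaf or node.  for leaves, duplicate the cached leaf ref of
 * the original block (the list of data extents it points to) for the
 * new block, so the leaf ref cache stays valid after relocation.
 */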
6853 int btrfs_reloc_tree_cache_ref(struct btrfs_trans_handle *trans,
6854                                struct btrfs_root *root,
6855                                struct extent_buffer *buf, u64 orig_start)
6856 {
6857         int level;
6858         int ret;
6859
6860         BUG_ON(btrfs_header_generation(buf) != trans->transid);
6861         BUG_ON(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID);
6862
6863         level = btrfs_header_level(buf);
6864         if (level == 0) {
6865                 struct btrfs_leaf_ref *ref;
6866                 struct btrfs_leaf_ref *orig_ref;
6867
6868                 orig_ref = btrfs_lookup_leaf_ref(root, orig_start);
6869                 if (!orig_ref)
6870                         return -ENOENT;
6871
6872                 ref = btrfs_alloc_leaf_ref(root, orig_ref->nritems);
6873                 if (!ref) {
6874                         btrfs_free_leaf_ref(root, orig_ref);
6875                         return -ENOMEM;
6876                 }
6877
6878                 ref->nritems = orig_ref->nritems;
6879                 memcpy(ref->extents, orig_ref->extents,
6880                         sizeof(ref->extents[0]) * ref->nritems);
6881
6882                 btrfs_free_leaf_ref(root, orig_ref);
6883
6884                 ref->root_gen = trans->transid;
6885                 ref->bytenr = buf->start;
6886                 ref->owner = btrfs_header_owner(buf);
6887                 ref->generation = btrfs_header_generation(buf);
6888
6889                 ret = btrfs_add_leaf_ref(root, ref, 0);
6890                 WARN_ON(ret);
6891                 btrfs_free_leaf_ref(root, ref);
6892         }
6893         return 0;
6894 }
6895
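/*
 * drop the cached extent mappings in @target_root for every file extent
 * referenced by @leaf, so inodes in the target root don't keep using
 * stale mappings for extents that are being relocated.  inline extents
 * and holes (disk_bytenr == 0) are skipped.
 */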
6896 static noinline int invalidate_extent_cache(struct btrfs_root *root,
6897                                         struct extent_buffer *leaf,
6898                                         struct btrfs_block_group_cache *group,
6899                                         struct btrfs_root *target_root)
6900 {
6901         struct btrfs_key key;
6902         struct inode *inode = NULL;
6903         struct btrfs_file_extent_item *fi;
6904         struct extent_state *cached_state = NULL;
6905         u64 num_bytes;
6906         u64 skip_objectid = 0;
6907         u32 nritems;
6908         u32 i;
6909
6910         nritems = btrfs_header_nritems(leaf);
6911         for (i = 0; i < nritems; i++) {
6912                 btrfs_item_key_to_cpu(leaf, &key, i);
6913                 if (key.objectid == skip_objectid ||
6914                     key.type != BTRFS_EXTENT_DATA_KEY)
6915                         continue;
6916                 fi = btrfs_item_ptr(leaf, i, struct btrfs_file_extent_item);
6917                 if (btrfs_file_extent_type(leaf, fi) ==
6918                     BTRFS_FILE_EXTENT_INLINE)
6919                         continue;
6920                 if (btrfs_file_extent_disk_bytenr(leaf, fi) == 0)
6921                         continue;
6922                 if (!inode || inode->i_ino != key.objectid) {
6923                         iput(inode);
6924                         inode = btrfs_ilookup(target_root->fs_info->sb,
6925                                               key.objectid, target_root, 1);
6926                 }
6927                 if (!inode) {
6928                         skip_objectid = key.objectid;
6929                         continue;
6930                 }
6931                 num_bytes = btrfs_file_extent_num_bytes(leaf, fi);
6932
6933                 lock_extent_bits(&BTRFS_I(inode)->io_tree, key.offset,
6934                                  key.offset + num_bytes - 1, 0, &cached_state,
6935                                  GFP_NOFS);
6936                 btrfs_drop_extent_cache(inode, key.offset,
6937                                         key.offset + num_bytes - 1, 1);
6938                 unlock_extent_cached(&BTRFS_I(inode)->io_tree, key.offset,
6939                                      key.offset + num_bytes - 1, &cached_state,
6940                                      GFP_NOFS);
6941                 cond_resched();
6942         }
6943         iput(inode);
6944         return 0;
6945 }
6946
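/*
 * rewrite every file extent pointer in @leaf that falls inside @group:
 * get_new_locations() reports where the data was copied via
 * @reloc_inode, the extent item and the cached leaf ref are updated in
 * place, then a ref is added to the new extent and the ref to the old
 * extent is dropped.
 */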
6947 static noinline int replace_extents_in_leaf(struct btrfs_trans_handle *trans,
6948                                         struct btrfs_root *root,
6949                                         struct extent_buffer *leaf,
6950                                         struct btrfs_block_group_cache *group,
6951                                         struct inode *reloc_inode)
6952 {
6953         struct btrfs_key key;
6954         struct btrfs_key extent_key;
6955         struct btrfs_file_extent_item *fi;
6956         struct btrfs_leaf_ref *ref;
6957         struct disk_extent *new_extent;
6958         u64 bytenr;
6959         u64 num_bytes;
6960         u32 nritems;
6961         u32 i;
6962         int ext_index;
6963         int nr_extent;
6964         int ret;
6965
6966         new_extent = kmalloc(sizeof(*new_extent), GFP_NOFS);
6967         BUG_ON(!new_extent);
6968
6969         ref = btrfs_lookup_leaf_ref(root, leaf->start);
6970         BUG_ON(!ref);
6971
6972         ext_index = -1;
6973         nritems = btrfs_header_nritems(leaf);
6974         for (i = 0; i < nritems; i++) {
6975                 btrfs_item_key_to_cpu(leaf, &key, i);
6976                 if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY)
6977                         continue;
6978                 fi = btrfs_item_ptr(leaf, i, struct btrfs_file_extent_item);
6979                 if (btrfs_file_extent_type(leaf, fi) ==
6980                     BTRFS_FILE_EXTENT_INLINE)
6981                         continue;
6982                 bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
6983                 num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi);
6984                 if (bytenr == 0)
6985                         continue;
6986
6987                 ext_index++;
6988                 if (bytenr >= group->key.objectid + group->key.offset ||
6989                     bytenr + num_bytes <= group->key.objectid)
6990                         continue;
6991
6992                 extent_key.objectid = bytenr;
6993                 extent_key.offset = num_bytes;
6994                 extent_key.type = BTRFS_EXTENT_ITEM_KEY;
6995                 nr_extent = 1;
6996                 ret = get_new_locations(reloc_inode, &extent_key,
6997                                         group->key.objectid, 1,
6998                                         &new_extent, &nr_extent);
6999                 if (ret > 0)
7000                         continue;
7001                 BUG_ON(ret < 0);
7002
7003                 BUG_ON(ref->extents[ext_index].bytenr != bytenr);
7004                 BUG_ON(ref->extents[ext_index].num_bytes != num_bytes);
7005                 ref->extents[ext_index].bytenr = new_extent->disk_bytenr;
7006                 ref->extents[ext_index].num_bytes = new_extent->disk_num_bytes;
7007
7008                 btrfs_set_file_extent_disk_bytenr(leaf, fi,
7009                                                 new_extent->disk_bytenr);
7010                 btrfs_set_file_extent_disk_num_bytes(leaf, fi,
7011                                                 new_extent->disk_num_bytes);
7012                 btrfs_mark_buffer_dirty(leaf);
7013
7014                 ret = btrfs_inc_extent_ref(trans, root,
7015                                         new_extent->disk_bytenr,
7016                                         new_extent->disk_num_bytes,
7017                                         leaf->start,
7018                                         root->root_key.objectid,
7019                                         trans->transid, key.objectid);
7020                 BUG_ON(ret);
7021
7022                 ret = btrfs_free_extent(trans, root,
7023                                         bytenr, num_bytes, leaf->start,
7024                                         btrfs_header_owner(leaf),
7025                                         btrfs_header_generation(leaf),
7026                                         key.objectid, 0);
7027                 BUG_ON(ret);
7028                 cond_resched();
7029         }
7030         kfree(new_extent);
7031         BUG_ON(ext_index + 1 != ref->nritems);
7032         btrfs_free_leaf_ref(root, ref);
7033         return 0;
7034 }
7035
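/*
 * detach the reloc tree from @root and queue it on the per-fs
 * dead_reloc_roots list.  the reloc root's item is refreshed (bytenr,
 * level, drop progress reset) and written back so the tree can be
 * dropped later by btrfs_drop_dead_reloc_roots().
 */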
7036 int btrfs_free_reloc_root(struct btrfs_trans_handle *trans,
7037                           struct btrfs_root *root)
7038 {
7039         struct btrfs_root *reloc_root;
7040         int ret;
7041
7042         if (root->reloc_root) {
7043                 reloc_root = root->reloc_root;
7044                 root->reloc_root = NULL;
7045                 list_add(&reloc_root->dead_list,
7046                          &root->fs_info->dead_reloc_roots);
7047
7048                 btrfs_set_root_bytenr(&reloc_root->root_item,
7049                                       reloc_root->node->start);
7050                 btrfs_set_root_level(&reloc_root->root_item,
7051                                      btrfs_header_level(reloc_root->node));
7052                 memset(&reloc_root->root_item.drop_progress, 0,
7053                         sizeof(struct btrfs_disk_key));
7054                 reloc_root->root_item.drop_level = 0;
7055
7056                 ret = btrfs_update_root(trans, root->fs_info->tree_root,
7057                                         &reloc_root->root_key,
7058                                         &reloc_root->root_item);
7059                 BUG_ON(ret);
7060         }
7061         return 0;
7062 }
7063
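/*
 * drop every reloc tree queued on fs_info->dead_reloc_roots.  each tree
 * is dropped across multiple small transactions (btrfs_drop_snapshot()
 * returns -EAGAIN while there is more to do), then its root item is
 * deleted from the tree root.  root structs are freed one iteration
 * late, and the last one has its leaf refs removed before the free.
 */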
7064 int btrfs_drop_dead_reloc_roots(struct btrfs_root *root)
7065 {
7066         struct btrfs_trans_handle *trans;
7067         struct btrfs_root *reloc_root;
7068         struct btrfs_root *prev_root = NULL;
7069         struct list_head dead_roots;
7070         int ret;
7071         unsigned long nr;
7072
7073         INIT_LIST_HEAD(&dead_roots);
7074         list_splice_init(&root->fs_info->dead_reloc_roots, &dead_roots);
7075
7076         while (!list_empty(&dead_roots)) {
7077                 reloc_root = list_entry(dead_roots.prev,
7078                                         struct btrfs_root, dead_list);
7079                 list_del_init(&reloc_root->dead_list);
7080
7081                 BUG_ON(reloc_root->commit_root != NULL);
7082                 while (1) {
7083                         trans = btrfs_join_transaction(root, 1);
7084                         BUG_ON(!trans);
7085
7086                         mutex_lock(&root->fs_info->drop_mutex);
7087                         ret = btrfs_drop_snapshot(trans, reloc_root);
7088                         if (ret != -EAGAIN)
7089                                 break;
7090                         mutex_unlock(&root->fs_info->drop_mutex);
7091
7092                         nr = trans->blocks_used;
7093                         ret = btrfs_end_transaction(trans, root);
7094                         BUG_ON(ret);
7095                         btrfs_btree_balance_dirty(root, nr);
7096                 }
7097
7098                 free_extent_buffer(reloc_root->node);
7099
7100                 ret = btrfs_del_root(trans, root->fs_info->tree_root,
7101                                      &reloc_root->root_key);
7102                 BUG_ON(ret);
7103                 mutex_unlock(&root->fs_info->drop_mutex);
7104
7105                 nr = trans->blocks_used;
7106                 ret = btrfs_end_transaction(trans, root);
7107                 BUG_ON(ret);
7108                 btrfs_btree_balance_dirty(root, nr);
7109
7110                 kfree(prev_root);
7111                 prev_root = reloc_root;
7112         }
7113         if (prev_root) {
7114                 btrfs_remove_leaf_refs(prev_root, (u64)-1, 0);
7115                 kfree(prev_root);
7116         }
7117         return 0;
7118 }
7119
7120 int btrfs_add_dead_reloc_root(struct btrfs_root *root)
7121 {
7122         list_add(&root->dead_list, &root->fs_info->dead_reloc_roots);
7123         return 0;
7124 }
7125
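/*
 * finish off reloc trees left over from an interrupted relocation:
 * find any dead reloc roots recorded in the tree root and, if there are
 * some, commit a transaction (which is expected to drop them), then run
 * orphan cleanup on the data reloc tree.
 */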
7126 int btrfs_cleanup_reloc_trees(struct btrfs_root *root)
7127 {
7128         struct btrfs_root *reloc_root;
7129         struct btrfs_trans_handle *trans;
7130         struct btrfs_key location;
7131         int found;
7132         int ret;
7133
7134         mutex_lock(&root->fs_info->tree_reloc_mutex);
7135         ret = btrfs_find_dead_roots(root, BTRFS_TREE_RELOC_OBJECTID, NULL);
7136         BUG_ON(ret);
7137         found = !list_empty(&root->fs_info->dead_reloc_roots);
7138         mutex_unlock(&root->fs_info->tree_reloc_mutex);
7139
7140         if (found) {
7141                 trans = btrfs_start_transaction(root, 1);
7142                 BUG_ON(!trans);
7143                 ret = btrfs_commit_transaction(trans, root);
7144                 BUG_ON(ret);
7145         }
7146
7147         location.objectid = BTRFS_DATA_RELOC_TREE_OBJECTID;
7148         location.offset = (u64)-1;
7149         location.type = BTRFS_ROOT_ITEM_KEY;
7150
7151         reloc_root = btrfs_read_fs_root_no_name(root->fs_info, &location);
7152         BUG_ON(!reloc_root);
7153         btrfs_orphan_cleanup(reloc_root);
7154         return 0;
7155 }
7156
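/*
 * create the reloc tree for @root if it doesn't exist yet: snapshot the
 * committed root with btrfs_copy_root(), insert a root item for the
 * copy under BTRFS_TREE_RELOC_OBJECTID (with offset set to the subvol's
 * objectid), and hook the resulting tree up as root->reloc_root.
 */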
7157 static noinline int init_reloc_tree(struct btrfs_trans_handle *trans,
7158                                     struct btrfs_root *root)
7159 {
7160         struct btrfs_root *reloc_root;
7161         struct extent_buffer *eb;
7162         struct btrfs_root_item *root_item;
7163         struct btrfs_key root_key;
7164         int ret;
7165
7166         BUG_ON(!root->ref_cows);
7167         if (root->reloc_root)
7168                 return 0;
7169
7170         root_item = kmalloc(sizeof(*root_item), GFP_NOFS);
7171         BUG_ON(!root_item);
7172
7173         ret = btrfs_copy_root(trans, root, root->commit_root,
7174                               &eb, BTRFS_TREE_RELOC_OBJECTID);
7175         BUG_ON(ret);
7176
7177         root_key.objectid = BTRFS_TREE_RELOC_OBJECTID;
7178         root_key.offset = root->root_key.objectid;
7179         root_key.type = BTRFS_ROOT_ITEM_KEY;
7180
7181         memcpy(root_item, &root->root_item, sizeof(*root_item));
7182         btrfs_set_root_refs(root_item, 0);
7183         btrfs_set_root_bytenr(root_item, eb->start);
7184         btrfs_set_root_level(root_item, btrfs_header_level(eb));
7185         btrfs_set_root_generation(root_item, trans->transid);
7186
7187         btrfs_tree_unlock(eb);
7188         free_extent_buffer(eb);
7189
7190         ret = btrfs_insert_root(trans, root->fs_info->tree_root,
7191                                 &root_key, root_item);
7192         BUG_ON(ret);
7193         kfree(root_item);
7194
7195         reloc_root = btrfs_read_fs_root_no_radix(root->fs_info->tree_root,
7196                                                  &root_key);
7197         BUG_ON(!reloc_root);
7198         reloc_root->last_trans = trans->transid;
7199         reloc_root->commit_root = NULL;
7200         reloc_root->ref_tree = &root->fs_info->reloc_ref_tree;
7201
7202         root->reloc_root = reloc_root;
7203         return 0;
7204 }
7205
7206 /*
7207  * Core function of space balance.
7208  *
7209  * The idea is to use reloc trees to relocate tree blocks in reference
7210  * counted roots. There is one reloc tree for each subvol, and all
7211  * reloc trees share the same root key objectid. Reloc trees are
7212  * snapshots of the latest committed roots of subvols (root->commit_root).
7213  *
7214  * To relocate a tree block referenced by a subvol, there are two steps:
7215  * COW the block through the subvol's reloc tree, then update the block
7216  * pointer in the subvol to point to the new block. Since all reloc trees
7217  * share the same root key objectid, special handling for tree blocks
7218  * owned by them is easy. Once a tree block has been COWed in one reloc
7219  * tree, we can use the resulting new block directly when the same block
7220  * is required to COW again through other reloc trees. In this way,
7221  * relocated tree blocks are shared between reloc trees, so they are
7222  * also shared between subvols.
7223  */
7224 static noinline int relocate_one_path(struct btrfs_trans_handle *trans,
7225                                       struct btrfs_root *root,
7226                                       struct btrfs_path *path,
7227                                       struct btrfs_key *first_key,
7228                                       struct btrfs_ref_path *ref_path,
7229                                       struct btrfs_block_group_cache *group,
7230                                       struct inode *reloc_inode)
7231 {
7232         struct btrfs_root *reloc_root;
7233         struct extent_buffer *eb = NULL;
7234         struct btrfs_key *keys;
7235         u64 *nodes;
7236         int level;
7237         int shared_level;
7238         int lowest_level = 0;
7239         int ret;
7240
7241         if (ref_path->owner_objectid < BTRFS_FIRST_FREE_OBJECTID)
7242                 lowest_level = ref_path->owner_objectid;
7243
7244         if (!root->ref_cows) {
7245                 path->lowest_level = lowest_level;
7246                 ret = btrfs_search_slot(trans, root, first_key, path, 0, 1);
7247                 BUG_ON(ret < 0);
7248                 path->lowest_level = 0;
7249                 btrfs_release_path(root, path);
7250                 return 0;
7251         }
7252
7253         mutex_lock(&root->fs_info->tree_reloc_mutex);
7254         ret = init_reloc_tree(trans, root);
7255         BUG_ON(ret);
7256         reloc_root = root->reloc_root;
7257
7258         shared_level = ref_path->shared_level;
7259         ref_path->shared_level = BTRFS_MAX_LEVEL - 1;
7260
7261         keys = ref_path->node_keys;
7262         nodes = ref_path->new_nodes;
7263         memset(&keys[shared_level + 1], 0,
7264                sizeof(*keys) * (BTRFS_MAX_LEVEL - shared_level - 1));
7265         memset(&nodes[shared_level + 1], 0,
7266                sizeof(*nodes) * (BTRFS_MAX_LEVEL - shared_level - 1));
7267
7268         if (nodes[lowest_level] == 0) {
7269                 path->lowest_level = lowest_level;
7270                 ret = btrfs_search_slot(trans, reloc_root, first_key, path,
7271                                         0, 1);
7272                 BUG_ON(ret);
7273                 for (level = lowest_level; level < BTRFS_MAX_LEVEL; level++) {
7274                         eb = path->nodes[level];
7275                         if (!eb || eb == reloc_root->node)
7276                                 break;
7277                         nodes[level] = eb->start;
7278                         if (level == 0)
7279                                 btrfs_item_key_to_cpu(eb, &keys[level], 0);
7280                         else
7281                                 btrfs_node_key_to_cpu(eb, &keys[level], 0);
7282                 }
7283                 if (nodes[0] &&
7284                     ref_path->owner_objectid >= BTRFS_FIRST_FREE_OBJECTID) {
7285                         eb = path->nodes[0];
7286                         ret = replace_extents_in_leaf(trans, reloc_root, eb,
7287                                                       group, reloc_inode);
7288                         BUG_ON(ret);
7289                 }
7290                 btrfs_release_path(reloc_root, path);
7291         } else {
7292                 ret = btrfs_merge_path(trans, reloc_root, keys, nodes,
7293                                        lowest_level);
7294                 BUG_ON(ret);
7295         }
7296
7297         /*
7298          * replace tree blocks in the fs tree with tree blocks in
7299          * the reloc tree.
7300          */
7301         ret = btrfs_merge_path(trans, root, keys, nodes, lowest_level);
7302         BUG_ON(ret < 0);
7303
7304         if (ref_path->owner_objectid >= BTRFS_FIRST_FREE_OBJECTID) {
7305                 ret = btrfs_search_slot(trans, reloc_root, first_key, path,
7306                                         0, 0);
7307                 BUG_ON(ret);
7308                 extent_buffer_get(path->nodes[0]);
7309                 eb = path->nodes[0];
7310                 btrfs_release_path(reloc_root, path);
7311                 ret = invalidate_extent_cache(reloc_root, eb, group, root);
7312                 BUG_ON(ret);
7313                 free_extent_buffer(eb);
7314         }
7315
7316         mutex_unlock(&root->fs_info->tree_reloc_mutex);
7317         path->lowest_level = 0;
7318         return 0;
7319 }
7320
7321 static noinline int relocate_tree_block(struct btrfs_trans_handle *trans,
7322                                         struct btrfs_root *root,
7323                                         struct btrfs_path *path,
7324                                         struct btrfs_key *first_key,
7325                                         struct btrfs_ref_path *ref_path)
7326 {
7327         int ret;
7328
7329         ret = relocate_one_path(trans, root, path, first_key,
7330                                 ref_path, NULL, NULL);
7331         BUG_ON(ret);
7332
7333         return 0;
7334 }
7335
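/*
 * delete a bogus extent item with objectid 0 from the extent tree.
 * relocate_one_extent() calls this when it hits such an entry instead
 * of trying to relocate it.
 */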
7336 static noinline int del_extent_zero(struct btrfs_trans_handle *trans,
7337                                     struct btrfs_root *extent_root,
7338                                     struct btrfs_path *path,
7339                                     struct btrfs_key *extent_key)
7340 {
7341         int ret;
7342
7343         ret = btrfs_search_slot(trans, extent_root, extent_key, path, -1, 1);
7344         if (ret)
7345                 goto out;
7346         ret = btrfs_del_item(trans, extent_root, path);
7347 out:
7348         btrfs_release_path(extent_root, path);
7349         return ret;
7350 }
7351
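/*
 * read the root that a reference path starts from.  cow-only roots are
 * keyed with offset 0, reference counted subvol roots with (u64)-1.
 */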
7352 static noinline struct btrfs_root *read_ref_root(struct btrfs_fs_info *fs_info,
7353                                                 struct btrfs_ref_path *ref_path)
7354 {
7355         struct btrfs_key root_key;
7356
7357         root_key.objectid = ref_path->root_objectid;
7358         root_key.type = BTRFS_ROOT_ITEM_KEY;
7359         if (is_cowonly_root(ref_path->root_objectid))
7360                 root_key.offset = 0;
7361         else
7362                 root_key.offset = (u64)-1;
7363
7364         return btrfs_read_fs_root_no_name(fs_info, &root_key);
7365 }
7366
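/*
 * relocate all references to the extent described by @extent_key.  each
 * reference path found by btrfs_first_ref_path()/btrfs_next_ref_path()
 * is handled according to its owner: data extents are copied to new
 * locations on pass 0, updated through the reloc trees on pass 1, or
 * rewritten in place via replace_one_extent() as a fallback on later
 * passes; tree blocks are relocated via relocate_tree_block().
 */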
7367 static noinline int relocate_one_extent(struct btrfs_root *extent_root,
7368                                         struct btrfs_path *path,
7369                                         struct btrfs_key *extent_key,
7370                                         struct btrfs_block_group_cache *group,
7371                                         struct inode *reloc_inode, int pass)
7372 {
7373         struct btrfs_trans_handle *trans;
7374         struct btrfs_root *found_root;
7375         struct btrfs_ref_path *ref_path = NULL;
7376         struct disk_extent *new_extents = NULL;
7377         int nr_extents = 0;
7378         int loops;
7379         int ret;
7380         int level;
7381         struct btrfs_key first_key;
7382         u64 prev_block = 0;
7383
7384
7385         trans = btrfs_start_transaction(extent_root, 1);
7386         BUG_ON(!trans);
7387
7388         if (extent_key->objectid == 0) {
7389                 ret = del_extent_zero(trans, extent_root, path, extent_key);
7390                 goto out;
7391         }
7392
7393         ref_path = kmalloc(sizeof(*ref_path), GFP_NOFS);
7394         if (!ref_path) {
7395                 ret = -ENOMEM;
7396                 goto out;
7397         }
7398
7399         for (loops = 0; ; loops++) {
7400                 if (loops == 0) {
7401                         ret = btrfs_first_ref_path(trans, extent_root, ref_path,
7402                                                    extent_key->objectid);
7403                 } else {
7404                         ret = btrfs_next_ref_path(trans, extent_root, ref_path);
7405                 }
7406                 if (ret < 0)
7407                         goto out;
7408                 if (ret > 0)
7409                         break;
7410
7411                 if (ref_path->root_objectid == BTRFS_TREE_LOG_OBJECTID ||
7412                     ref_path->root_objectid == BTRFS_TREE_RELOC_OBJECTID)
7413                         continue;
7414
7415                 found_root = read_ref_root(extent_root->fs_info, ref_path);
7416                 BUG_ON(!found_root);
7417                 /*
7418                  * for reference counted trees, only process reference paths
7419                  * rooted at the latest committed root.
7420                  */
7421                 if (found_root->ref_cows &&
7422                     ref_path->root_generation != found_root->root_key.offset)
7423                         continue;
7424
7425                 if (ref_path->owner_objectid >= BTRFS_FIRST_FREE_OBJECTID) {
7426                         if (pass == 0) {
7427                                 /*
7428                                  * copy data extents to new locations
7429                                  */
7430                                 u64 group_start = group->key.objectid;
7431                                 ret = relocate_data_extent(reloc_inode,
7432                                                            extent_key,
7433                                                            group_start);
7434                                 if (ret < 0)
7435                                         goto out;
7436                                 break;
7437                         }
7438                         level = 0;
7439                 } else {
7440                         level = ref_path->owner_objectid;
7441                 }
7442
7443                 if (prev_block != ref_path->nodes[level]) {
7444                         struct extent_buffer *eb;
7445                         u64 block_start = ref_path->nodes[level];
7446                         u64 block_size = btrfs_level_size(found_root, level);
7447
7448                         eb = read_tree_block(found_root, block_start,
7449                                              block_size, 0);
7450                         btrfs_tree_lock(eb);
7451                         BUG_ON(level != btrfs_header_level(eb));
7452
7453                         if (level == 0)
7454                                 btrfs_item_key_to_cpu(eb, &first_key, 0);
7455                         else
7456                                 btrfs_node_key_to_cpu(eb, &first_key, 0);
7457
7458                         btrfs_tree_unlock(eb);
7459                         free_extent_buffer(eb);
7460                         prev_block = block_start;
7461                 }
7462
7463                 mutex_lock(&extent_root->fs_info->trans_mutex);
7464                 btrfs_record_root_in_trans(found_root);
7465                 mutex_unlock(&extent_root->fs_info->trans_mutex);
7466                 if (ref_path->owner_objectid >= BTRFS_FIRST_FREE_OBJECTID) {
7467                         /*
7468                          * try to update data extent references while
7469                          * keeping metadata shared between snapshots.
7470                          */
7471                         if (pass == 1) {
7472                                 ret = relocate_one_path(trans, found_root,
7473                                                 path, &first_key, ref_path,
7474                                                 group, reloc_inode);
7475                                 if (ret < 0)
7476                                         goto out;
7477                                 continue;
7478                         }
7479                         /*
7480                          * use fallback method to process the remaining
7481                          * references.
7482                          */
7483                         if (!new_extents) {
7484                                 u64 group_start = group->key.objectid;
7485                                 new_extents = kmalloc(sizeof(*new_extents),
7486                                                       GFP_NOFS);
                                if (!new_extents) {
                                        ret = -ENOMEM;
                                        goto out;
                                }
7487                                 nr_extents = 1;
7488                                 ret = get_new_locations(reloc_inode,
7489                                                         extent_key,
7490                                                         group_start, 1,
7491                                                         &new_extents,
7492                                                         &nr_extents);
7493                                 if (ret)
7494                                         goto out;
7495                         }
7496                         ret = replace_one_extent(trans, found_root,
7497                                                 path, extent_key,
7498                                                 &first_key, ref_path,
7499                                                 new_extents, nr_extents);
7500                 } else {
7501                         ret = relocate_tree_block(trans, found_root, path,
7502                                                   &first_key, ref_path);
7503                 }
7504                 if (ret < 0)
7505                         goto out;
7506         }
7507         ret = 0;
7508 out:
7509         btrfs_end_transaction(trans, extent_root);
7510         kfree(new_extents);
7511         kfree(ref_path);
7512         return ret;
7513 }
7514 #endif
7515
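/*
 * pick the block group profile appropriate for the current number of
 * writeable devices: with a single device, raid0 degrades to single and
 * raid1/raid10 degrade to dup; with multiple devices, dup is upgraded
 * to raid1 and single to raid0.
 */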
7516 static u64 update_block_group_flags(struct btrfs_root *root, u64 flags)
7517 {
7518         u64 num_devices;
7519         u64 stripped = BTRFS_BLOCK_GROUP_RAID0 |
7520                 BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10;
7521
7522         num_devices = root->fs_info->fs_devices->rw_devices;
7523         if (num_devices == 1) {
7524                 stripped |= BTRFS_BLOCK_GROUP_DUP;
7525                 stripped = flags & ~stripped;
7526
7527                 /* turn raid0 into single device chunks */
7528                 if (flags & BTRFS_BLOCK_GROUP_RAID0)
7529                         return stripped;
7530
7531                 /* turn mirroring into duplication */
7532                 if (flags & (BTRFS_BLOCK_GROUP_RAID1 |
7533                              BTRFS_BLOCK_GROUP_RAID10))
7534                         return stripped | BTRFS_BLOCK_GROUP_DUP;
7535                 return flags;
7536         } else {
7537                 /* they already had raid on here, just return */
7538                 if (flags & stripped)
7539                         return flags;
7540
7541                 stripped |= BTRFS_BLOCK_GROUP_DUP;
7542                 stripped = flags & ~stripped;
7543
7544                 /* switch duplicated blocks with raid1 */
7545                 if (flags & BTRFS_BLOCK_GROUP_DUP)
7546                         return stripped | BTRFS_BLOCK_GROUP_RAID1;
7547
7548                 /* turn single device chunks into raid0 */
7549                 return stripped | BTRFS_BLOCK_GROUP_RAID0;
7550         }
7551         return flags;
7552 }
7553
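/*
 * try to mark @cache read only.  this only succeeds if the space info
 * has room to absorb the group's unused bytes as readonly space;
 * otherwise -ENOSPC is returned and the caller may allocate a new chunk
 * and retry, as btrfs_set_block_group_ro() below does.
 */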
7554 static int set_block_group_ro(struct btrfs_block_group_cache *cache)
7555 {
7556         struct btrfs_space_info *sinfo = cache->space_info;
7557         u64 num_bytes;
7558         int ret = -ENOSPC;
7559
7560         if (cache->ro)
7561                 return 0;
7562
7563         spin_lock(&sinfo->lock);
7564         spin_lock(&cache->lock);
7565         num_bytes = cache->key.offset - cache->reserved - cache->pinned -
7566                     cache->bytes_super - btrfs_block_group_used(&cache->item);
7567
7568         if (sinfo->bytes_used + sinfo->bytes_reserved + sinfo->bytes_pinned +
7569             sinfo->bytes_may_use + sinfo->bytes_readonly +
7570             cache->reserved_pinned + num_bytes < sinfo->total_bytes) {
7571                 sinfo->bytes_readonly += num_bytes;
7572                 sinfo->bytes_reserved += cache->reserved_pinned;
7573                 cache->reserved_pinned = 0;
7574                 cache->ro = 1;
7575                 ret = 0;
7576         }
7577         spin_unlock(&cache->lock);
7578         spin_unlock(&sinfo->lock);
7579         return ret;
7580 }
7581
7582 int btrfs_set_block_group_ro(struct btrfs_root *root,
7583                              struct btrfs_block_group_cache *cache)
7585 {
7586         struct btrfs_trans_handle *trans;
7587         u64 alloc_flags;
7588         int ret;
7589
7590         BUG_ON(cache->ro);
7591
7592         trans = btrfs_join_transaction(root, 1);
7593         BUG_ON(IS_ERR(trans));
7594
7595         alloc_flags = update_block_group_flags(root, cache->flags);
7596         if (alloc_flags != cache->flags)
7597                 do_chunk_alloc(trans, root, 2 * 1024 * 1024, alloc_flags, 1);
7598
7599         ret = set_block_group_ro(cache);
7600         if (!ret)
7601                 goto out;
7602         alloc_flags = get_alloc_profile(root, cache->space_info->flags);
7603         ret = do_chunk_alloc(trans, root, 2 * 1024 * 1024, alloc_flags, 1);
7604         if (ret < 0)
7605                 goto out;
7606         ret = set_block_group_ro(cache);
7607 out:
7608         btrfs_end_transaction(trans, root);
7609         return ret;
7610 }
7611
7612 int btrfs_set_block_group_rw(struct btrfs_root *root,
7613                               struct btrfs_block_group_cache *cache)
7614 {
7615         struct btrfs_space_info *sinfo = cache->space_info;
7616         u64 num_bytes;
7617
7618         BUG_ON(!cache->ro);
7619
7620         spin_lock(&sinfo->lock);
7621         spin_lock(&cache->lock);
7622         num_bytes = cache->key.offset - cache->reserved - cache->pinned -
7623                     cache->bytes_super - btrfs_block_group_used(&cache->item);
7624         sinfo->bytes_readonly -= num_bytes;
7625         cache->ro = 0;
7626         spin_unlock(&cache->lock);
7627         spin_unlock(&sinfo->lock);
7628         return 0;
7629 }
7630
7631 /*
7632  * checks to see if it's even possible to relocate this block group.
7633  *
7634  * @return - -1 if it's not a good idea to relocate this block group, 0 if
7635  * it's ok to go ahead and try.
7636  */
7637 int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr)
7638 {
7639         struct btrfs_block_group_cache *block_group;
7640         struct btrfs_space_info *space_info;
7641         struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices;
7642         struct btrfs_device *device;
7643         int full = 0;
7644         int ret = 0;
7645
7646         block_group = btrfs_lookup_block_group(root->fs_info, bytenr);
7647
7648         /* odd, couldn't find the block group, leave it alone */
7649         if (!block_group)
7650                 return -1;
7651
7652         /* no bytes used, we're good */
7653         if (!btrfs_block_group_used(&block_group->item))
7654                 goto out;
7655
7656         space_info = block_group->space_info;
7657         spin_lock(&space_info->lock);
7658
7659         full = space_info->full;
7660
7661         /*
7662          * if this is the last block group we have in this space, we can't
7663          * relocate it unless we're able to allocate a new chunk below.
7664          *
7665          * Otherwise, we need to make sure we have room in the space to handle
7666          * all of the extents from this block group.  If we can, we're good.
7667          */
7668         if ((space_info->total_bytes != block_group->key.offset) &&
7669            (space_info->bytes_used + space_info->bytes_reserved +
7670             space_info->bytes_pinned + space_info->bytes_readonly +
7671             btrfs_block_group_used(&block_group->item) <
7672             space_info->total_bytes)) {
7673                 spin_unlock(&space_info->lock);
7674                 goto out;
7675         }
7676         spin_unlock(&space_info->lock);
7677
7678         /*
7679          * ok we don't have enough space, but maybe we have free space on our
7680          * devices to allocate new chunks for relocation, so loop through our
7681          * alloc devices and guess if we have enough space.  However, if we
7682          * were marked as full, then we know there aren't enough chunks, and we
7683          * can just return.
7684          */
7685         ret = -1;
7686         if (full)
7687                 goto out;
7688
7689         mutex_lock(&root->fs_info->chunk_mutex);
7690         list_for_each_entry(device, &fs_devices->alloc_list, dev_alloc_list) {
7691                 u64 min_free = btrfs_block_group_used(&block_group->item);
7692                 u64 dev_offset, max_avail;
7693
7694                 /*
7695                  * check to make sure we can actually find a chunk with enough
7696                  * space to fit our block group in.
7697                  */
7698                 if (device->total_bytes > device->bytes_used + min_free) {
7699                         ret = find_free_dev_extent(NULL, device, min_free,
7700                                                    &dev_offset, &max_avail);
7701                         if (!ret)
7702                                 break;
7703                         ret = -1;
7704                 }
7705         }
7706         mutex_unlock(&root->fs_info->chunk_mutex);
7707 out:
7708         btrfs_put_block_group(block_group);
7709         return ret;
7710 }
7711
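/*
 * search the extent tree from @key onwards and leave @path pointing at
 * the first BLOCK_GROUP_ITEM found, returning 0 on success and a
 * positive value when there are no more block group items.
 */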
7712 static int find_first_block_group(struct btrfs_root *root,
7713                 struct btrfs_path *path, struct btrfs_key *key)
7714 {
7715         int ret = 0;
7716         struct btrfs_key found_key;
7717         struct extent_buffer *leaf;
7718         int slot;
7719
7720         ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
7721         if (ret < 0)
7722                 goto out;
7723
7724         while (1) {
7725                 slot = path->slots[0];
7726                 leaf = path->nodes[0];
7727                 if (slot >= btrfs_header_nritems(leaf)) {
7728                         ret = btrfs_next_leaf(root, path);
7729                         if (ret == 0)
7730                                 continue;
7731                         if (ret < 0)
7732                                 goto out;
7733                         break;
7734                 }
7735                 btrfs_item_key_to_cpu(leaf, &found_key, slot);
7736
7737                 if (found_key.objectid >= key->objectid &&
7738                     found_key.type == BTRFS_BLOCK_GROUP_ITEM_KEY) {
7739                         ret = 0;
7740                         goto out;
7741                 }
7742                 path->slots[0]++;
7743         }
7744 out:
7745         return ret;
7746 }
7747
7748 int btrfs_free_block_groups(struct btrfs_fs_info *info)
7749 {
7750         struct btrfs_block_group_cache *block_group;
7751         struct btrfs_space_info *space_info;
7752         struct btrfs_caching_control *caching_ctl;
7753         struct rb_node *n;
7754
7755         down_write(&info->extent_commit_sem);
7756         while (!list_empty(&info->caching_block_groups)) {
7757                 caching_ctl = list_entry(info->caching_block_groups.next,
7758                                          struct btrfs_caching_control, list);
7759                 list_del(&caching_ctl->list);
7760                 put_caching_control(caching_ctl);
7761         }
7762         up_write(&info->extent_commit_sem);
7763
7764         spin_lock(&info->block_group_cache_lock);
7765         while ((n = rb_last(&info->block_group_cache_tree)) != NULL) {
7766                 block_group = rb_entry(n, struct btrfs_block_group_cache,
7767                                        cache_node);
7768                 rb_erase(&block_group->cache_node,
7769                          &info->block_group_cache_tree);
7770                 spin_unlock(&info->block_group_cache_lock);
7771
7772                 down_write(&block_group->space_info->groups_sem);
7773                 list_del(&block_group->list);
7774                 up_write(&block_group->space_info->groups_sem);
7775
7776                 if (block_group->cached == BTRFS_CACHE_STARTED)
7777                         wait_block_group_cache_done(block_group);
7778
7779                 btrfs_remove_free_space_cache(block_group);
7780                 btrfs_put_block_group(block_group);
7781
7782                 spin_lock(&info->block_group_cache_lock);
7783         }
7784         spin_unlock(&info->block_group_cache_lock);
7785
7786         /* now that all the block groups are freed, go through and
7787          * free all the space_info structs.  This is only called during
7788          * the final stages of unmount, and so we know nobody is
7789          * using them.  We call synchronize_rcu() once before we start,
7790          * just to be on the safe side.
7791          */
7792         synchronize_rcu();
7793
7794         while (!list_empty(&info->space_info)) {
7795                 space_info = list_entry(info->space_info.next,
7796                                         struct btrfs_space_info,
7797                                         list);
7798                 if (space_info->bytes_pinned > 0 ||
7799                     space_info->bytes_reserved > 0) {
7800                         WARN_ON(1);
7801                         dump_space_info(space_info, 0, 0);
7802                 }
7803                 list_del(&space_info->list);
7804                 kfree(space_info);
7805         }
7806         return 0;
7807 }
7808
7809 static void __link_block_group(struct btrfs_space_info *space_info,
7810                                struct btrfs_block_group_cache *cache)
7811 {
7812         int index = get_block_group_index(cache);
7813
7814         down_write(&space_info->groups_sem);
7815         list_add_tail(&cache->list, &space_info->block_groups[index]);
7816         up_write(&space_info->groups_sem);
7817 }
7818
7819 int btrfs_read_block_groups(struct btrfs_root *root)
7820 {
7821         struct btrfs_path *path;
7822         int ret;
7823         struct btrfs_block_group_cache *cache;
7824         struct btrfs_fs_info *info = root->fs_info;
7825         struct btrfs_space_info *space_info;
7826         struct btrfs_key key;
7827         struct btrfs_key found_key;
7828         struct extent_buffer *leaf;
7829
7830         root = info->extent_root;
7831         key.objectid = 0;
7832         key.offset = 0;
7833         btrfs_set_key_type(&key, BTRFS_BLOCK_GROUP_ITEM_KEY);
7834         path = btrfs_alloc_path();
7835         if (!path)
7836                 return -ENOMEM;
7837
7838         while (1) {
7839                 ret = find_first_block_group(root, path, &key);
7840                 if (ret > 0)
7841                         break;
7842                 if (ret != 0)
7843                         goto error;
7844
7845                 leaf = path->nodes[0];
7846                 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
7847                 cache = kzalloc(sizeof(*cache), GFP_NOFS);
7848                 if (!cache) {
7849                         ret = -ENOMEM;
7850                         goto error;
7851                 }
7852
7853                 atomic_set(&cache->count, 1);
7854                 spin_lock_init(&cache->lock);
7855                 spin_lock_init(&cache->tree_lock);
7856                 cache->fs_info = info;
7857                 INIT_LIST_HEAD(&cache->list);
7858                 INIT_LIST_HEAD(&cache->cluster_list);
7859
7860                 /*
7861                  * we only want to have 32k of ram per block group for keeping
7862                  * track of free space, and if we pass 1/2 of that we want to
7863                  * start converting things over to using bitmaps
7864                  */
7865                 cache->extents_thresh = ((1024 * 32) / 2) /
7866                         sizeof(struct btrfs_free_space);
7867
7868                 read_extent_buffer(leaf, &cache->item,
7869                                    btrfs_item_ptr_offset(leaf, path->slots[0]),
7870                                    sizeof(cache->item));
7871                 memcpy(&cache->key, &found_key, sizeof(found_key));
7872
7873                 key.objectid = found_key.objectid + found_key.offset;
7874                 btrfs_release_path(root, path);
7875                 cache->flags = btrfs_block_group_flags(&cache->item);
7876                 cache->sectorsize = root->sectorsize;
7877
7878                 /*
7879                  * check for two cases, either we are full, and therefore
7880                  * don't need to bother with the caching work since we won't
7881                  * find any space, or we are empty, and we can just add all
7882                  * the space in and be done with it.  This saves us a lot of
7883                  * time, particularly in the full case.
7884                  */
7885                 if (found_key.offset == btrfs_block_group_used(&cache->item)) {
7886                         exclude_super_stripes(root, cache);
7887                         cache->last_byte_to_unpin = (u64)-1;
7888                         cache->cached = BTRFS_CACHE_FINISHED;
7889                         free_excluded_extents(root, cache);
7890                 } else if (btrfs_block_group_used(&cache->item) == 0) {
7891                         exclude_super_stripes(root, cache);
7892                         cache->last_byte_to_unpin = (u64)-1;
7893                         cache->cached = BTRFS_CACHE_FINISHED;
7894                         add_new_free_space(cache, root->fs_info,
7895                                            found_key.objectid,
7896                                            found_key.objectid +
7897                                            found_key.offset);
7898                         free_excluded_extents(root, cache);
7899                 }
7900
7901                 ret = update_space_info(info, cache->flags, found_key.offset,
7902                                         btrfs_block_group_used(&cache->item),
7903                                         &space_info);
7904                 BUG_ON(ret);
7905                 cache->space_info = space_info;
7906                 spin_lock(&cache->space_info->lock);
7907                 cache->space_info->bytes_readonly += cache->bytes_super;
7908                 spin_unlock(&cache->space_info->lock);
7909
7910                 __link_block_group(space_info, cache);
7911
7912                 ret = btrfs_add_block_group_cache(root->fs_info, cache);
7913                 BUG_ON(ret);
7914
7915                 set_avail_alloc_bits(root->fs_info, cache->flags);
7916                 if (btrfs_chunk_readonly(root, cache->key.objectid))
7917                         set_block_group_ro(cache);
7918         }
7919
7920         list_for_each_entry_rcu(space_info, &root->fs_info->space_info, list) {
7921                 if (!(get_alloc_profile(root, space_info->flags) &
7922                       (BTRFS_BLOCK_GROUP_RAID10 |
7923                        BTRFS_BLOCK_GROUP_RAID1 |
7924                        BTRFS_BLOCK_GROUP_DUP)))
7925                         continue;
7926                 /*
7927                  * avoid allocating from un-mirrored block groups if there are
7928                  * mirrored block groups.
7929                  */
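                /*
                 * note: the list indices come from get_block_group_index();
                 * lists 3 and 4 are assumed here to be the raid0 and
                 * single (unstriped) block groups.
                 */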
7930                 list_for_each_entry(cache, &space_info->block_groups[3], list)
7931                         set_block_group_ro(cache);
7932                 list_for_each_entry(cache, &space_info->block_groups[4], list)
7933                         set_block_group_ro(cache);
7934         }
7935
7936         init_global_block_rsv(info);
7937         ret = 0;
7938 error:
7939         btrfs_free_path(path);
7940         return ret;
7941 }
7942
7943 int btrfs_make_block_group(struct btrfs_trans_handle *trans,
7944                            struct btrfs_root *root, u64 bytes_used,
7945                            u64 type, u64 chunk_objectid, u64 chunk_offset,
7946                            u64 size)
7947 {
7948         int ret;
7949         struct btrfs_root *extent_root;
7950         struct btrfs_block_group_cache *cache;
7951
7952         extent_root = root->fs_info->extent_root;
7953
7954         root->fs_info->last_trans_log_full_commit = trans->transid;
7955
7956         cache = kzalloc(sizeof(*cache), GFP_NOFS);
7957         if (!cache)
7958                 return -ENOMEM;
7959
7960         cache->key.objectid = chunk_offset;
7961         cache->key.offset = size;
7962         cache->key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
7963         cache->sectorsize = root->sectorsize;
7964
7965         /*
7966          * we only want to have 32k of ram per block group for keeping track
7967          * of free space, and if we pass 1/2 of that we want to start
7968          * converting things over to using bitmaps
7969          */
7970         cache->extents_thresh = ((1024 * 32) / 2) /
7971                 sizeof(struct btrfs_free_space);
7972         atomic_set(&cache->count, 1);
7973         spin_lock_init(&cache->lock);
7974         spin_lock_init(&cache->tree_lock);
7975         INIT_LIST_HEAD(&cache->list);
7976         INIT_LIST_HEAD(&cache->cluster_list);
7977
7978         btrfs_set_block_group_used(&cache->item, bytes_used);
7979         btrfs_set_block_group_chunk_objectid(&cache->item, chunk_objectid);
7980         cache->flags = type;
7981         btrfs_set_block_group_flags(&cache->item, type);
7982
7983         cache->last_byte_to_unpin = (u64)-1;
7984         cache->cached = BTRFS_CACHE_FINISHED;
7985         exclude_super_stripes(root, cache);
7986
7987         add_new_free_space(cache, root->fs_info, chunk_offset,
7988                            chunk_offset + size);
7989
7990         free_excluded_extents(root, cache);
7991
7992         ret = update_space_info(root->fs_info, cache->flags, size, bytes_used,
7993                                 &cache->space_info);
7994         BUG_ON(ret);
7995
7996         spin_lock(&cache->space_info->lock);
7997         cache->space_info->bytes_readonly += cache->bytes_super;
7998         spin_unlock(&cache->space_info->lock);
7999
8000         __link_block_group(cache->space_info, cache);
8001
8002         ret = btrfs_add_block_group_cache(root->fs_info, cache);
8003         BUG_ON(ret);
8004
8005         ret = btrfs_insert_item(trans, extent_root, &cache->key, &cache->item,
8006                                 sizeof(cache->item));
8007         BUG_ON(ret);
8008
8009         set_avail_alloc_bits(extent_root->fs_info, type);
8010
8011         return 0;
8012 }
8013
8014 int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
8015                              struct btrfs_root *root, u64 group_start)
8016 {
8017         struct btrfs_path *path;
8018         struct btrfs_block_group_cache *block_group;
8019         struct btrfs_free_cluster *cluster;
8020         struct btrfs_key key;
8021         int ret;
8022
8023         root = root->fs_info->extent_root;
8024
8025         block_group = btrfs_lookup_block_group(root->fs_info, group_start);
8026         BUG_ON(!block_group);
8027         BUG_ON(!block_group->ro);
8028
8029         memcpy(&key, &block_group->key, sizeof(key));
8030
8031         /* make sure this block group isn't part of an allocation cluster */
8032         cluster = &root->fs_info->data_alloc_cluster;
8033         spin_lock(&cluster->refill_lock);
8034         btrfs_return_cluster_to_free_space(block_group, cluster);
8035         spin_unlock(&cluster->refill_lock);
8036
8037         /*
8038          * make sure this block group isn't part of a metadata
8039          * allocation cluster
8040          */
8041         cluster = &root->fs_info->meta_alloc_cluster;
8042         spin_lock(&cluster->refill_lock);
8043         btrfs_return_cluster_to_free_space(block_group, cluster);
8044         spin_unlock(&cluster->refill_lock);
8045
8046         path = btrfs_alloc_path();
8047         BUG_ON(!path);
8048
8049         spin_lock(&root->fs_info->block_group_cache_lock);
8050         rb_erase(&block_group->cache_node,
8051                  &root->fs_info->block_group_cache_tree);
8052         spin_unlock(&root->fs_info->block_group_cache_lock);
8053
8054         down_write(&block_group->space_info->groups_sem);
8055         /*
8056          * we must use list_del_init so people can check to see if they
8057          * are still on the list after taking the semaphore
8058          */
8059         list_del_init(&block_group->list);
8060         up_write(&block_group->space_info->groups_sem);
8061
8062         if (block_group->cached == BTRFS_CACHE_STARTED)
8063                 wait_block_group_cache_done(block_group);
8064
8065         btrfs_remove_free_space_cache(block_group);
8066
8067         spin_lock(&block_group->space_info->lock);
8068         block_group->space_info->total_bytes -= block_group->key.offset;
8069         block_group->space_info->bytes_readonly -= block_group->key.offset;
8070         spin_unlock(&block_group->space_info->lock);
8071
8072         btrfs_clear_space_info_full(root->fs_info);
8073
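        /*
         * drop both remaining references: the one taken by
         * btrfs_lookup_block_group() above and the one held by the
         * block group cache rbtree that was erased earlier.
         */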
8074         btrfs_put_block_group(block_group);
8075         btrfs_put_block_group(block_group);
8076
8077         ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
8078         if (ret > 0)
8079                 ret = -EIO;
8080         if (ret < 0)
8081                 goto out;
8082
8083         ret = btrfs_del_item(trans, root, path);
8084 out:
8085         btrfs_free_path(path);
8086         return ret;
8087 }