/*
 * Copyright (C) 2007 Oracle.  All rights reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public
 * License v2 as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public
 * License along with this program; if not, write to the
 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
 * Boston, MA 02111-1307, USA.
 */
#include <linux/sched.h>
#include <linux/pagemap.h>
#include <linux/writeback.h>
#include <linux/blkdev.h>
#include <linux/sort.h>
#include <linux/rcupdate.h>
#include <linux/kthread.h>
#include <linux/slab.h>
#include "compat.h"
#include "hash.h"
#include "ctree.h"
#include "disk-io.h"
#include "print-tree.h"
#include "transaction.h"
#include "volumes.h"
#include "locking.h"
#include "free-space-cache.h"

static int update_block_group(struct btrfs_trans_handle *trans,
                              struct btrfs_root *root,
                              u64 bytenr, u64 num_bytes, int alloc);
static int update_reserved_bytes(struct btrfs_block_group_cache *cache,
                                 u64 num_bytes, int reserve, int sinfo);
static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
                                struct btrfs_root *root,
                                u64 bytenr, u64 num_bytes, u64 parent,
                                u64 root_objectid, u64 owner_objectid,
                                u64 owner_offset, int refs_to_drop,
                                struct btrfs_delayed_extent_op *extra_op);
static void __run_delayed_extent_op(struct btrfs_delayed_extent_op *extent_op,
                                    struct extent_buffer *leaf,
                                    struct btrfs_extent_item *ei);
static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
                                      struct btrfs_root *root,
                                      u64 parent, u64 root_objectid,
                                      u64 flags, u64 owner, u64 offset,
                                      struct btrfs_key *ins, int ref_mod);
static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
                                     struct btrfs_root *root,
                                     u64 parent, u64 root_objectid,
                                     u64 flags, struct btrfs_disk_key *key,
                                     int level, struct btrfs_key *ins);
static int do_chunk_alloc(struct btrfs_trans_handle *trans,
                          struct btrfs_root *extent_root, u64 alloc_bytes,
                          u64 flags, int force);
static int find_next_key(struct btrfs_path *path, int level,
                         struct btrfs_key *key);
static void dump_space_info(struct btrfs_space_info *info, u64 bytes,
                            int dump_block_groups);
static int maybe_allocate_chunk(struct btrfs_trans_handle *trans,
                                struct btrfs_root *root,
                                struct btrfs_space_info *sinfo, u64 num_bytes);
static int shrink_delalloc(struct btrfs_trans_handle *trans,
                           struct btrfs_root *root,
                           struct btrfs_space_info *sinfo, u64 to_reclaim);

static noinline int
block_group_cache_done(struct btrfs_block_group_cache *cache)
{
        smp_mb();
        return cache->cached == BTRFS_CACHE_FINISHED;
}

static int block_group_bits(struct btrfs_block_group_cache *cache, u64 bits)
{
        return (cache->flags & bits) == bits;
}

void btrfs_get_block_group(struct btrfs_block_group_cache *cache)
{
        atomic_inc(&cache->count);
}

void btrfs_put_block_group(struct btrfs_block_group_cache *cache)
{
        if (atomic_dec_and_test(&cache->count)) {
                WARN_ON(cache->pinned > 0);
                WARN_ON(cache->reserved > 0);
                WARN_ON(cache->reserved_pinned > 0);
                kfree(cache);
        }
}

/*
 * this adds the block group to the fs_info rb tree for the block group
 * cache
 */
static int btrfs_add_block_group_cache(struct btrfs_fs_info *info,
                                struct btrfs_block_group_cache *block_group)
{
        struct rb_node **p;
        struct rb_node *parent = NULL;
        struct btrfs_block_group_cache *cache;

        spin_lock(&info->block_group_cache_lock);
        p = &info->block_group_cache_tree.rb_node;

        while (*p) {
                parent = *p;
                cache = rb_entry(parent, struct btrfs_block_group_cache,
                                 cache_node);
                if (block_group->key.objectid < cache->key.objectid) {
                        p = &(*p)->rb_left;
                } else if (block_group->key.objectid > cache->key.objectid) {
                        p = &(*p)->rb_right;
                } else {
                        spin_unlock(&info->block_group_cache_lock);
                        return -EEXIST;
                }
        }

        rb_link_node(&block_group->cache_node, parent, p);
        rb_insert_color(&block_group->cache_node,
                        &info->block_group_cache_tree);
        spin_unlock(&info->block_group_cache_lock);

        return 0;
}

/*
 * This will return the block group at or after bytenr if contains is 0, else
 * it will return the block group that contains the bytenr
 */
static struct btrfs_block_group_cache *
block_group_cache_tree_search(struct btrfs_fs_info *info, u64 bytenr,
                              int contains)
{
        struct btrfs_block_group_cache *cache, *ret = NULL;
        struct rb_node *n;
        u64 end, start;

        spin_lock(&info->block_group_cache_lock);
        n = info->block_group_cache_tree.rb_node;

        while (n) {
                cache = rb_entry(n, struct btrfs_block_group_cache,
                                 cache_node);
                end = cache->key.objectid + cache->key.offset - 1;
                start = cache->key.objectid;

                if (bytenr < start) {
                        if (!contains && (!ret || start < ret->key.objectid))
                                ret = cache;
                        n = n->rb_left;
                } else if (bytenr > start) {
                        if (contains && bytenr <= end) {
                                ret = cache;
                                break;
                        }
                        n = n->rb_right;
                } else {
                        ret = cache;
                        break;
                }
        }
        if (ret)
                btrfs_get_block_group(ret);
        spin_unlock(&info->block_group_cache_lock);

        return ret;
}

static int add_excluded_extent(struct btrfs_root *root,
                               u64 start, u64 num_bytes)
{
        u64 end = start + num_bytes - 1;
        set_extent_bits(&root->fs_info->freed_extents[0],
                        start, end, EXTENT_UPTODATE, GFP_NOFS);
        set_extent_bits(&root->fs_info->freed_extents[1],
                        start, end, EXTENT_UPTODATE, GFP_NOFS);
        return 0;
}

static void free_excluded_extents(struct btrfs_root *root,
                                  struct btrfs_block_group_cache *cache)
{
        u64 start, end;

        start = cache->key.objectid;
        end = start + cache->key.offset - 1;

        clear_extent_bits(&root->fs_info->freed_extents[0],
                          start, end, EXTENT_UPTODATE, GFP_NOFS);
        clear_extent_bits(&root->fs_info->freed_extents[1],
                          start, end, EXTENT_UPTODATE, GFP_NOFS);
}

static int exclude_super_stripes(struct btrfs_root *root,
                                 struct btrfs_block_group_cache *cache)
{
        u64 bytenr;
        u64 *logical;
        int stripe_len;
        int i, nr, ret;

        if (cache->key.objectid < BTRFS_SUPER_INFO_OFFSET) {
                stripe_len = BTRFS_SUPER_INFO_OFFSET - cache->key.objectid;
                cache->bytes_super += stripe_len;
                ret = add_excluded_extent(root, cache->key.objectid,
                                          stripe_len);
                BUG_ON(ret);
        }

        for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
                bytenr = btrfs_sb_offset(i);
                ret = btrfs_rmap_block(&root->fs_info->mapping_tree,
                                       cache->key.objectid, bytenr,
                                       0, &logical, &nr, &stripe_len);
                BUG_ON(ret);

                while (nr--) {
                        cache->bytes_super += stripe_len;
                        ret = add_excluded_extent(root, logical[nr],
                                                  stripe_len);
                        BUG_ON(ret);
                }

                kfree(logical);
        }
        return 0;
}

static struct btrfs_caching_control *
get_caching_control(struct btrfs_block_group_cache *cache)
{
        struct btrfs_caching_control *ctl;

        spin_lock(&cache->lock);
        if (cache->cached != BTRFS_CACHE_STARTED) {
                spin_unlock(&cache->lock);
                return NULL;
        }

        ctl = cache->caching_ctl;
        atomic_inc(&ctl->count);
        spin_unlock(&cache->lock);
        return ctl;
}

static void put_caching_control(struct btrfs_caching_control *ctl)
{
        if (atomic_dec_and_test(&ctl->count))
                kfree(ctl);
}

/*
 * This is only called by cache_block_group.  Since we could have freed
 * extents in this transaction, we need to check pinned_extents for any
 * extents that can't be used yet, because their free space is only
 * released once the transaction commits.
 */
static u64 add_new_free_space(struct btrfs_block_group_cache *block_group,
                              struct btrfs_fs_info *info, u64 start, u64 end)
{
        u64 extent_start, extent_end, size, total_added = 0;
        int ret;

        while (start < end) {
                ret = find_first_extent_bit(info->pinned_extents, start,
                                            &extent_start, &extent_end,
                                            EXTENT_DIRTY | EXTENT_UPTODATE);
                if (ret)
                        break;

                if (extent_start <= start) {
                        start = extent_end + 1;
                } else if (extent_start > start && extent_start < end) {
                        size = extent_start - start;
                        total_added += size;
                        ret = btrfs_add_free_space(block_group, start,
                                                   size);
                        BUG_ON(ret);
                        start = extent_end + 1;
                } else {
                        break;
                }
        }

        if (start < end) {
                size = end - start;
                total_added += size;
                ret = btrfs_add_free_space(block_group, start, size);
                BUG_ON(ret);
        }

        return total_added;
}

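/*
 * Worked example (editor's illustration, hypothetical values): caching a
 * block group spanning [0, 100) while pinned/uptodate extents cover
 * [20, 40) and [60, 70) adds three free space ranges,
 *
 *     [0, 20), [40, 60) and [70, 100)
 *
 * and returns total_added == 70.  The pinned ranges are skipped because
 * their space only becomes usable once the transaction that freed them
 * commits.
 */
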
static int caching_kthread(void *data)
{
        struct btrfs_block_group_cache *block_group = data;
        struct btrfs_fs_info *fs_info = block_group->fs_info;
        struct btrfs_caching_control *caching_ctl = block_group->caching_ctl;
        struct btrfs_root *extent_root = fs_info->extent_root;
        struct btrfs_path *path;
        struct extent_buffer *leaf;
        struct btrfs_key key;
        u64 total_found = 0;
        u64 last = 0;
        u32 nritems;
        int ret = 0;

        path = btrfs_alloc_path();
        if (!path)
                return -ENOMEM;

        exclude_super_stripes(extent_root, block_group);
        spin_lock(&block_group->space_info->lock);
        block_group->space_info->bytes_readonly += block_group->bytes_super;
        spin_unlock(&block_group->space_info->lock);

        last = max_t(u64, block_group->key.objectid, BTRFS_SUPER_INFO_OFFSET);

        /*
         * We don't want to deadlock with somebody trying to allocate a new
         * extent for the extent root while also trying to search the extent
         * root to add free space.  So we skip locking and search the commit
         * root, since it's read-only.
         */
        path->skip_locking = 1;
        path->search_commit_root = 1;
        path->reada = 2;

        key.objectid = last;
        key.offset = 0;
        key.type = BTRFS_EXTENT_ITEM_KEY;
again:
        mutex_lock(&caching_ctl->mutex);
        /* need to make sure the commit_root doesn't disappear */
        down_read(&fs_info->extent_commit_sem);

        ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
        if (ret < 0)
                goto err;

        leaf = path->nodes[0];
        nritems = btrfs_header_nritems(leaf);

        while (1) {
                smp_mb();
                if (fs_info->closing > 1) {
                        last = (u64)-1;
                        break;
                }

                if (path->slots[0] < nritems) {
                        btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
                } else {
                        ret = find_next_key(path, 0, &key);
                        if (ret)
                                break;

                        caching_ctl->progress = last;
                        btrfs_release_path(extent_root, path);
                        up_read(&fs_info->extent_commit_sem);
                        mutex_unlock(&caching_ctl->mutex);
                        if (btrfs_transaction_in_commit(fs_info))
                                schedule_timeout(1);
                        else
                                cond_resched();
                        goto again;
                }

                if (key.objectid < block_group->key.objectid) {
                        path->slots[0]++;
                        continue;
                }

                if (key.objectid >= block_group->key.objectid +
                    block_group->key.offset)
                        break;

                if (key.type == BTRFS_EXTENT_ITEM_KEY) {
                        total_found += add_new_free_space(block_group,
                                                          fs_info, last,
                                                          key.objectid);
                        last = key.objectid + key.offset;

                        if (total_found > (1024 * 1024 * 2)) {
                                total_found = 0;
                                wake_up(&caching_ctl->wait);
                        }
                }
                path->slots[0]++;
        }
        ret = 0;

        total_found += add_new_free_space(block_group, fs_info, last,
                                          block_group->key.objectid +
                                          block_group->key.offset);
        caching_ctl->progress = (u64)-1;

        spin_lock(&block_group->lock);
        block_group->caching_ctl = NULL;
        block_group->cached = BTRFS_CACHE_FINISHED;
        spin_unlock(&block_group->lock);

err:
        btrfs_free_path(path);
        up_read(&fs_info->extent_commit_sem);

        free_excluded_extents(extent_root, block_group);

        mutex_unlock(&caching_ctl->mutex);
        wake_up(&caching_ctl->wait);

        put_caching_control(caching_ctl);
        atomic_dec(&block_group->space_info->caching_threads);
        btrfs_put_block_group(block_group);

        return 0;
}

static int cache_block_group(struct btrfs_block_group_cache *cache)
{
        struct btrfs_fs_info *fs_info = cache->fs_info;
        struct btrfs_caching_control *caching_ctl;
        struct task_struct *tsk;
        int ret = 0;

        smp_mb();
        if (cache->cached != BTRFS_CACHE_NO)
                return 0;

        caching_ctl = kzalloc(sizeof(*caching_ctl), GFP_KERNEL);
        BUG_ON(!caching_ctl);

        INIT_LIST_HEAD(&caching_ctl->list);
        mutex_init(&caching_ctl->mutex);
        init_waitqueue_head(&caching_ctl->wait);
        caching_ctl->block_group = cache;
        caching_ctl->progress = cache->key.objectid;
        /* one for caching kthread, one for caching block group list */
        atomic_set(&caching_ctl->count, 2);

        spin_lock(&cache->lock);
        if (cache->cached != BTRFS_CACHE_NO) {
                spin_unlock(&cache->lock);
                kfree(caching_ctl);
                return 0;
        }
        cache->caching_ctl = caching_ctl;
        cache->cached = BTRFS_CACHE_STARTED;
        spin_unlock(&cache->lock);

        down_write(&fs_info->extent_commit_sem);
        list_add_tail(&caching_ctl->list, &fs_info->caching_block_groups);
        up_write(&fs_info->extent_commit_sem);

        atomic_inc(&cache->space_info->caching_threads);
        btrfs_get_block_group(cache);

        tsk = kthread_run(caching_kthread, cache, "btrfs-cache-%llu\n",
                          cache->key.objectid);
        if (IS_ERR(tsk)) {
                ret = PTR_ERR(tsk);
                printk(KERN_ERR "error running thread %d\n", ret);
                BUG();
        }

        return ret;
}

/*
 * return the block group that starts at or after bytenr
 */
static struct btrfs_block_group_cache *
btrfs_lookup_first_block_group(struct btrfs_fs_info *info, u64 bytenr)
{
        struct btrfs_block_group_cache *cache;

        cache = block_group_cache_tree_search(info, bytenr, 0);

        return cache;
}

/*
 * return the block group that contains the given bytenr
 */
struct btrfs_block_group_cache *btrfs_lookup_block_group(
                                                 struct btrfs_fs_info *info,
                                                 u64 bytenr)
{
        struct btrfs_block_group_cache *cache;

        cache = block_group_cache_tree_search(info, bytenr, 1);

        return cache;
}

static struct btrfs_space_info *__find_space_info(struct btrfs_fs_info *info,
                                                  u64 flags)
{
        struct list_head *head = &info->space_info;
        struct btrfs_space_info *found;

        flags &= BTRFS_BLOCK_GROUP_DATA | BTRFS_BLOCK_GROUP_SYSTEM |
                 BTRFS_BLOCK_GROUP_METADATA;

        rcu_read_lock();
        list_for_each_entry_rcu(found, head, list) {
                if (found->flags == flags) {
                        rcu_read_unlock();
                        return found;
                }
        }
        rcu_read_unlock();
        return NULL;
}

/*
 * after adding space to the filesystem, we need to clear the full flags
 * on all the space infos.
 */
void btrfs_clear_space_info_full(struct btrfs_fs_info *info)
{
        struct list_head *head = &info->space_info;
        struct btrfs_space_info *found;

        rcu_read_lock();
        list_for_each_entry_rcu(found, head, list)
                found->full = 0;
        rcu_read_unlock();
}

static u64 div_factor(u64 num, int factor)
{
        if (factor == 10)
                return num;
        num *= factor;
        do_div(num, 10);
        return num;
}

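/*
 * Example (editor's note): div_factor(num, 9) yields 90% of num, e.g.
 * div_factor(1024 * 1024, 9) == 943718 (integer division truncates).
 * The factor == 10 case is passed through unchanged, skipping the
 * 64-bit division entirely.
 */
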
u64 btrfs_find_block_group(struct btrfs_root *root,
                           u64 search_start, u64 search_hint, int owner)
{
        struct btrfs_block_group_cache *cache;
        u64 used;
        u64 last = max(search_hint, search_start);
        u64 group_start = 0;
        int full_search = 0;
        int factor = 9;
        int wrapped = 0;
again:
        while (1) {
                cache = btrfs_lookup_first_block_group(root->fs_info, last);
                if (!cache)
                        break;

                spin_lock(&cache->lock);
                last = cache->key.objectid + cache->key.offset;
                used = btrfs_block_group_used(&cache->item);

                if ((full_search || !cache->ro) &&
                    block_group_bits(cache, BTRFS_BLOCK_GROUP_METADATA)) {
                        if (used + cache->pinned + cache->reserved <
                            div_factor(cache->key.offset, factor)) {
                                group_start = cache->key.objectid;
                                spin_unlock(&cache->lock);
                                btrfs_put_block_group(cache);
                                goto found;
                        }
                }
                spin_unlock(&cache->lock);
                btrfs_put_block_group(cache);
                cond_resched();
        }
        if (!wrapped) {
                last = search_start;
                wrapped = 1;
                goto again;
        }
        if (!full_search && factor < 10) {
                last = search_start;
                full_search = 1;
                factor = 10;
                goto again;
        }
found:
        return group_start;
}

/* simple helper to search for an existing extent at a given offset */
int btrfs_lookup_extent(struct btrfs_root *root, u64 start, u64 len)
{
        int ret;
        struct btrfs_key key;
        struct btrfs_path *path;

        path = btrfs_alloc_path();
        BUG_ON(!path);
        key.objectid = start;
        key.offset = len;
        btrfs_set_key_type(&key, BTRFS_EXTENT_ITEM_KEY);
        ret = btrfs_search_slot(NULL, root->fs_info->extent_root, &key, path,
                                0, 0);
        btrfs_free_path(path);
        return ret;
}

/*
 * helper function to look up the reference count and flags of an extent.
 *
 * the head node for a delayed ref is used to store the sum of all the
 * reference count modifications queued up in the rbtree. the head
 * node may also store the extent flags to set. This way you can check
 * what the reference count and extent flags will be once all of the
 * delayed refs are processed.
 */
int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans,
                             struct btrfs_root *root, u64 bytenr,
                             u64 num_bytes, u64 *refs, u64 *flags)
{
        struct btrfs_delayed_ref_head *head;
        struct btrfs_delayed_ref_root *delayed_refs;
        struct btrfs_path *path;
        struct btrfs_extent_item *ei;
        struct extent_buffer *leaf;
        struct btrfs_key key;
        u32 item_size;
        u64 num_refs;
        u64 extent_flags;
        int ret;

        path = btrfs_alloc_path();
        if (!path)
                return -ENOMEM;

        key.objectid = bytenr;
        key.type = BTRFS_EXTENT_ITEM_KEY;
        key.offset = num_bytes;
        if (!trans) {
                path->skip_locking = 1;
                path->search_commit_root = 1;
        }
again:
        ret = btrfs_search_slot(trans, root->fs_info->extent_root,
                                &key, path, 0, 0);
        if (ret < 0)
                goto out_free;

        if (ret == 0) {
                leaf = path->nodes[0];
                item_size = btrfs_item_size_nr(leaf, path->slots[0]);
                if (item_size >= sizeof(*ei)) {
                        ei = btrfs_item_ptr(leaf, path->slots[0],
                                            struct btrfs_extent_item);
                        num_refs = btrfs_extent_refs(leaf, ei);
                        extent_flags = btrfs_extent_flags(leaf, ei);
                } else {
#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
                        struct btrfs_extent_item_v0 *ei0;
                        BUG_ON(item_size != sizeof(*ei0));
                        ei0 = btrfs_item_ptr(leaf, path->slots[0],
                                             struct btrfs_extent_item_v0);
                        num_refs = btrfs_extent_refs_v0(leaf, ei0);
                        /* FIXME: this isn't correct for data */
                        extent_flags = BTRFS_BLOCK_FLAG_FULL_BACKREF;
#else
                        BUG();
#endif
                }
                BUG_ON(num_refs == 0);
        } else {
                num_refs = 0;
                extent_flags = 0;
                ret = 0;
        }

        if (!trans)
                goto out;

        delayed_refs = &trans->transaction->delayed_refs;
        spin_lock(&delayed_refs->lock);
        head = btrfs_find_delayed_ref_head(trans, bytenr);
        if (head) {
                if (!mutex_trylock(&head->mutex)) {
                        atomic_inc(&head->node.refs);
                        spin_unlock(&delayed_refs->lock);

                        btrfs_release_path(root->fs_info->extent_root, path);

                        mutex_lock(&head->mutex);
                        mutex_unlock(&head->mutex);
                        btrfs_put_delayed_ref(&head->node);
                        goto again;
                }
                if (head->extent_op && head->extent_op->update_flags)
                        extent_flags |= head->extent_op->flags_to_set;
                else
                        BUG_ON(num_refs == 0);

                num_refs += head->node.ref_mod;
                mutex_unlock(&head->mutex);
        }
        spin_unlock(&delayed_refs->lock);
out:
        WARN_ON(num_refs == 0);
        if (refs)
                *refs = num_refs;
        if (flags)
                *flags = extent_flags;
out_free:
        btrfs_free_path(path);
        return ret;
}

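/*
 * Usage sketch (editor's note): a caller typically does
 *
 *     u64 refs, flags;
 *     ret = btrfs_lookup_extent_info(trans, root, bytenr, num_bytes,
 *                                    &refs, &flags);
 *
 * and, on success, sees the reference count and flags as they will be
 * once the delayed refs queued in this transaction are processed, since
 * num_refs above is the on-disk count plus head->node.ref_mod.
 */
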
/*
 * Back reference rules.  Back refs have three main goals:
 *
 * 1) differentiate between all holders of references to an extent so that
 *    when a reference is dropped we can make sure it was a valid reference
 *    before freeing the extent.
 *
 * 2) Provide enough information to quickly find the holders of an extent
 *    if we notice a given block is corrupted or bad.
 *
 * 3) Make it easy to migrate blocks for FS shrinking or storage pool
 *    maintenance.  This is actually the same as #2, but with a slightly
 *    different use case.
 *
 * There are two kinds of back refs. Implicit back refs are optimized
 * for pointers in non-shared tree blocks. For a given pointer in a block,
 * back refs of this kind provide information about the block's owner tree
 * and the pointer's key. This information allows us to find the block by
 * b-tree searching. Full back refs are for pointers in tree blocks not
 * referenced by their owner trees. The location of the tree block is
 * recorded in the back refs. Actually the full back ref is generic, and
 * can be used in all cases where the implicit back ref is used. The major
 * shortcoming of the full back ref is its overhead. Every time a tree
 * block gets COWed, we have to update the back refs entry for all
 * pointers in it.
 *
 * For a newly allocated tree block, we use implicit back refs for
 * pointers in it. This means most tree related operations only involve
 * implicit back refs. For a tree block created in an old transaction, the
 * only way to drop a reference to it is to COW it. So we can detect the
 * event that a tree block loses its owner tree's reference and do the
 * back refs conversion.
 *
 * When a tree block is COWed through a tree, there are four cases:
 *
 * The reference count of the block is one and the tree is the block's
 * owner tree. Nothing to do in this case.
 *
 * The reference count of the block is one and the tree is not the
 * block's owner tree. In this case, full back refs are used for pointers
 * in the block. Remove these full back refs, add implicit back refs for
 * every pointer in the new block.
 *
 * The reference count of the block is greater than one and the tree is
 * the block's owner tree. In this case, implicit back refs are used for
 * pointers in the block. Add full back refs for every pointer in the
 * block, increase lower level extents' reference counts. The original
 * implicit back refs are carried over to the new block.
 *
 * The reference count of the block is greater than one and the tree is
 * not the block's owner tree. Add implicit back refs for every pointer in
 * the new block, increase lower level extents' reference count.
 *
 * Back reference key composition:
 *
 * The key objectid corresponds to the first byte in the extent,
 * the key type is used to differentiate between types of back refs.
 * There are different meanings of the key offset for different types
 * of back refs.
 *
 * File extents can be referenced by:
 *
 * - multiple snapshots, subvolumes, or different generations in one subvol
 * - different files inside a single subvolume
 * - different offsets inside a file (bookend extents in file.c)
 *
 * The extent ref structure for the implicit back refs has fields for:
 *
 * - objectid of the subvolume root
 * - objectid of the file holding the reference
 * - original offset in the file
 * - how many bookend extents
 *
 * The key offset for the implicit back refs is the hash of the first
 * three fields.
 *
 * The extent ref structure for the full back refs has a field for:
 *
 * - number of pointers in the tree leaf
 *
 * The key offset for the full back refs is the first byte of
 * the tree leaf.
 *
 * When a file extent is allocated, the implicit back refs are used.
 * The fields are filled in:
 *
 *     (root_key.objectid, inode objectid, offset in file, 1)
 *
 * When a file extent is removed during file truncation, we find the
 * corresponding implicit back refs and check the following fields:
 *
 *     (btrfs_header_owner(leaf), inode objectid, offset in file)
 *
 * Btree extents can be referenced by:
 *
 * - Different subvolumes
 *
 * Both the implicit back refs and the full back refs for tree blocks
 * only consist of a key. The key offset for the implicit back refs is
 * the objectid of the block's owner tree. The key offset for the full
 * back refs is the first byte of the parent block.
 *
 * When implicit back refs are used, information about the lowest key and
 * level of the tree block is required. This information is stored in
 * the tree block info structure.
 */

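/*
 * Key composition sketch (editor's illustration of the rules above; it
 * mirrors extent_ref_type() and the lookup helpers below):
 *
 *     tree block, implicit:  (bytenr, BTRFS_TREE_BLOCK_REF_KEY,
 *                             root_objectid)
 *     tree block, full:      (bytenr, BTRFS_SHARED_BLOCK_REF_KEY,
 *                             parent block bytenr)
 *     data extent, implicit: (bytenr, BTRFS_EXTENT_DATA_REF_KEY,
 *                             hash(root_objectid, inode objectid,
 *                                  file offset))
 *     data extent, full:     (bytenr, BTRFS_SHARED_DATA_REF_KEY,
 *                             parent leaf bytenr)
 */
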
#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
static int convert_extent_item_v0(struct btrfs_trans_handle *trans,
                                  struct btrfs_root *root,
                                  struct btrfs_path *path,
                                  u64 owner, u32 extra_size)
{
        struct btrfs_extent_item *item;
        struct btrfs_extent_item_v0 *ei0;
        struct btrfs_extent_ref_v0 *ref0;
        struct btrfs_tree_block_info *bi;
        struct extent_buffer *leaf;
        struct btrfs_key key;
        struct btrfs_key found_key;
        u32 new_size = sizeof(*item);
        u64 refs;
        int ret;

        leaf = path->nodes[0];
        BUG_ON(btrfs_item_size_nr(leaf, path->slots[0]) != sizeof(*ei0));

        btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
        ei0 = btrfs_item_ptr(leaf, path->slots[0],
                             struct btrfs_extent_item_v0);
        refs = btrfs_extent_refs_v0(leaf, ei0);

        if (owner == (u64)-1) {
                while (1) {
                        if (path->slots[0] >= btrfs_header_nritems(leaf)) {
                                ret = btrfs_next_leaf(root, path);
                                if (ret < 0)
                                        return ret;
                                BUG_ON(ret > 0);
                                leaf = path->nodes[0];
                        }
                        btrfs_item_key_to_cpu(leaf, &found_key,
                                              path->slots[0]);
                        BUG_ON(key.objectid != found_key.objectid);
                        if (found_key.type != BTRFS_EXTENT_REF_V0_KEY) {
                                path->slots[0]++;
                                continue;
                        }
                        ref0 = btrfs_item_ptr(leaf, path->slots[0],
                                              struct btrfs_extent_ref_v0);
                        owner = btrfs_ref_objectid_v0(leaf, ref0);
                        break;
                }
        }
        btrfs_release_path(root, path);

        if (owner < BTRFS_FIRST_FREE_OBJECTID)
                new_size += sizeof(*bi);

        new_size -= sizeof(*ei0);
        ret = btrfs_search_slot(trans, root, &key, path,
                                new_size + extra_size, 1);
        if (ret < 0)
                return ret;
        BUG_ON(ret);

        ret = btrfs_extend_item(trans, root, path, new_size);
        BUG_ON(ret);

        leaf = path->nodes[0];
        item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
        btrfs_set_extent_refs(leaf, item, refs);
        /* FIXME: get real generation */
        btrfs_set_extent_generation(leaf, item, 0);
        if (owner < BTRFS_FIRST_FREE_OBJECTID) {
                btrfs_set_extent_flags(leaf, item,
                                       BTRFS_EXTENT_FLAG_TREE_BLOCK |
                                       BTRFS_BLOCK_FLAG_FULL_BACKREF);
                bi = (struct btrfs_tree_block_info *)(item + 1);
                /* FIXME: get first key of the block */
                memset_extent_buffer(leaf, 0, (unsigned long)bi, sizeof(*bi));
                btrfs_set_tree_block_level(leaf, bi, (int)owner);
        } else {
                btrfs_set_extent_flags(leaf, item, BTRFS_EXTENT_FLAG_DATA);
        }
        btrfs_mark_buffer_dirty(leaf);
        return 0;
}
#endif

static u64 hash_extent_data_ref(u64 root_objectid, u64 owner, u64 offset)
{
        u32 high_crc = ~(u32)0;
        u32 low_crc = ~(u32)0;
        __le64 lenum;

        lenum = cpu_to_le64(root_objectid);
        high_crc = crc32c(high_crc, &lenum, sizeof(lenum));
        lenum = cpu_to_le64(owner);
        low_crc = crc32c(low_crc, &lenum, sizeof(lenum));
        lenum = cpu_to_le64(offset);
        low_crc = crc32c(low_crc, &lenum, sizeof(lenum));

        return ((u64)high_crc << 31) ^ (u64)low_crc;
}

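/*
 * Editor's note: the three ref fields are folded into a single 64-bit
 * key offset, with the crc32c of the root objectid in the high half and
 * the crc32c of (owner, offset) in the low half; the high half is
 * shifted by 31, as written.  Collisions are therefore possible, and
 * callers such as insert_extent_data_ref() below probe forward with
 * key.offset++ when an insert hits -EEXIST.
 */
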
static u64 hash_extent_data_ref_item(struct extent_buffer *leaf,
                                     struct btrfs_extent_data_ref *ref)
{
        return hash_extent_data_ref(btrfs_extent_data_ref_root(leaf, ref),
                                    btrfs_extent_data_ref_objectid(leaf, ref),
                                    btrfs_extent_data_ref_offset(leaf, ref));
}

static int match_extent_data_ref(struct extent_buffer *leaf,
                                 struct btrfs_extent_data_ref *ref,
                                 u64 root_objectid, u64 owner, u64 offset)
{
        if (btrfs_extent_data_ref_root(leaf, ref) != root_objectid ||
            btrfs_extent_data_ref_objectid(leaf, ref) != owner ||
            btrfs_extent_data_ref_offset(leaf, ref) != offset)
                return 0;
        return 1;
}

static noinline int lookup_extent_data_ref(struct btrfs_trans_handle *trans,
                                           struct btrfs_root *root,
                                           struct btrfs_path *path,
                                           u64 bytenr, u64 parent,
                                           u64 root_objectid,
                                           u64 owner, u64 offset)
{
        struct btrfs_key key;
        struct btrfs_extent_data_ref *ref;
        struct extent_buffer *leaf;
        u32 nritems;
        int ret;
        int recow;
        int err = -ENOENT;

        key.objectid = bytenr;
        if (parent) {
                key.type = BTRFS_SHARED_DATA_REF_KEY;
                key.offset = parent;
        } else {
                key.type = BTRFS_EXTENT_DATA_REF_KEY;
                key.offset = hash_extent_data_ref(root_objectid,
                                                  owner, offset);
        }
again:
        recow = 0;
        ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
        if (ret < 0) {
                err = ret;
                goto fail;
        }

        if (parent) {
                if (!ret)
                        return 0;
#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
                key.type = BTRFS_EXTENT_REF_V0_KEY;
                btrfs_release_path(root, path);
                ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
                if (ret < 0) {
                        err = ret;
                        goto fail;
                }
                if (!ret)
                        return 0;
#endif
                goto fail;
        }

        leaf = path->nodes[0];
        nritems = btrfs_header_nritems(leaf);
        while (1) {
                if (path->slots[0] >= nritems) {
                        ret = btrfs_next_leaf(root, path);
                        if (ret < 0)
                                err = ret;
                        if (ret)
                                goto fail;

                        leaf = path->nodes[0];
                        nritems = btrfs_header_nritems(leaf);
                        recow = 1;
                }

                btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
                if (key.objectid != bytenr ||
                    key.type != BTRFS_EXTENT_DATA_REF_KEY)
                        goto fail;

                ref = btrfs_item_ptr(leaf, path->slots[0],
                                     struct btrfs_extent_data_ref);

                if (match_extent_data_ref(leaf, ref, root_objectid,
                                          owner, offset)) {
                        if (recow) {
                                btrfs_release_path(root, path);
                                goto again;
                        }
                        err = 0;
                        break;
                }
                path->slots[0]++;
        }
fail:
        return err;
}

static noinline int insert_extent_data_ref(struct btrfs_trans_handle *trans,
                                           struct btrfs_root *root,
                                           struct btrfs_path *path,
                                           u64 bytenr, u64 parent,
                                           u64 root_objectid, u64 owner,
                                           u64 offset, int refs_to_add)
{
        struct btrfs_key key;
        struct extent_buffer *leaf;
        u32 size;
        u32 num_refs;
        int ret;

        key.objectid = bytenr;
        if (parent) {
                key.type = BTRFS_SHARED_DATA_REF_KEY;
                key.offset = parent;
                size = sizeof(struct btrfs_shared_data_ref);
        } else {
                key.type = BTRFS_EXTENT_DATA_REF_KEY;
                key.offset = hash_extent_data_ref(root_objectid,
                                                  owner, offset);
                size = sizeof(struct btrfs_extent_data_ref);
        }

        ret = btrfs_insert_empty_item(trans, root, path, &key, size);
        if (ret && ret != -EEXIST)
                goto fail;

        leaf = path->nodes[0];
        if (parent) {
                struct btrfs_shared_data_ref *ref;
                ref = btrfs_item_ptr(leaf, path->slots[0],
                                     struct btrfs_shared_data_ref);
                if (ret == 0) {
                        btrfs_set_shared_data_ref_count(leaf, ref, refs_to_add);
                } else {
                        num_refs = btrfs_shared_data_ref_count(leaf, ref);
                        num_refs += refs_to_add;
                        btrfs_set_shared_data_ref_count(leaf, ref, num_refs);
                }
        } else {
                struct btrfs_extent_data_ref *ref;
                while (ret == -EEXIST) {
                        ref = btrfs_item_ptr(leaf, path->slots[0],
                                             struct btrfs_extent_data_ref);
                        if (match_extent_data_ref(leaf, ref, root_objectid,
                                                  owner, offset))
                                break;
                        btrfs_release_path(root, path);
                        key.offset++;
                        ret = btrfs_insert_empty_item(trans, root, path, &key,
                                                      size);
                        if (ret && ret != -EEXIST)
                                goto fail;

                        leaf = path->nodes[0];
                }
                ref = btrfs_item_ptr(leaf, path->slots[0],
                                     struct btrfs_extent_data_ref);
                if (ret == 0) {
                        btrfs_set_extent_data_ref_root(leaf, ref,
                                                       root_objectid);
                        btrfs_set_extent_data_ref_objectid(leaf, ref, owner);
                        btrfs_set_extent_data_ref_offset(leaf, ref, offset);
                        btrfs_set_extent_data_ref_count(leaf, ref, refs_to_add);
                } else {
                        num_refs = btrfs_extent_data_ref_count(leaf, ref);
                        num_refs += refs_to_add;
                        btrfs_set_extent_data_ref_count(leaf, ref, num_refs);
                }
        }
        btrfs_mark_buffer_dirty(leaf);
        ret = 0;
fail:
        btrfs_release_path(root, path);
        return ret;
}

static noinline int remove_extent_data_ref(struct btrfs_trans_handle *trans,
                                           struct btrfs_root *root,
                                           struct btrfs_path *path,
                                           int refs_to_drop)
{
        struct btrfs_key key;
        struct btrfs_extent_data_ref *ref1 = NULL;
        struct btrfs_shared_data_ref *ref2 = NULL;
        struct extent_buffer *leaf;
        u32 num_refs = 0;
        int ret = 0;

        leaf = path->nodes[0];
        btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);

        if (key.type == BTRFS_EXTENT_DATA_REF_KEY) {
                ref1 = btrfs_item_ptr(leaf, path->slots[0],
                                      struct btrfs_extent_data_ref);
                num_refs = btrfs_extent_data_ref_count(leaf, ref1);
        } else if (key.type == BTRFS_SHARED_DATA_REF_KEY) {
                ref2 = btrfs_item_ptr(leaf, path->slots[0],
                                      struct btrfs_shared_data_ref);
                num_refs = btrfs_shared_data_ref_count(leaf, ref2);
#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
        } else if (key.type == BTRFS_EXTENT_REF_V0_KEY) {
                struct btrfs_extent_ref_v0 *ref0;
                ref0 = btrfs_item_ptr(leaf, path->slots[0],
                                      struct btrfs_extent_ref_v0);
                num_refs = btrfs_ref_count_v0(leaf, ref0);
#endif
        } else {
                BUG();
        }

        BUG_ON(num_refs < refs_to_drop);
        num_refs -= refs_to_drop;

        if (num_refs == 0) {
                ret = btrfs_del_item(trans, root, path);
        } else {
                if (key.type == BTRFS_EXTENT_DATA_REF_KEY)
                        btrfs_set_extent_data_ref_count(leaf, ref1, num_refs);
                else if (key.type == BTRFS_SHARED_DATA_REF_KEY)
                        btrfs_set_shared_data_ref_count(leaf, ref2, num_refs);
#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
                else {
                        struct btrfs_extent_ref_v0 *ref0;
                        ref0 = btrfs_item_ptr(leaf, path->slots[0],
                                        struct btrfs_extent_ref_v0);
                        btrfs_set_ref_count_v0(leaf, ref0, num_refs);
                }
#endif
                btrfs_mark_buffer_dirty(leaf);
        }
        return ret;
}

static noinline u32 extent_data_ref_count(struct btrfs_root *root,
                                          struct btrfs_path *path,
                                          struct btrfs_extent_inline_ref *iref)
{
        struct btrfs_key key;
        struct extent_buffer *leaf;
        struct btrfs_extent_data_ref *ref1;
        struct btrfs_shared_data_ref *ref2;
        u32 num_refs = 0;

        leaf = path->nodes[0];
        btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
        if (iref) {
                if (btrfs_extent_inline_ref_type(leaf, iref) ==
                    BTRFS_EXTENT_DATA_REF_KEY) {
                        ref1 = (struct btrfs_extent_data_ref *)(&iref->offset);
                        num_refs = btrfs_extent_data_ref_count(leaf, ref1);
                } else {
                        ref2 = (struct btrfs_shared_data_ref *)(iref + 1);
                        num_refs = btrfs_shared_data_ref_count(leaf, ref2);
                }
        } else if (key.type == BTRFS_EXTENT_DATA_REF_KEY) {
                ref1 = btrfs_item_ptr(leaf, path->slots[0],
                                      struct btrfs_extent_data_ref);
                num_refs = btrfs_extent_data_ref_count(leaf, ref1);
        } else if (key.type == BTRFS_SHARED_DATA_REF_KEY) {
                ref2 = btrfs_item_ptr(leaf, path->slots[0],
                                      struct btrfs_shared_data_ref);
                num_refs = btrfs_shared_data_ref_count(leaf, ref2);
#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
        } else if (key.type == BTRFS_EXTENT_REF_V0_KEY) {
                struct btrfs_extent_ref_v0 *ref0;
                ref0 = btrfs_item_ptr(leaf, path->slots[0],
                                      struct btrfs_extent_ref_v0);
                num_refs = btrfs_ref_count_v0(leaf, ref0);
#endif
        } else {
                WARN_ON(1);
        }
        return num_refs;
}

static noinline int lookup_tree_block_ref(struct btrfs_trans_handle *trans,
                                          struct btrfs_root *root,
                                          struct btrfs_path *path,
                                          u64 bytenr, u64 parent,
                                          u64 root_objectid)
{
        struct btrfs_key key;
        int ret;

        key.objectid = bytenr;
        if (parent) {
                key.type = BTRFS_SHARED_BLOCK_REF_KEY;
                key.offset = parent;
        } else {
                key.type = BTRFS_TREE_BLOCK_REF_KEY;
                key.offset = root_objectid;
        }

        ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
        if (ret > 0)
                ret = -ENOENT;
#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
        if (ret == -ENOENT && parent) {
                btrfs_release_path(root, path);
                key.type = BTRFS_EXTENT_REF_V0_KEY;
                ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
                if (ret > 0)
                        ret = -ENOENT;
        }
#endif
        return ret;
}

static noinline int insert_tree_block_ref(struct btrfs_trans_handle *trans,
                                          struct btrfs_root *root,
                                          struct btrfs_path *path,
                                          u64 bytenr, u64 parent,
                                          u64 root_objectid)
{
        struct btrfs_key key;
        int ret;

        key.objectid = bytenr;
        if (parent) {
                key.type = BTRFS_SHARED_BLOCK_REF_KEY;
                key.offset = parent;
        } else {
                key.type = BTRFS_TREE_BLOCK_REF_KEY;
                key.offset = root_objectid;
        }

        ret = btrfs_insert_empty_item(trans, root, path, &key, 0);
        btrfs_release_path(root, path);
        return ret;
}

static inline int extent_ref_type(u64 parent, u64 owner)
{
        int type;
        if (owner < BTRFS_FIRST_FREE_OBJECTID) {
                if (parent > 0)
                        type = BTRFS_SHARED_BLOCK_REF_KEY;
                else
                        type = BTRFS_TREE_BLOCK_REF_KEY;
        } else {
                if (parent > 0)
                        type = BTRFS_SHARED_DATA_REF_KEY;
                else
                        type = BTRFS_EXTENT_DATA_REF_KEY;
        }
        return type;
}

static int find_next_key(struct btrfs_path *path, int level,
                         struct btrfs_key *key)
{
        for (; level < BTRFS_MAX_LEVEL; level++) {
                if (!path->nodes[level])
                        break;
                if (path->slots[level] + 1 >=
                    btrfs_header_nritems(path->nodes[level]))
                        continue;
                if (level == 0)
                        btrfs_item_key_to_cpu(path->nodes[level], key,
                                              path->slots[level] + 1);
                else
                        btrfs_node_key_to_cpu(path->nodes[level], key,
                                              path->slots[level] + 1);
                return 0;
        }
        return 1;
}

/*
 * look for inline back ref. if back ref is found, *ref_ret is set
 * to the address of inline back ref, and 0 is returned.
 *
 * if back ref isn't found, *ref_ret is set to the address where it
 * should be inserted, and -ENOENT is returned.
 *
 * if insert is true and there are too many inline back refs, the path
 * points to the extent item, and -EAGAIN is returned.
 *
 * NOTE: inline back refs are ordered in the same way that back ref
 *       items in the tree are ordered.
 */
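/*
 * Return handling sketch (editor's note; the real callers sit outside
 * this excerpt, so the shape below is illustrative):
 *
 *     ret = lookup_inline_extent_backref(trans, root, path, &iref,
 *                                        bytenr, num_bytes, parent,
 *                                        root_objectid, owner, offset, 1);
 *     if (ret == 0)
 *             update the existing inline ref through iref;
 *     else if (ret == -ENOENT)
 *             insert a new inline ref at the returned position;
 *     else if (ret == -EAGAIN)
 *             fall back to a separate back ref item;
 */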
static noinline_for_stack
int lookup_inline_extent_backref(struct btrfs_trans_handle *trans,
                                 struct btrfs_root *root,
                                 struct btrfs_path *path,
                                 struct btrfs_extent_inline_ref **ref_ret,
                                 u64 bytenr, u64 num_bytes,
                                 u64 parent, u64 root_objectid,
                                 u64 owner, u64 offset, int insert)
{
        struct btrfs_key key;
        struct extent_buffer *leaf;
        struct btrfs_extent_item *ei;
        struct btrfs_extent_inline_ref *iref;
        u64 flags;
        u64 item_size;
        unsigned long ptr;
        unsigned long end;
        int extra_size;
        int type;
        int want;
        int ret;
        int err = 0;

        key.objectid = bytenr;
        key.type = BTRFS_EXTENT_ITEM_KEY;
        key.offset = num_bytes;

        want = extent_ref_type(parent, owner);
        if (insert) {
                extra_size = btrfs_extent_inline_ref_size(want);
                path->keep_locks = 1;
        } else
                extra_size = -1;
        ret = btrfs_search_slot(trans, root, &key, path, extra_size, 1);
        if (ret < 0) {
                err = ret;
                goto out;
        }
        BUG_ON(ret);

        leaf = path->nodes[0];
        item_size = btrfs_item_size_nr(leaf, path->slots[0]);
#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
        if (item_size < sizeof(*ei)) {
                if (!insert) {
                        err = -ENOENT;
                        goto out;
                }
                ret = convert_extent_item_v0(trans, root, path, owner,
                                             extra_size);
                if (ret < 0) {
                        err = ret;
                        goto out;
                }
                leaf = path->nodes[0];
                item_size = btrfs_item_size_nr(leaf, path->slots[0]);
        }
#endif
        BUG_ON(item_size < sizeof(*ei));

        ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
        flags = btrfs_extent_flags(leaf, ei);

        ptr = (unsigned long)(ei + 1);
        end = (unsigned long)ei + item_size;

        if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
                ptr += sizeof(struct btrfs_tree_block_info);
                BUG_ON(ptr > end);
        } else {
                BUG_ON(!(flags & BTRFS_EXTENT_FLAG_DATA));
        }

        err = -ENOENT;
        while (1) {
                if (ptr >= end) {
                        WARN_ON(ptr > end);
                        break;
                }
                iref = (struct btrfs_extent_inline_ref *)ptr;
                type = btrfs_extent_inline_ref_type(leaf, iref);
                if (want < type)
                        break;
                if (want > type) {
                        ptr += btrfs_extent_inline_ref_size(type);
                        continue;
                }

                if (type == BTRFS_EXTENT_DATA_REF_KEY) {
                        struct btrfs_extent_data_ref *dref;
                        dref = (struct btrfs_extent_data_ref *)(&iref->offset);
                        if (match_extent_data_ref(leaf, dref, root_objectid,
                                                  owner, offset)) {
                                err = 0;
                                break;
                        }
                        if (hash_extent_data_ref_item(leaf, dref) <
                            hash_extent_data_ref(root_objectid, owner, offset))
                                break;
                } else {
                        u64 ref_offset;
                        ref_offset = btrfs_extent_inline_ref_offset(leaf, iref);
                        if (parent > 0) {
                                if (parent == ref_offset) {
                                        err = 0;
                                        break;
                                }
                                if (ref_offset < parent)
                                        break;
                        } else {
                                if (root_objectid == ref_offset) {
                                        err = 0;
                                        break;
                                }
                                if (ref_offset < root_objectid)
                                        break;
                        }
                }
                ptr += btrfs_extent_inline_ref_size(type);
        }
        if (err == -ENOENT && insert) {
                if (item_size + extra_size >=
                    BTRFS_MAX_EXTENT_ITEM_SIZE(root)) {
                        err = -EAGAIN;
                        goto out;
                }
1447                 /*
1448                  * To add a new inline back ref, we have to make sure
1449                  * there is no corresponding back ref item.
1450                  * For simplicity, we just do not add a new inline back
1451                  * ref if there is any kind of item for this block.
1452                  */
1453                 if (find_next_key(path, 0, &key) == 0 &&
1454                     key.objectid == bytenr &&
1455                     key.type < BTRFS_BLOCK_GROUP_ITEM_KEY) {
1456                         err = -EAGAIN;
1457                         goto out;
1458                 }
1459         }
1460         *ref_ret = (struct btrfs_extent_inline_ref *)ptr;
1461 out:
1462         if (insert) {
1463                 path->keep_locks = 0;
1464                 btrfs_unlock_up_safe(path, 1);
1465         }
1466         return err;
1467 }
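
/*
 * For readers, an illustrative sketch (not authoritative) of the extent
 * item layout the walk above assumes.  A tree block extent carries a
 * btrfs_tree_block_info between the extent item and its inline refs; a
 * data extent does not:
 *
 *   [ btrfs_extent_item | (btrfs_tree_block_info) | inline ref | ... ]
 *
 * inline refs are sorted by ref type and, within one type, in the same
 * order the comparisons in the loop above expect (hash order for data
 * refs, offset order for shared/tree refs).
 */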
1468
1469 /*
1470  * helper to add a new inline back ref
1471  */
1472 static noinline_for_stack
1473 int setup_inline_extent_backref(struct btrfs_trans_handle *trans,
1474                                 struct btrfs_root *root,
1475                                 struct btrfs_path *path,
1476                                 struct btrfs_extent_inline_ref *iref,
1477                                 u64 parent, u64 root_objectid,
1478                                 u64 owner, u64 offset, int refs_to_add,
1479                                 struct btrfs_delayed_extent_op *extent_op)
1480 {
1481         struct extent_buffer *leaf;
1482         struct btrfs_extent_item *ei;
1483         unsigned long ptr;
1484         unsigned long end;
1485         unsigned long item_offset;
1486         u64 refs;
1487         int size;
1488         int type;
1489         int ret;
1490
1491         leaf = path->nodes[0];
1492         ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
1493         item_offset = (unsigned long)iref - (unsigned long)ei;
1494
1495         type = extent_ref_type(parent, owner);
1496         size = btrfs_extent_inline_ref_size(type);
1497
1498         ret = btrfs_extend_item(trans, root, path, size);
1499         BUG_ON(ret);
1500
1501         ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
1502         refs = btrfs_extent_refs(leaf, ei);
1503         refs += refs_to_add;
1504         btrfs_set_extent_refs(leaf, ei, refs);
1505         if (extent_op)
1506                 __run_delayed_extent_op(extent_op, leaf, ei);
1507
1508         ptr = (unsigned long)ei + item_offset;
1509         end = (unsigned long)ei + btrfs_item_size_nr(leaf, path->slots[0]);
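        /* slide the refs after the insertion point up to open a size-byte gap */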
1510         if (ptr < end - size)
1511                 memmove_extent_buffer(leaf, ptr + size, ptr,
1512                                       end - size - ptr);
1513
1514         iref = (struct btrfs_extent_inline_ref *)ptr;
1515         btrfs_set_extent_inline_ref_type(leaf, iref, type);
1516         if (type == BTRFS_EXTENT_DATA_REF_KEY) {
1517                 struct btrfs_extent_data_ref *dref;
1518                 dref = (struct btrfs_extent_data_ref *)(&iref->offset);
1519                 btrfs_set_extent_data_ref_root(leaf, dref, root_objectid);
1520                 btrfs_set_extent_data_ref_objectid(leaf, dref, owner);
1521                 btrfs_set_extent_data_ref_offset(leaf, dref, offset);
1522                 btrfs_set_extent_data_ref_count(leaf, dref, refs_to_add);
1523         } else if (type == BTRFS_SHARED_DATA_REF_KEY) {
1524                 struct btrfs_shared_data_ref *sref;
1525                 sref = (struct btrfs_shared_data_ref *)(iref + 1);
1526                 btrfs_set_shared_data_ref_count(leaf, sref, refs_to_add);
1527                 btrfs_set_extent_inline_ref_offset(leaf, iref, parent);
1528         } else if (type == BTRFS_SHARED_BLOCK_REF_KEY) {
1529                 btrfs_set_extent_inline_ref_offset(leaf, iref, parent);
1530         } else {
1531                 btrfs_set_extent_inline_ref_offset(leaf, iref, root_objectid);
1532         }
1533         btrfs_mark_buffer_dirty(leaf);
1534         return 0;
1535 }
1536
1537 static int lookup_extent_backref(struct btrfs_trans_handle *trans,
1538                                  struct btrfs_root *root,
1539                                  struct btrfs_path *path,
1540                                  struct btrfs_extent_inline_ref **ref_ret,
1541                                  u64 bytenr, u64 num_bytes, u64 parent,
1542                                  u64 root_objectid, u64 owner, u64 offset)
1543 {
1544         int ret;
1545
1546         ret = lookup_inline_extent_backref(trans, root, path, ref_ret,
1547                                            bytenr, num_bytes, parent,
1548                                            root_objectid, owner, offset, 0);
1549         if (ret != -ENOENT)
1550                 return ret;
1551
1552         btrfs_release_path(root, path);
1553         *ref_ret = NULL;
1554
1555         if (owner < BTRFS_FIRST_FREE_OBJECTID) {
1556                 ret = lookup_tree_block_ref(trans, root, path, bytenr, parent,
1557                                             root_objectid);
1558         } else {
1559                 ret = lookup_extent_data_ref(trans, root, path, bytenr, parent,
1560                                              root_objectid, owner, offset);
1561         }
1562         return ret;
1563 }
1564
1565 /*
1566  * helper to update/remove inline back ref
1567  */
1568 static noinline_for_stack
1569 int update_inline_extent_backref(struct btrfs_trans_handle *trans,
1570                                  struct btrfs_root *root,
1571                                  struct btrfs_path *path,
1572                                  struct btrfs_extent_inline_ref *iref,
1573                                  int refs_to_mod,
1574                                  struct btrfs_delayed_extent_op *extent_op)
1575 {
1576         struct extent_buffer *leaf;
1577         struct btrfs_extent_item *ei;
1578         struct btrfs_extent_data_ref *dref = NULL;
1579         struct btrfs_shared_data_ref *sref = NULL;
1580         unsigned long ptr;
1581         unsigned long end;
1582         u32 item_size;
1583         int size;
1584         int type;
1585         int ret;
1586         u64 refs;
1587
1588         leaf = path->nodes[0];
1589         ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
1590         refs = btrfs_extent_refs(leaf, ei);
1591         WARN_ON(refs_to_mod < 0 && refs + refs_to_mod <= 0);
1592         refs += refs_to_mod;
1593         btrfs_set_extent_refs(leaf, ei, refs);
1594         if (extent_op)
1595                 __run_delayed_extent_op(extent_op, leaf, ei);
1596
1597         type = btrfs_extent_inline_ref_type(leaf, iref);
1598
1599         if (type == BTRFS_EXTENT_DATA_REF_KEY) {
1600                 dref = (struct btrfs_extent_data_ref *)(&iref->offset);
1601                 refs = btrfs_extent_data_ref_count(leaf, dref);
1602         } else if (type == BTRFS_SHARED_DATA_REF_KEY) {
1603                 sref = (struct btrfs_shared_data_ref *)(iref + 1);
1604                 refs = btrfs_shared_data_ref_count(leaf, sref);
1605         } else {
1606                 refs = 1;
1607                 BUG_ON(refs_to_mod != -1);
1608         }
1609
1610         BUG_ON(refs_to_mod < 0 && refs < -refs_to_mod);
1611         refs += refs_to_mod;
1612
1613         if (refs > 0) {
1614                 if (type == BTRFS_EXTENT_DATA_REF_KEY)
1615                         btrfs_set_extent_data_ref_count(leaf, dref, refs);
1616                 else
1617                         btrfs_set_shared_data_ref_count(leaf, sref, refs);
1618         } else {
1619                 size =  btrfs_extent_inline_ref_size(type);
1620                 item_size = btrfs_item_size_nr(leaf, path->slots[0]);
1621                 ptr = (unsigned long)iref;
1622                 end = (unsigned long)ei + item_size;
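                /* slide later refs down over the one being removed, then shrink the item */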
1623                 if (ptr + size < end)
1624                         memmove_extent_buffer(leaf, ptr, ptr + size,
1625                                               end - ptr - size);
1626                 item_size -= size;
1627                 ret = btrfs_truncate_item(trans, root, path, item_size, 1);
1628                 BUG_ON(ret);
1629         }
1630         btrfs_mark_buffer_dirty(leaf);
1631         return 0;
1632 }
1633
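/*
 * try to record the back ref inline in the extent item.  0 means the
 * inline ref was created or updated; -EAGAIN from the lookup propagates
 * out and tells the caller to fall back to a separate keyed back ref
 * item instead (see __btrfs_inc_extent_ref() below).
 */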
1634 static noinline_for_stack
1635 int insert_inline_extent_backref(struct btrfs_trans_handle *trans,
1636                                  struct btrfs_root *root,
1637                                  struct btrfs_path *path,
1638                                  u64 bytenr, u64 num_bytes, u64 parent,
1639                                  u64 root_objectid, u64 owner,
1640                                  u64 offset, int refs_to_add,
1641                                  struct btrfs_delayed_extent_op *extent_op)
1642 {
1643         struct btrfs_extent_inline_ref *iref;
1644         int ret;
1645
1646         ret = lookup_inline_extent_backref(trans, root, path, &iref,
1647                                            bytenr, num_bytes, parent,
1648                                            root_objectid, owner, offset, 1);
1649         if (ret == 0) {
1650                 BUG_ON(owner < BTRFS_FIRST_FREE_OBJECTID);
1651                 ret = update_inline_extent_backref(trans, root, path, iref,
1652                                                    refs_to_add, extent_op);
1653         } else if (ret == -ENOENT) {
1654                 ret = setup_inline_extent_backref(trans, root, path, iref,
1655                                                   parent, root_objectid,
1656                                                   owner, offset, refs_to_add,
1657                                                   extent_op);
1658         }
1659         return ret;
1660 }
1661
1662 static int insert_extent_backref(struct btrfs_trans_handle *trans,
1663                                  struct btrfs_root *root,
1664                                  struct btrfs_path *path,
1665                                  u64 bytenr, u64 parent, u64 root_objectid,
1666                                  u64 owner, u64 offset, int refs_to_add)
1667 {
1668         int ret;
1669         if (owner < BTRFS_FIRST_FREE_OBJECTID) {
1670                 BUG_ON(refs_to_add != 1);
1671                 ret = insert_tree_block_ref(trans, root, path, bytenr,
1672                                             parent, root_objectid);
1673         } else {
1674                 ret = insert_extent_data_ref(trans, root, path, bytenr,
1675                                              parent, root_objectid,
1676                                              owner, offset, refs_to_add);
1677         }
1678         return ret;
1679 }
1680
1681 static int remove_extent_backref(struct btrfs_trans_handle *trans,
1682                                  struct btrfs_root *root,
1683                                  struct btrfs_path *path,
1684                                  struct btrfs_extent_inline_ref *iref,
1685                                  int refs_to_drop, int is_data)
1686 {
1687         int ret;
1688
1689         BUG_ON(!is_data && refs_to_drop != 1);
1690         if (iref) {
1691                 ret = update_inline_extent_backref(trans, root, path, iref,
1692                                                    -refs_to_drop, NULL);
1693         } else if (is_data) {
1694                 ret = remove_extent_data_ref(trans, root, path, refs_to_drop);
1695         } else {
1696                 ret = btrfs_del_item(trans, root, path);
1697         }
1698         return ret;
1699 }
1700
1701 static void btrfs_issue_discard(struct block_device *bdev,
1702                                 u64 start, u64 len)
1703 {
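        /* blkdev_issue_discard() works in 512-byte sectors, hence the ">> 9" */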
1704         blkdev_issue_discard(bdev, start >> 9, len >> 9, GFP_KERNEL,
1705                              DISCARD_FL_BARRIER);
1706 }
1707
1708 static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr,
1709                                 u64 num_bytes)
1710 {
1711         int ret;
1712         u64 map_length = num_bytes;
1713         struct btrfs_multi_bio *multi = NULL;
1714
1715         if (!btrfs_test_opt(root, DISCARD))
1716                 return 0;
1717
1718         /* Tell the block device(s) that the sectors can be discarded */
1719         ret = btrfs_map_block(&root->fs_info->mapping_tree, READ,
1720                               bytenr, &map_length, &multi, 0);
1721         if (!ret) {
1722                 struct btrfs_bio_stripe *stripe = multi->stripes;
1723                 int i;
1724
1725                 if (map_length > num_bytes)
1726                         map_length = num_bytes;
1727
1728                 for (i = 0; i < multi->num_stripes; i++, stripe++) {
1729                         btrfs_issue_discard(stripe->dev->bdev,
1730                                             stripe->physical,
1731                                             map_length);
1732                 }
1733                 kfree(multi);
1734         }
1735
1736         return ret;
1737 }
1738
1739 int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
1740                          struct btrfs_root *root,
1741                          u64 bytenr, u64 num_bytes, u64 parent,
1742                          u64 root_objectid, u64 owner, u64 offset)
1743 {
1744         int ret;
1745         BUG_ON(owner < BTRFS_FIRST_FREE_OBJECTID &&
1746                root_objectid == BTRFS_TREE_LOG_OBJECTID);
1747
1748         if (owner < BTRFS_FIRST_FREE_OBJECTID) {
1749                 ret = btrfs_add_delayed_tree_ref(trans, bytenr, num_bytes,
1750                                         parent, root_objectid, (int)owner,
1751                                         BTRFS_ADD_DELAYED_REF, NULL);
1752         } else {
1753                 ret = btrfs_add_delayed_data_ref(trans, bytenr, num_bytes,
1754                                         parent, root_objectid, owner, offset,
1755                                         BTRFS_ADD_DELAYED_REF, NULL);
1756         }
1757         return ret;
1758 }
1759
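/*
 * add @refs_to_add back refs for an extent.  The fast path stores the ref
 * inline in the extent item; when there is no room left (-EAGAIN), the
 * ref count on the extent item is bumped here and the back ref goes into
 * a separate keyed item instead.
 */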
1760 static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
1761                                   struct btrfs_root *root,
1762                                   u64 bytenr, u64 num_bytes,
1763                                   u64 parent, u64 root_objectid,
1764                                   u64 owner, u64 offset, int refs_to_add,
1765                                   struct btrfs_delayed_extent_op *extent_op)
1766 {
1767         struct btrfs_path *path;
1768         struct extent_buffer *leaf;
1769         struct btrfs_extent_item *item;
1770         u64 refs;
1771         int ret;
1772         int err = 0;
1773
1774         path = btrfs_alloc_path();
1775         if (!path)
1776                 return -ENOMEM;
1777
1778         path->reada = 1;
1779         path->leave_spinning = 1;
1780         /* this will set up the path even if it fails to insert the back ref */
1781         ret = insert_inline_extent_backref(trans, root->fs_info->extent_root,
1782                                            path, bytenr, num_bytes, parent,
1783                                            root_objectid, owner, offset,
1784                                            refs_to_add, extent_op);
1785         if (ret == 0)
1786                 goto out;
1787
1788         if (ret != -EAGAIN) {
1789                 err = ret;
1790                 goto out;
1791         }
1792
1793         leaf = path->nodes[0];
1794         item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
1795         refs = btrfs_extent_refs(leaf, item);
1796         btrfs_set_extent_refs(leaf, item, refs + refs_to_add);
1797         if (extent_op)
1798                 __run_delayed_extent_op(extent_op, leaf, item);
1799
1800         btrfs_mark_buffer_dirty(leaf);
1801         btrfs_release_path(root->fs_info->extent_root, path);
1802
1803         path->reada = 1;
1804         path->leave_spinning = 1;
1805
1806         /* now insert the actual backref */
1807         ret = insert_extent_backref(trans, root->fs_info->extent_root,
1808                                     path, bytenr, parent, root_objectid,
1809                                     owner, offset, refs_to_add);
1810         BUG_ON(ret);
1811 out:
1812         btrfs_free_path(path);
1813         return err;
1814 }
1815
1816 static int run_delayed_data_ref(struct btrfs_trans_handle *trans,
1817                                 struct btrfs_root *root,
1818                                 struct btrfs_delayed_ref_node *node,
1819                                 struct btrfs_delayed_extent_op *extent_op,
1820                                 int insert_reserved)
1821 {
1822         int ret = 0;
1823         struct btrfs_delayed_data_ref *ref;
1824         struct btrfs_key ins;
1825         u64 parent = 0;
1826         u64 ref_root = 0;
1827         u64 flags = 0;
1828
1829         ins.objectid = node->bytenr;
1830         ins.offset = node->num_bytes;
1831         ins.type = BTRFS_EXTENT_ITEM_KEY;
1832
1833         ref = btrfs_delayed_node_to_data_ref(node);
1834         if (node->type == BTRFS_SHARED_DATA_REF_KEY)
1835                 parent = ref->parent;
1836         else
1837                 ref_root = ref->root;
1838
1839         if (node->action == BTRFS_ADD_DELAYED_REF && insert_reserved) {
1840                 if (extent_op) {
1841                         BUG_ON(extent_op->update_key);
1842                         flags |= extent_op->flags_to_set;
1843                 }
1844                 ret = alloc_reserved_file_extent(trans, root,
1845                                                  parent, ref_root, flags,
1846                                                  ref->objectid, ref->offset,
1847                                                  &ins, node->ref_mod);
1848         } else if (node->action == BTRFS_ADD_DELAYED_REF) {
1849                 ret = __btrfs_inc_extent_ref(trans, root, node->bytenr,
1850                                              node->num_bytes, parent,
1851                                              ref_root, ref->objectid,
1852                                              ref->offset, node->ref_mod,
1853                                              extent_op);
1854         } else if (node->action == BTRFS_DROP_DELAYED_REF) {
1855                 ret = __btrfs_free_extent(trans, root, node->bytenr,
1856                                           node->num_bytes, parent,
1857                                           ref_root, ref->objectid,
1858                                           ref->offset, node->ref_mod,
1859                                           extent_op);
1860         } else {
1861                 BUG();
1862         }
1863         return ret;
1864 }
1865
1866 static void __run_delayed_extent_op(struct btrfs_delayed_extent_op *extent_op,
1867                                     struct extent_buffer *leaf,
1868                                     struct btrfs_extent_item *ei)
1869 {
1870         u64 flags = btrfs_extent_flags(leaf, ei);
1871         if (extent_op->update_flags) {
1872                 flags |= extent_op->flags_to_set;
1873                 btrfs_set_extent_flags(leaf, ei, flags);
1874         }
1875
1876         if (extent_op->update_key) {
1877                 struct btrfs_tree_block_info *bi;
1878                 BUG_ON(!(flags & BTRFS_EXTENT_FLAG_TREE_BLOCK));
1879                 bi = (struct btrfs_tree_block_info *)(ei + 1);
1880                 btrfs_set_tree_block_key(leaf, bi, &extent_op->key);
1881         }
1882 }
1883
1884 static int run_delayed_extent_op(struct btrfs_trans_handle *trans,
1885                                  struct btrfs_root *root,
1886                                  struct btrfs_delayed_ref_node *node,
1887                                  struct btrfs_delayed_extent_op *extent_op)
1888 {
1889         struct btrfs_key key;
1890         struct btrfs_path *path;
1891         struct btrfs_extent_item *ei;
1892         struct extent_buffer *leaf;
1893         u32 item_size;
1894         int ret;
1895         int err = 0;
1896
1897         path = btrfs_alloc_path();
1898         if (!path)
1899                 return -ENOMEM;
1900
1901         key.objectid = node->bytenr;
1902         key.type = BTRFS_EXTENT_ITEM_KEY;
1903         key.offset = node->num_bytes;
1904
1905         path->reada = 1;
1906         path->leave_spinning = 1;
1907         ret = btrfs_search_slot(trans, root->fs_info->extent_root, &key,
1908                                 path, 0, 1);
1909         if (ret < 0) {
1910                 err = ret;
1911                 goto out;
1912         }
1913         if (ret > 0) {
1914                 err = -EIO;
1915                 goto out;
1916         }
1917
1918         leaf = path->nodes[0];
1919         item_size = btrfs_item_size_nr(leaf, path->slots[0]);
1920 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
1921         if (item_size < sizeof(*ei)) {
1922                 ret = convert_extent_item_v0(trans, root->fs_info->extent_root,
1923                                              path, (u64)-1, 0);
1924                 if (ret < 0) {
1925                         err = ret;
1926                         goto out;
1927                 }
1928                 leaf = path->nodes[0];
1929                 item_size = btrfs_item_size_nr(leaf, path->slots[0]);
1930         }
1931 #endif
1932         BUG_ON(item_size < sizeof(*ei));
1933         ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
1934         __run_delayed_extent_op(extent_op, leaf, ei);
1935
1936         btrfs_mark_buffer_dirty(leaf);
1937 out:
1938         btrfs_free_path(path);
1939         return err;
1940 }
1941
1942 static int run_delayed_tree_ref(struct btrfs_trans_handle *trans,
1943                                 struct btrfs_root *root,
1944                                 struct btrfs_delayed_ref_node *node,
1945                                 struct btrfs_delayed_extent_op *extent_op,
1946                                 int insert_reserved)
1947 {
1948         int ret = 0;
1949         struct btrfs_delayed_tree_ref *ref;
1950         struct btrfs_key ins;
1951         u64 parent = 0;
1952         u64 ref_root = 0;
1953
1954         ins.objectid = node->bytenr;
1955         ins.offset = node->num_bytes;
1956         ins.type = BTRFS_EXTENT_ITEM_KEY;
1957
1958         ref = btrfs_delayed_node_to_tree_ref(node);
1959         if (node->type == BTRFS_SHARED_BLOCK_REF_KEY)
1960                 parent = ref->parent;
1961         else
1962                 ref_root = ref->root;
1963
1964         BUG_ON(node->ref_mod != 1);
1965         if (node->action == BTRFS_ADD_DELAYED_REF && insert_reserved) {
1966                 BUG_ON(!extent_op || !extent_op->update_flags ||
1967                        !extent_op->update_key);
1968                 ret = alloc_reserved_tree_block(trans, root,
1969                                                 parent, ref_root,
1970                                                 extent_op->flags_to_set,
1971                                                 &extent_op->key,
1972                                                 ref->level, &ins);
1973         } else if (node->action == BTRFS_ADD_DELAYED_REF) {
1974                 ret = __btrfs_inc_extent_ref(trans, root, node->bytenr,
1975                                              node->num_bytes, parent, ref_root,
1976                                              ref->level, 0, 1, extent_op);
1977         } else if (node->action == BTRFS_DROP_DELAYED_REF) {
1978                 ret = __btrfs_free_extent(trans, root, node->bytenr,
1979                                           node->num_bytes, parent, ref_root,
1980                                           ref->level, 0, 1, extent_op);
1981         } else {
1982                 BUG();
1983         }
1984         return ret;
1985 }
1986
1987 /* helper function to actually process a single delayed ref entry */
1988 static int run_one_delayed_ref(struct btrfs_trans_handle *trans,
1989                                struct btrfs_root *root,
1990                                struct btrfs_delayed_ref_node *node,
1991                                struct btrfs_delayed_extent_op *extent_op,
1992                                int insert_reserved)
1993 {
1994         int ret;
1995         if (btrfs_delayed_ref_is_head(node)) {
1996                 struct btrfs_delayed_ref_head *head;
1997                 /*
1998                  * we've hit the end of the chain and we were supposed
1999                  * to insert this extent into the tree.  But it got
2000                  * deleted before we ever needed to insert it, so all
2001                  * we have to do is clean up the accounting.
2002                  */
2003                 BUG_ON(extent_op);
2004                 head = btrfs_delayed_node_to_head(node);
2005                 if (insert_reserved) {
2006                         btrfs_pin_extent(root, node->bytenr,
2007                                          node->num_bytes, 1);
2008                         if (head->is_data) {
2009                                 ret = btrfs_del_csums(trans, root,
2010                                                       node->bytenr,
2011                                                       node->num_bytes);
2012                                 BUG_ON(ret);
2013                         }
2014                 }
2015                 mutex_unlock(&head->mutex);
2016                 return 0;
2017         }
2018
2019         if (node->type == BTRFS_TREE_BLOCK_REF_KEY ||
2020             node->type == BTRFS_SHARED_BLOCK_REF_KEY)
2021                 ret = run_delayed_tree_ref(trans, root, node, extent_op,
2022                                            insert_reserved);
2023         else if (node->type == BTRFS_EXTENT_DATA_REF_KEY ||
2024                  node->type == BTRFS_SHARED_DATA_REF_KEY)
2025                 ret = run_delayed_data_ref(trans, root, node, extent_op,
2026                                            insert_reserved);
2027         else
2028                 BUG();
2029         return ret;
2030 }
2031
2032 static noinline struct btrfs_delayed_ref_node *
2033 select_delayed_ref(struct btrfs_delayed_ref_head *head)
2034 {
2035         struct rb_node *node;
2036         struct btrfs_delayed_ref_node *ref;
2037         int action = BTRFS_ADD_DELAYED_REF;
2038 again:
2039         /*
2040          * select a delayed ref of type BTRFS_ADD_DELAYED_REF first.
2041          * this prevents the ref count from going down to zero while
2042          * there are still pending delayed refs.
2043          */
2044         node = rb_prev(&head->node.rb_node);
2045         while (1) {
2046                 if (!node)
2047                         break;
2048                 ref = rb_entry(node, struct btrfs_delayed_ref_node,
2049                                 rb_node);
2050                 if (ref->bytenr != head->node.bytenr)
2051                         break;
2052                 if (ref->action == action)
2053                         return ref;
2054                 node = rb_prev(node);
2055         }
2056         if (action == BTRFS_ADD_DELAYED_REF) {
2057                 action = BTRFS_DROP_DELAYED_REF;
2058                 goto again;
2059         }
2060         return NULL;
2061 }
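
/*
 * A worked example of the ADD-before-DROP ordering above (illustrative
 * only): with a ref count of 1 and a pending (-1, +1) pair of delayed
 * refs, running the -1 first would drop the count to zero and free the
 * extent even though a reference is about to be re-added.  Returning the
 * +1 first keeps the count above zero throughout.
 */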
2062
2063 static noinline int run_clustered_refs(struct btrfs_trans_handle *trans,
2064                                        struct btrfs_root *root,
2065                                        struct list_head *cluster)
2066 {
2067         struct btrfs_delayed_ref_root *delayed_refs;
2068         struct btrfs_delayed_ref_node *ref;
2069         struct btrfs_delayed_ref_head *locked_ref = NULL;
2070         struct btrfs_delayed_extent_op *extent_op;
2071         int ret;
2072         int count = 0;
2073         int must_insert_reserved = 0;
2074
2075         delayed_refs = &trans->transaction->delayed_refs;
2076         while (1) {
2077                 if (!locked_ref) {
2078                         /* pick a new head ref from the cluster list */
2079                         if (list_empty(cluster))
2080                                 break;
2081
2082                         locked_ref = list_entry(cluster->next,
2083                                      struct btrfs_delayed_ref_head, cluster);
2084
2085                         /* grab the lock that says we are going to process
2086                          * all the refs for this head */
2087                         ret = btrfs_delayed_ref_lock(trans, locked_ref);
2088
2089                         /*
2090                          * we may have dropped the spin lock to get the head
2091                          * mutex lock, and that might have given someone else
2092                          * time to free the head.  If that's true, it has been
2093                          * removed from our list and we can move on.
2094                          */
2095                         if (ret == -EAGAIN) {
2096                                 locked_ref = NULL;
2097                                 count++;
2098                                 continue;
2099                         }
2100                 }
2101
2102                 /*
2103                  * record the must insert reserved flag before we
2104                  * drop the spin lock.
2105                  */
2106                 must_insert_reserved = locked_ref->must_insert_reserved;
2107                 locked_ref->must_insert_reserved = 0;
2108
2109                 extent_op = locked_ref->extent_op;
2110                 locked_ref->extent_op = NULL;
2111
2112                 /*
2113                  * locked_ref is the head node, so we have to go one
2114                  * node back for any delayed ref updates
2115                  */
2116                 ref = select_delayed_ref(locked_ref);
2117                 if (!ref) {
2118                         /* All delayed refs have been processed; go ahead
2119                          * and send the head node to run_one_delayed_ref,
2120                          * so that any accounting fixes can happen
2121                          */
2122                         ref = &locked_ref->node;
2123
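                        /*
                         * the extent was freed before it was ever
                         * inserted, so a pending extent_op has nothing
                         * left to apply to; drop it
                         */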
2124                         if (extent_op && must_insert_reserved) {
2125                                 kfree(extent_op);
2126                                 extent_op = NULL;
2127                         }
2128
2129                         if (extent_op) {
2130                                 spin_unlock(&delayed_refs->lock);
2131
2132                                 ret = run_delayed_extent_op(trans, root,
2133                                                             ref, extent_op);
2134                                 BUG_ON(ret);
2135                                 kfree(extent_op);
2136
2137                                 cond_resched();
2138                                 spin_lock(&delayed_refs->lock);
2139                                 continue;
2140                         }
2141
2142                         list_del_init(&locked_ref->cluster);
2143                         locked_ref = NULL;
2144                 }
2145
2146                 ref->in_tree = 0;
2147                 rb_erase(&ref->rb_node, &delayed_refs->root);
2148                 delayed_refs->num_entries--;
2149
2150                 spin_unlock(&delayed_refs->lock);
2151
2152                 ret = run_one_delayed_ref(trans, root, ref, extent_op,
2153                                           must_insert_reserved);
2154                 BUG_ON(ret);
2155
2156                 btrfs_put_delayed_ref(ref);
2157                 kfree(extent_op);
2158                 count++;
2159
2160                 cond_resched();
2161                 spin_lock(&delayed_refs->lock);
2162         }
2163         return count;
2164 }
2165
2166 /*
2167  * this starts processing the delayed reference count updates and
2168  * extent insertions we have queued up so far.  count can be
2169  * 0, which means to process everything in the tree at the start
2170  * of the run (but not newly added entries), or it can be some target
2171  * number you'd like to process.
2172  */
2173 int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
2174                            struct btrfs_root *root, unsigned long count)
2175 {
2176         struct rb_node *node;
2177         struct btrfs_delayed_ref_root *delayed_refs;
2178         struct btrfs_delayed_ref_node *ref;
2179         struct list_head cluster;
2180         int ret;
2181         int run_all = count == (unsigned long)-1;
2182         int run_most = 0;
2183
2184         if (root == root->fs_info->extent_root)
2185                 root = root->fs_info->tree_root;
2186
2187         delayed_refs = &trans->transaction->delayed_refs;
2188         INIT_LIST_HEAD(&cluster);
2189 again:
2190         spin_lock(&delayed_refs->lock);
2191         if (count == 0) {
2192                 count = delayed_refs->num_entries * 2;
2193                 run_most = 1;
2194         }
2195         while (1) {
2196                 if (!(run_all || run_most) &&
2197                     delayed_refs->num_heads_ready < 64)
2198                         break;
2199
2200                 /*
2201                  * go find something we can process in the rbtree.  We start at
2202                  * the beginning of the tree, and then build a cluster
2203                  * of refs to process starting at the first one we are able to
2204                  * lock
2205                  */
2206                 ret = btrfs_find_ref_cluster(trans, &cluster,
2207                                              delayed_refs->run_delayed_start);
2208                 if (ret)
2209                         break;
2210
2211                 ret = run_clustered_refs(trans, root, &cluster);
2212                 BUG_ON(ret < 0);
2213
2214                 count -= min_t(unsigned long, ret, count);
2215
2216                 if (count == 0)
2217                         break;
2218         }
2219
2220         if (run_all) {
2221                 node = rb_first(&delayed_refs->root);
2222                 if (!node)
2223                         goto out;
2224                 count = (unsigned long)-1;
2225
2226                 while (node) {
2227                         ref = rb_entry(node, struct btrfs_delayed_ref_node,
2228                                        rb_node);
2229                         if (btrfs_delayed_ref_is_head(ref)) {
2230                                 struct btrfs_delayed_ref_head *head;
2231
2232                                 head = btrfs_delayed_node_to_head(ref);
2233                                 atomic_inc(&ref->refs);
2234
2235                                 spin_unlock(&delayed_refs->lock);
2236                                 mutex_lock(&head->mutex);
2237                                 mutex_unlock(&head->mutex);
2238
2239                                 btrfs_put_delayed_ref(ref);
2240                                 cond_resched();
2241                                 goto again;
2242                         }
2243                         node = rb_next(node);
2244                 }
2245                 spin_unlock(&delayed_refs->lock);
2246                 schedule_timeout(1);
2247                 goto again;
2248         }
2249 out:
2250         spin_unlock(&delayed_refs->lock);
2251         return 0;
2252 }
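
/*
 * Illustrative only (dead code, placeholder function name): the three
 * count values callers typically pass to btrfs_run_delayed_refs(), per
 * the comment above it.
 */
#if 0
static void run_delayed_refs_examples(struct btrfs_trans_handle *trans,
                                      struct btrfs_root *root)
{
        /* process everything queued at the start of the run */
        btrfs_run_delayed_refs(trans, root, 0);

        /* process roughly 128 entries, then return */
        btrfs_run_delayed_refs(trans, root, 128);

        /* keep going until the delayed ref tree is fully drained */
        btrfs_run_delayed_refs(trans, root, (unsigned long)-1);
}
#endif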
2253
2254 int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans,
2255                                 struct btrfs_root *root,
2256                                 u64 bytenr, u64 num_bytes, u64 flags,
2257                                 int is_data)
2258 {
2259         struct btrfs_delayed_extent_op *extent_op;
2260         int ret;
2261
2262         extent_op = kmalloc(sizeof(*extent_op), GFP_NOFS);
2263         if (!extent_op)
2264                 return -ENOMEM;
2265
2266         extent_op->flags_to_set = flags;
2267         extent_op->update_flags = 1;
2268         extent_op->update_key = 0;
2269         extent_op->is_data = is_data ? 1 : 0;
2270
2271         ret = btrfs_add_delayed_extent_op(trans, bytenr, num_bytes, extent_op);
2272         if (ret)
2273                 kfree(extent_op);
2274         return ret;
2275 }
2276
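/*
 * check whether the delayed refs still pending for @bytenr are compatible
 * with (@root, @objectid, @offset) being the sole owner.  Returns 0 when
 * the only pending ref is our own, 1 when some other ref is pending,
 * -ENOENT when nothing relevant is pending, and -EAGAIN when the head
 * mutex could not be taken and the caller should retry.
 */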
2277 static noinline int check_delayed_ref(struct btrfs_trans_handle *trans,
2278                                       struct btrfs_root *root,
2279                                       struct btrfs_path *path,
2280                                       u64 objectid, u64 offset, u64 bytenr)
2281 {
2282         struct btrfs_delayed_ref_head *head;
2283         struct btrfs_delayed_ref_node *ref;
2284         struct btrfs_delayed_data_ref *data_ref;
2285         struct btrfs_delayed_ref_root *delayed_refs;
2286         struct rb_node *node;
2287         int ret = 0;
2288
2289         ret = -ENOENT;
2290         delayed_refs = &trans->transaction->delayed_refs;
2291         spin_lock(&delayed_refs->lock);
2292         head = btrfs_find_delayed_ref_head(trans, bytenr);
2293         if (!head)
2294                 goto out;
2295
2296         if (!mutex_trylock(&head->mutex)) {
2297                 atomic_inc(&head->node.refs);
2298                 spin_unlock(&delayed_refs->lock);
2299
2300                 btrfs_release_path(root->fs_info->extent_root, path);
2301
2302                 mutex_lock(&head->mutex);
2303                 mutex_unlock(&head->mutex);
2304                 btrfs_put_delayed_ref(&head->node);
2305                 return -EAGAIN;
2306         }
2307
2308         node = rb_prev(&head->node.rb_node);
2309         if (!node)
2310                 goto out_unlock;
2311
2312         ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node);
2313
2314         if (ref->bytenr != bytenr)
2315                 goto out_unlock;
2316
2317         ret = 1;
2318         if (ref->type != BTRFS_EXTENT_DATA_REF_KEY)
2319                 goto out_unlock;
2320
2321         data_ref = btrfs_delayed_node_to_data_ref(ref);
2322
2323         node = rb_prev(node);
2324         if (node) {
2325                 ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node);
2326                 if (ref->bytenr == bytenr)
2327                         goto out_unlock;
2328         }
2329
2330         if (data_ref->root != root->root_key.objectid ||
2331             data_ref->objectid != objectid || data_ref->offset != offset)
2332                 goto out_unlock;
2333
2334         ret = 0;
2335 out_unlock:
2336         mutex_unlock(&head->mutex);
2337 out:
2338         spin_unlock(&delayed_refs->lock);
2339         return ret;
2340 }
2341
2342 static noinline int check_committed_ref(struct btrfs_trans_handle *trans,
2343                                         struct btrfs_root *root,
2344                                         struct btrfs_path *path,
2345                                         u64 objectid, u64 offset, u64 bytenr)
2346 {
2347         struct btrfs_root *extent_root = root->fs_info->extent_root;
2348         struct extent_buffer *leaf;
2349         struct btrfs_extent_data_ref *ref;
2350         struct btrfs_extent_inline_ref *iref;
2351         struct btrfs_extent_item *ei;
2352         struct btrfs_key key;
2353         u32 item_size;
2354         int ret;
2355
2356         key.objectid = bytenr;
2357         key.offset = (u64)-1;
2358         key.type = BTRFS_EXTENT_ITEM_KEY;
2359
2360         ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
2361         if (ret < 0)
2362                 goto out;
2363         BUG_ON(ret == 0);
2364
2365         ret = -ENOENT;
2366         if (path->slots[0] == 0)
2367                 goto out;
2368
2369         path->slots[0]--;
2370         leaf = path->nodes[0];
2371         btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
2372
2373         if (key.objectid != bytenr || key.type != BTRFS_EXTENT_ITEM_KEY)
2374                 goto out;
2375
2376         ret = 1;
2377         item_size = btrfs_item_size_nr(leaf, path->slots[0]);
2378 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
2379         if (item_size < sizeof(*ei)) {
2380                 WARN_ON(item_size != sizeof(struct btrfs_extent_item_v0));
2381                 goto out;
2382         }
2383 #endif
2384         ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
2385
2386         if (item_size != sizeof(*ei) +
2387             btrfs_extent_inline_ref_size(BTRFS_EXTENT_DATA_REF_KEY))
2388                 goto out;
2389
2390         if (btrfs_extent_generation(leaf, ei) <=
2391             btrfs_root_last_snapshot(&root->root_item))
2392                 goto out;
2393
2394         iref = (struct btrfs_extent_inline_ref *)(ei + 1);
2395         if (btrfs_extent_inline_ref_type(leaf, iref) !=
2396             BTRFS_EXTENT_DATA_REF_KEY)
2397                 goto out;
2398
2399         ref = (struct btrfs_extent_data_ref *)(&iref->offset);
2400         if (btrfs_extent_refs(leaf, ei) !=
2401             btrfs_extent_data_ref_count(leaf, ref) ||
2402             btrfs_extent_data_ref_root(leaf, ref) !=
2403             root->root_key.objectid ||
2404             btrfs_extent_data_ref_objectid(leaf, ref) != objectid ||
2405             btrfs_extent_data_ref_offset(leaf, ref) != offset)
2406                 goto out;
2407
2408         ret = 0;
2409 out:
2410         return ret;
2411 }
2412
2413 int btrfs_cross_ref_exist(struct btrfs_trans_handle *trans,
2414                           struct btrfs_root *root,
2415                           u64 objectid, u64 offset, u64 bytenr)
2416 {
2417         struct btrfs_path *path;
2418         int ret;
2419         int ret2;
2420
2421         path = btrfs_alloc_path();
2422         if (!path)
2423                 return -ENOENT;
2424
2425         do {
2426                 ret = check_committed_ref(trans, root, path, objectid,
2427                                           offset, bytenr);
2428                 if (ret && ret != -ENOENT)
2429                         goto out;
2430
2431                 ret2 = check_delayed_ref(trans, root, path, objectid,
2432                                          offset, bytenr);
2433         } while (ret2 == -EAGAIN);
2434
2435         if (ret2 && ret2 != -ENOENT) {
2436                 ret = ret2;
2437                 goto out;
2438         }
2439
2440         if (ret != -ENOENT || ret2 != -ENOENT)
2441                 ret = 0;
2442 out:
2443         btrfs_free_path(path);
2444         if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID)
2445                 WARN_ON(ret > 0);
2446         return ret;
2447 }
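
/*
 * Illustrative caller pattern (dead code; the function name and the idea
 * of the caller are placeholders): only rewrite an extent in place when
 * the file is provably its sole owner.
 */
#if 0
static int may_overwrite_in_place(struct btrfs_trans_handle *trans,
                                  struct btrfs_root *root,
                                  u64 ino, u64 offset, u64 bytenr)
{
        /* nonzero means a cross reference exists, or we cannot tell */
        if (btrfs_cross_ref_exist(trans, root, ino, offset, bytenr))
                return 0;       /* shared: must COW */
        return 1;               /* exclusive: safe to rewrite in place */
}
#endif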
2448
2449 #if 0
2450 int btrfs_cache_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
2451                     struct extent_buffer *buf, u32 nr_extents)
2452 {
2453         struct btrfs_key key;
2454         struct btrfs_file_extent_item *fi;
2455         u64 root_gen;
2456         u32 nritems;
2457         int i;
2458         int level;
2459         int ret = 0;
2460         int shared = 0;
2461
2462         if (!root->ref_cows)
2463                 return 0;
2464
2465         if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) {
2466                 shared = 0;
2467                 root_gen = root->root_key.offset;
2468         } else {
2469                 shared = 1;
2470                 root_gen = trans->transid - 1;
2471         }
2472
2473         level = btrfs_header_level(buf);
2474         nritems = btrfs_header_nritems(buf);
2475
2476         if (level == 0) {
2477                 struct btrfs_leaf_ref *ref;
2478                 struct btrfs_extent_info *info;
2479
2480                 ref = btrfs_alloc_leaf_ref(root, nr_extents);
2481                 if (!ref) {
2482                         ret = -ENOMEM;
2483                         goto out;
2484                 }
2485
2486                 ref->root_gen = root_gen;
2487                 ref->bytenr = buf->start;
2488                 ref->owner = btrfs_header_owner(buf);
2489                 ref->generation = btrfs_header_generation(buf);
2490                 ref->nritems = nr_extents;
2491                 info = ref->extents;
2492
2493                 for (i = 0; nr_extents > 0 && i < nritems; i++) {
2494                         u64 disk_bytenr;
2495                         btrfs_item_key_to_cpu(buf, &key, i);
2496                         if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY)
2497                                 continue;
2498                         fi = btrfs_item_ptr(buf, i,
2499                                             struct btrfs_file_extent_item);
2500                         if (btrfs_file_extent_type(buf, fi) ==
2501                             BTRFS_FILE_EXTENT_INLINE)
2502                                 continue;
2503                         disk_bytenr = btrfs_file_extent_disk_bytenr(buf, fi);
2504                         if (disk_bytenr == 0)
2505                                 continue;
2506
2507                         info->bytenr = disk_bytenr;
2508                         info->num_bytes =
2509                                 btrfs_file_extent_disk_num_bytes(buf, fi);
2510                         info->objectid = key.objectid;
2511                         info->offset = key.offset;
2512                         info++;
2513                 }
2514
2515                 ret = btrfs_add_leaf_ref(root, ref, shared);
2516                 if (ret == -EEXIST && shared) {
2517                         struct btrfs_leaf_ref *old;
2518                         old = btrfs_lookup_leaf_ref(root, ref->bytenr);
2519                         BUG_ON(!old);
2520                         btrfs_remove_leaf_ref(root, old);
2521                         btrfs_free_leaf_ref(root, old);
2522                         ret = btrfs_add_leaf_ref(root, ref, shared);
2523                 }
2524                 WARN_ON(ret);
2525                 btrfs_free_leaf_ref(root, ref);
2526         }
2527 out:
2528         return ret;
2529 }
2530
2531 /* when a block goes through cow, we update the reference counts of
2532  * everything that block points to.  The internal pointers of the block
2533  * can be in just about any order, and it is likely to have clusters of
2534  * things that are close together and clusters of things that are not.
2535  *
2536  * To help reduce the seeks that come with updating all of these reference
2537  * counts, sort them by byte number before actual updates are done.
2538  *
2539  * struct refsort is used to match byte number to slot in the btree block.
2540  * we sort based on the byte number and then use the slot to actually
2541  * find the item.
2542  *
2543  * struct refsort is smaller than struct btrfs_item and smaller than
2544  * struct btrfs_key_ptr.  Since we're currently limited to the page size
2545  * for a btree block, there's no way for a kmalloc of refsorts for a
2546  * single node to be bigger than a page.
2547  */
2548 struct refsort {
2549         u64 bytenr;
2550         u32 slot;
2551 };
2552
2553 /*
2554  * for passing into sort()
2555  */
2556 static int refsort_cmp(const void *a_void, const void *b_void)
2557 {
2558         const struct refsort *a = a_void;
2559         const struct refsort *b = b_void;
2560
2561         if (a->bytenr < b->bytenr)
2562                 return -1;
2563         if (a->bytenr > b->bytenr)
2564                 return 1;
2565         return 0;
2566 }
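
/*
 * illustrative use of the helpers above with sort() from linux/sort.h
 * (dead code like the rest of this block):
 */
static void sort_refs_by_bytenr(struct refsort *sorted, u32 refi)
{
        sort(sorted, refi, sizeof(struct refsort), refsort_cmp, NULL);
}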
2567 #endif
2568
2569 static int __btrfs_mod_ref(struct btrfs_trans_handle *trans,
2570                            struct btrfs_root *root,
2571                            struct extent_buffer *buf,
2572                            int full_backref, int inc)
2573 {
2574         u64 bytenr;
2575         u64 num_bytes;
2576         u64 parent;
2577         u64 ref_root;
2578         u32 nritems;
2579         struct btrfs_key key;
2580         struct btrfs_file_extent_item *fi;
2581         int i;
2582         int level;
2583         int ret = 0;
2584         int (*process_func)(struct btrfs_trans_handle *, struct btrfs_root *,
2585                             u64, u64, u64, u64, u64, u64);
2586
2587         ref_root = btrfs_header_owner(buf);
2588         nritems = btrfs_header_nritems(buf);
2589         level = btrfs_header_level(buf);
2590
2591         if (!root->ref_cows && level == 0)
2592                 return 0;
2593
2594         if (inc)
2595                 process_func = btrfs_inc_extent_ref;
2596         else
2597                 process_func = btrfs_free_extent;
2598
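        /*
         * full back refs name the immediate parent block (this buffer);
         * normal back refs name only the owning tree
         */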
2599         if (full_backref)
2600                 parent = buf->start;
2601         else
2602                 parent = 0;
2603
2604         for (i = 0; i < nritems; i++) {
2605                 if (level == 0) {
2606                         btrfs_item_key_to_cpu(buf, &key, i);
2607                         if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY)
2608                                 continue;
2609                         fi = btrfs_item_ptr(buf, i,
2610                                             struct btrfs_file_extent_item);
2611                         if (btrfs_file_extent_type(buf, fi) ==
2612                             BTRFS_FILE_EXTENT_INLINE)
2613                                 continue;
2614                         bytenr = btrfs_file_extent_disk_bytenr(buf, fi);
2615                         if (bytenr == 0)
2616                                 continue;
2617
2618                         num_bytes = btrfs_file_extent_disk_num_bytes(buf, fi);
2619                         key.offset -= btrfs_file_extent_offset(buf, fi);
2620                         ret = process_func(trans, root, bytenr, num_bytes,
2621                                            parent, ref_root, key.objectid,
2622                                            key.offset);
2623                         if (ret)
2624                                 goto fail;
2625                 } else {
2626                         bytenr = btrfs_node_blockptr(buf, i);
2627                         num_bytes = btrfs_level_size(root, level - 1);
2628                         ret = process_func(trans, root, bytenr, num_bytes,
2629                                            parent, ref_root, level - 1, 0);
2630                         if (ret)
2631                                 goto fail;
2632                 }
2633         }
2634         return 0;
2635 fail:
2636         BUG();
2637         return ret;
2638 }
2639
2640 int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
2641                   struct extent_buffer *buf, int full_backref)
2642 {
2643         return __btrfs_mod_ref(trans, root, buf, full_backref, 1);
2644 }
2645
2646 int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
2647                   struct extent_buffer *buf, int full_backref)
2648 {
2649         return __btrfs_mod_ref(trans, root, buf, full_backref, 0);
2650 }
2651
2652 static int write_one_cache_group(struct btrfs_trans_handle *trans,
2653                                  struct btrfs_root *root,
2654                                  struct btrfs_path *path,
2655                                  struct btrfs_block_group_cache *cache)
2656 {
2657         int ret;
2658         struct btrfs_root *extent_root = root->fs_info->extent_root;
2659         unsigned long bi;
2660         struct extent_buffer *leaf;
2661
2662         ret = btrfs_search_slot(trans, extent_root, &cache->key, path, 0, 1);
2663         if (ret < 0)
2664                 goto fail;
2665         BUG_ON(ret);
2666
2667         leaf = path->nodes[0];
2668         bi = btrfs_item_ptr_offset(leaf, path->slots[0]);
2669         write_extent_buffer(leaf, &cache->item, bi, sizeof(cache->item));
2670         btrfs_mark_buffer_dirty(leaf);
2671         btrfs_release_path(extent_root, path);
2672 fail:
2673         return ret;
2677 }
2678
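/*
 * iterator helper: drops the caller's reference on @cache and returns the
 * next block group in the cache tree with a reference already held, or
 * NULL past the end.  See the lookup/advance loop in
 * btrfs_write_dirty_block_groups() below for the usage pattern.
 */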
2679 static struct btrfs_block_group_cache *
2680 next_block_group(struct btrfs_root *root,
2681                  struct btrfs_block_group_cache *cache)
2682 {
2683         struct rb_node *node;
2684         spin_lock(&root->fs_info->block_group_cache_lock);
2685         node = rb_next(&cache->cache_node);
2686         btrfs_put_block_group(cache);
2687         if (node) {
2688                 cache = rb_entry(node, struct btrfs_block_group_cache,
2689                                  cache_node);
2690                 btrfs_get_block_group(cache);
2691         } else
2692                 cache = NULL;
2693         spin_unlock(&root->fs_info->block_group_cache_lock);
2694         return cache;
2695 }
2696
2697 int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
2698                                    struct btrfs_root *root)
2699 {
2700         struct btrfs_block_group_cache *cache;
2701         int err = 0;
2702         struct btrfs_path *path;
2703         u64 last = 0;
2704
2705         path = btrfs_alloc_path();
2706         if (!path)
2707                 return -ENOMEM;
2708
2709         while (1) {
2710                 if (last == 0) {
2711                         err = btrfs_run_delayed_refs(trans, root,
2712                                                      (unsigned long)-1);
2713                         BUG_ON(err);
2714                 }
2715
2716                 cache = btrfs_lookup_first_block_group(root->fs_info, last);
2717                 while (cache) {
2718                         if (cache->dirty)
2719                                 break;
2720                         cache = next_block_group(root, cache);
2721                 }
2722                 if (!cache) {
2723                         if (last == 0)
2724                                 break;
2725                         last = 0;
2726                         continue;
2727                 }
2728
2729                 cache->dirty = 0;
2730                 last = cache->key.objectid + cache->key.offset;
2731
2732                 err = write_one_cache_group(trans, root, path, cache);
2733                 BUG_ON(err);
2734                 btrfs_put_block_group(cache);
2735         }
2736
2737         btrfs_free_path(path);
2738         return 0;
2739 }
2740
2741 int btrfs_extent_readonly(struct btrfs_root *root, u64 bytenr)
2742 {
2743         struct btrfs_block_group_cache *block_group;
2744         int readonly = 0;
2745
2746         block_group = btrfs_lookup_block_group(root->fs_info, bytenr);
2747         if (!block_group || block_group->ro)
2748                 readonly = 1;
2749         if (block_group)
2750                 btrfs_put_block_group(block_group);
2751         return readonly;
2752 }
2753
2754 static int update_space_info(struct btrfs_fs_info *info, u64 flags,
2755                              u64 total_bytes, u64 bytes_used,
2756                              struct btrfs_space_info **space_info)
2757 {
2758         struct btrfs_space_info *found;
2759         int i;
2760         int factor;
2761
2762         if (flags & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1 |
2763                      BTRFS_BLOCK_GROUP_RAID10))
2764                 factor = 2;
2765         else
2766                 factor = 1;
2767
2768         found = __find_space_info(info, flags);
2769         if (found) {
2770                 spin_lock(&found->lock);
2771                 found->total_bytes += total_bytes;
2772                 found->bytes_used += bytes_used;
2773                 found->disk_used += bytes_used * factor;
2774                 found->full = 0;
2775                 spin_unlock(&found->lock);
2776                 *space_info = found;
2777                 return 0;
2778         }
2779         found = kzalloc(sizeof(*found), GFP_NOFS);
2780         if (!found)
2781                 return -ENOMEM;
2782
2783         for (i = 0; i < BTRFS_NR_RAID_TYPES; i++)
2784                 INIT_LIST_HEAD(&found->block_groups[i]);
2785         init_rwsem(&found->groups_sem);
2786         spin_lock_init(&found->lock);
2787         found->flags = flags & (BTRFS_BLOCK_GROUP_DATA |
2788                                 BTRFS_BLOCK_GROUP_SYSTEM |
2789                                 BTRFS_BLOCK_GROUP_METADATA);
2790         found->total_bytes = total_bytes;
2791         found->bytes_used = bytes_used;
2792         found->disk_used = bytes_used * factor;
2793         found->bytes_pinned = 0;
2794         found->bytes_reserved = 0;
2795         found->bytes_readonly = 0;
2796         found->bytes_may_use = 0;
2797         found->full = 0;
2798         found->force_alloc = 0;
2799         *space_info = found;
2800         list_add_rcu(&found->list, &info->space_info);
2801         atomic_set(&found->caching_threads, 0);
2802         return 0;
2803 }
2804
2805 static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
2806 {
2807         u64 extra_flags = flags & (BTRFS_BLOCK_GROUP_RAID0 |
2808                                    BTRFS_BLOCK_GROUP_RAID1 |
2809                                    BTRFS_BLOCK_GROUP_RAID10 |
2810                                    BTRFS_BLOCK_GROUP_DUP);
2811         if (extra_flags) {
2812                 if (flags & BTRFS_BLOCK_GROUP_DATA)
2813                         fs_info->avail_data_alloc_bits |= extra_flags;
2814                 if (flags & BTRFS_BLOCK_GROUP_METADATA)
2815                         fs_info->avail_metadata_alloc_bits |= extra_flags;
2816                 if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
2817                         fs_info->avail_system_alloc_bits |= extra_flags;
2818         }
2819 }
2820
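/*
 * strip out profiles the current rw device count cannot satisfy, then
 * resolve conflicting flags by redundancy: RAID10 wins over RAID1,
 * both win over DUP, and any of them wins over RAID0.
 */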
2821 u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags)
2822 {
2823         u64 num_devices = root->fs_info->fs_devices->rw_devices;
2824
2825         if (num_devices == 1)
2826                 flags &= ~(BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID0);
2827         if (num_devices < 4)
2828                 flags &= ~BTRFS_BLOCK_GROUP_RAID10;
2829
2830         if ((flags & BTRFS_BLOCK_GROUP_DUP) &&
2831             (flags & (BTRFS_BLOCK_GROUP_RAID1 |
2832                       BTRFS_BLOCK_GROUP_RAID10))) {
2833                 flags &= ~BTRFS_BLOCK_GROUP_DUP;
2834         }
2835
2836         if ((flags & BTRFS_BLOCK_GROUP_RAID1) &&
2837             (flags & BTRFS_BLOCK_GROUP_RAID10)) {
2838                 flags &= ~BTRFS_BLOCK_GROUP_RAID1;
2839         }
2840
2841         if ((flags & BTRFS_BLOCK_GROUP_RAID0) &&
2842             ((flags & BTRFS_BLOCK_GROUP_RAID1) |
2843              (flags & BTRFS_BLOCK_GROUP_RAID10) |
2844              (flags & BTRFS_BLOCK_GROUP_DUP)))
2845                 flags &= ~BTRFS_BLOCK_GROUP_RAID0;
2846         return flags;
2847 }
2848
2849 static u64 get_alloc_profile(struct btrfs_root *root, u64 flags)
2850 {
2851         if (flags & BTRFS_BLOCK_GROUP_DATA)
2852                 flags |= root->fs_info->avail_data_alloc_bits &
2853                          root->fs_info->data_alloc_profile;
2854         else if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
2855                 flags |= root->fs_info->avail_system_alloc_bits &
2856                          root->fs_info->system_alloc_profile;
2857         else if (flags & BTRFS_BLOCK_GROUP_METADATA)
2858                 flags |= root->fs_info->avail_metadata_alloc_bits &
2859                          root->fs_info->metadata_alloc_profile;
2860         return btrfs_reduce_alloc_profile(root, flags);
2861 }
2862
2863 static u64 btrfs_get_alloc_profile(struct btrfs_root *root, int data)
2864 {
2865         u64 flags;
2866
2867         if (data)
2868                 flags = BTRFS_BLOCK_GROUP_DATA;
2869         else if (root == root->fs_info->chunk_root)
2870                 flags = BTRFS_BLOCK_GROUP_SYSTEM;
2871         else
2872                 flags = BTRFS_BLOCK_GROUP_METADATA;
2873
2874         return get_alloc_profile(root, flags);
2875 }
2876
2877 void btrfs_set_inode_space_info(struct btrfs_root *root, struct inode *inode)
2878 {
2879         BTRFS_I(inode)->space_info = __find_space_info(root->fs_info,
2880                                                        BTRFS_BLOCK_GROUP_DATA);
2881 }
2882
2883 static u64 calculate_bytes_needed(struct btrfs_root *root, int num_items)
2884 {
2885         u64 num_bytes;
2886         int level;
2887
2888         level = BTRFS_MAX_LEVEL - 2;
2889         /*
2890          * NOTE: these calculations are absolutely the worst possible case.
2891          * This assumes that _every_ item we insert will require a new leaf, and
2892          * that the tree has grown to its maximum possible height.
2893          */
2894
2895         /*
2896          * for every item we insert we could insert both an extent item and an
2897          * extent ref item.  Then for every item we insert, we will need to cow
2898          * both the original leaf, plus the leaves to the left and right of it.
2899          *
2900          * Unless we are talking about the extent root, then we just want the
2901          * number of items * 2, since we just need the extent item plus its ref.
2902          */
2903         if (root == root->fs_info->extent_root)
2904                 num_bytes = num_items * 2;
2905         else
2906                 num_bytes = (num_items + (2 * num_items)) * 3;
2907
2908         /*
2909          * num_bytes is total number of leaves we could need times the leaf
2910          * size, and then for every leaf we could end up cow'ing 2 nodes per
2911          * level, down to the leaf level.
2912          */
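        /*
         * Illustrative worst case (assuming 4K leaf/node sizes and
         * BTRFS_MAX_LEVEL == 8, so level == 6): one item on a
         * non-extent root needs (1 + 2) * 3 = 9 leaves, giving
         * 9 * 4K for the leaves plus 9 * (6 * 2) * 4K for the cowed
         * nodes -- about 468K reserved for a single item.
         */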
2913         num_bytes = (num_bytes * root->leafsize) +
2914                 (num_bytes * (level * 2)) * root->nodesize;
2915
2916         return num_bytes;
2917 }
2918
2919 /*
2920  * Unreserve metadata space for delalloc.  If we have fewer reserved credits
2921  * than we have extents, this function does nothing.
2922  */
2923 int btrfs_unreserve_metadata_for_delalloc(struct btrfs_root *root,
2924                                           struct inode *inode, int num_items)
2925 {
2926         struct btrfs_fs_info *info = root->fs_info;
2927         struct btrfs_space_info *meta_sinfo;
2928         u64 num_bytes;
2929         u64 alloc_target;
2930         bool bug = false;
2931
2932         /* get the space info for where the metadata will live */
2933         alloc_target = btrfs_get_alloc_profile(root, 0);
2934         meta_sinfo = __find_space_info(info, alloc_target);
2935
2936         num_bytes = calculate_bytes_needed(root->fs_info->extent_root,
2937                                            num_items);
2938
2939         spin_lock(&meta_sinfo->lock);
2940         spin_lock(&BTRFS_I(inode)->accounting_lock);
2941         if (BTRFS_I(inode)->reserved_extents <=
2942             BTRFS_I(inode)->outstanding_extents) {
2943                 spin_unlock(&BTRFS_I(inode)->accounting_lock);
2944                 spin_unlock(&meta_sinfo->lock);
2945                 return 0;
2946         }
2947         spin_unlock(&BTRFS_I(inode)->accounting_lock);
2948
2949         BTRFS_I(inode)->reserved_extents -= num_items;
2950         BUG_ON(BTRFS_I(inode)->reserved_extents < 0);
2951
2952         if (meta_sinfo->bytes_delalloc < num_bytes) {
2953                 bug = true;
2954                 meta_sinfo->bytes_delalloc = 0;
2955         } else {
2956                 meta_sinfo->bytes_delalloc -= num_bytes;
2957         }
2958         spin_unlock(&meta_sinfo->lock);
2959
2960         BUG_ON(bug);
2961
2962         return 0;
2963 }
2964
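/*
 * flip force_delalloc once delalloc reservations consume 80% of the
 * metadata space not already claimed by other users, so writers start
 * flushing instead of piling up more reservations.
 */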
2965 static void check_force_delalloc(struct btrfs_space_info *meta_sinfo)
2966 {
2967         u64 thresh;
2968
2969         thresh = meta_sinfo->bytes_used + meta_sinfo->bytes_reserved +
2970                 meta_sinfo->bytes_pinned + meta_sinfo->bytes_readonly +
2971                 meta_sinfo->bytes_super + meta_sinfo->bytes_root +
2972                 meta_sinfo->bytes_may_use;
2973
2974         thresh = meta_sinfo->total_bytes - thresh;
2975         thresh *= 80;
2976         do_div(thresh, 100);
2977         if (thresh <= meta_sinfo->bytes_delalloc)
2978                 meta_sinfo->force_delalloc = 1;
2979         else
2980                 meta_sinfo->force_delalloc = 0;
2981 }
2982
2983 /*
2984  * Reserve metadata space for delalloc.
2985  */
2986 int btrfs_reserve_metadata_for_delalloc(struct btrfs_root *root,
2987                                         struct inode *inode, int num_items)
2988 {
2989         struct btrfs_fs_info *info = root->fs_info;
2990         struct btrfs_space_info *meta_sinfo;
2991         u64 num_bytes;
2992         u64 used;
2993         u64 alloc_target;
2994         int flushed = 0;
2995         int force_delalloc;
2996
2997         /* get the space info for where the metadata will live */
2998         alloc_target = btrfs_get_alloc_profile(root, 0);
2999         meta_sinfo = __find_space_info(info, alloc_target);
3000
3001         num_bytes = calculate_bytes_needed(root->fs_info->extent_root,
3002                                            num_items);
3003 again:
3004         spin_lock(&meta_sinfo->lock);
3005
3006         force_delalloc = meta_sinfo->force_delalloc;
3007
3008         if (unlikely(!meta_sinfo->bytes_root))
3009                 meta_sinfo->bytes_root = calculate_bytes_needed(root, 6);
3010
3011         if (!flushed)
3012                 meta_sinfo->bytes_delalloc += num_bytes;
3013
3014         used = meta_sinfo->bytes_used + meta_sinfo->bytes_reserved +
3015                 meta_sinfo->bytes_pinned + meta_sinfo->bytes_readonly +
3016                 meta_sinfo->bytes_super + meta_sinfo->bytes_root +
3017                 meta_sinfo->bytes_may_use + meta_sinfo->bytes_delalloc;
3018
3019         if (used > meta_sinfo->total_bytes) {
3020                 flushed++;
3021
3022                 if (flushed == 1) {
3023                         if (maybe_allocate_chunk(NULL, root, meta_sinfo,
3024                                                  num_bytes))
3025                                 goto again;
3026                         flushed++;
3027                 } else {
3028                         spin_unlock(&meta_sinfo->lock);
3029                 }
3030
3031                 if (flushed == 2) {
3032                         filemap_flush(inode->i_mapping);
3033                         goto again;
3034                 } else if (flushed == 3) {
3035                         shrink_delalloc(NULL, root, meta_sinfo, num_bytes);
3036                         goto again;
3037                 }
3038                 spin_lock(&meta_sinfo->lock);
3039                 meta_sinfo->bytes_delalloc -= num_bytes;
3040                 spin_unlock(&meta_sinfo->lock);
3041                 printk(KERN_ERR "enospc, has %d, reserved %d\n",
3042                        BTRFS_I(inode)->outstanding_extents,
3043                        BTRFS_I(inode)->reserved_extents);
3044                 dump_space_info(meta_sinfo, 0, 0);
3045                 return -ENOSPC;
3046         }
3047
3048         BTRFS_I(inode)->reserved_extents += num_items;
3049         check_force_delalloc(meta_sinfo);
3050         spin_unlock(&meta_sinfo->lock);
3051
3052         if (!flushed && force_delalloc)
3053                 filemap_flush(inode->i_mapping);
3054
3055         return 0;
3056 }
3057
3058 /*
3059  * This will check the space that the inode allocates from to make sure we have
3060  * enough space for 'bytes' worth of data.
3061  */
3062 int btrfs_check_data_free_space(struct btrfs_root *root, struct inode *inode,
3063                                 u64 bytes)
3064 {
3065         struct btrfs_space_info *data_sinfo;
3066         u64 used;
3067         int ret = 0, committed = 0;
3068
3069         /* make sure bytes are sectorsize aligned */
3070         bytes = (bytes + root->sectorsize - 1) & ~((u64)root->sectorsize - 1);
3071
3072         data_sinfo = BTRFS_I(inode)->space_info;
3073         if (!data_sinfo)
3074                 goto alloc;
3075
3076 again:
3077         /* make sure we have enough space to handle the data first */
3078         spin_lock(&data_sinfo->lock);
3079         used = data_sinfo->bytes_used + data_sinfo->bytes_delalloc +
3080                 data_sinfo->bytes_reserved + data_sinfo->bytes_pinned +
3081                 data_sinfo->bytes_readonly + data_sinfo->bytes_may_use +
3082                 data_sinfo->bytes_super;
3083
3084         if (used + bytes > data_sinfo->total_bytes) {
3085                 struct btrfs_trans_handle *trans;
3086
3087                 /*
3088                  * if we don't have enough free bytes in this space then we need
3089                  * to alloc a new chunk.
3090                  */
3091                 if (!data_sinfo->full) {
3092                         u64 alloc_target;
3093
3094                         data_sinfo->force_alloc = 1;
3095                         spin_unlock(&data_sinfo->lock);
3096 alloc:
3097                         alloc_target = btrfs_get_alloc_profile(root, 1);
3098                         trans = btrfs_join_transaction(root, 1);
3099                         if (IS_ERR(trans))
3100                                 return PTR_ERR(trans);
3101
3102                         ret = do_chunk_alloc(trans, root->fs_info->extent_root,
3103                                              bytes + 2 * 1024 * 1024,
3104                                              alloc_target, 0);
3105                         btrfs_end_transaction(trans, root);
3106                         if (ret)
3107                                 return ret;
3108
3109                         if (!data_sinfo) {
3110                                 btrfs_set_inode_space_info(root, inode);
3111                                 data_sinfo = BTRFS_I(inode)->space_info;
3112                         }
3113                         goto again;
3114                 }
3115                 spin_unlock(&data_sinfo->lock);
3116
3117                 /* commit the current transaction and try again */
3118                 if (!committed && !root->fs_info->open_ioctl_trans) {
3119                         committed = 1;
3120                         trans = btrfs_join_transaction(root, 1);
3121                         if (IS_ERR(trans))
3122                                 return PTR_ERR(trans);
3123                         ret = btrfs_commit_transaction(trans, root);
3124                         if (ret)
3125                                 return ret;
3126                         goto again;
3127                 }
3128
3129                 printk(KERN_ERR "no space left, need %llu, %llu delalloc bytes"
3130                        ", %llu bytes_used, %llu bytes_reserved, "
3131                        "%llu bytes_pinned, %llu bytes_readonly, %llu may use "
3132                        "%llu total\n", (unsigned long long)bytes,
3133                        (unsigned long long)data_sinfo->bytes_delalloc,
3134                        (unsigned long long)data_sinfo->bytes_used,
3135                        (unsigned long long)data_sinfo->bytes_reserved,
3136                        (unsigned long long)data_sinfo->bytes_pinned,
3137                        (unsigned long long)data_sinfo->bytes_readonly,
3138                        (unsigned long long)data_sinfo->bytes_may_use,
3139                        (unsigned long long)data_sinfo->total_bytes);
3140                 return -ENOSPC;
3141         }
3142         data_sinfo->bytes_may_use += bytes;
3143         BTRFS_I(inode)->reserved_bytes += bytes;
3144         spin_unlock(&data_sinfo->lock);
3145
3146         return 0;
3147 }
3148
3149 /*
3150  * if there was an error for whatever reason after calling
3151  * btrfs_check_data_free_space, call this so we can clean up the counters.
3152  */
3153 void btrfs_free_reserved_data_space(struct btrfs_root *root,
3154                                     struct inode *inode, u64 bytes)
3155 {
3156         struct btrfs_space_info *data_sinfo;
3157
3158         /* make sure bytes are sectorsize aligned */
3159         bytes = (bytes + root->sectorsize - 1) & ~((u64)root->sectorsize - 1);
3160
3161         data_sinfo = BTRFS_I(inode)->space_info;
3162         spin_lock(&data_sinfo->lock);
3163         data_sinfo->bytes_may_use -= bytes;
3164         BTRFS_I(inode)->reserved_bytes -= bytes;
3165         spin_unlock(&data_sinfo->lock);
3166 }
3167
3168 /* called when we are adding a delalloc extent to the inode's io_tree */
3169 void btrfs_delalloc_reserve_space(struct btrfs_root *root, struct inode *inode,
3170                                   u64 bytes)
3171 {
3172         struct btrfs_space_info *data_sinfo;
3173
3174         /* get the space info for where this inode will be storing its data */
3175         data_sinfo = BTRFS_I(inode)->space_info;
3176
3177         /* make sure we have enough space to handle the data first */
3178         spin_lock(&data_sinfo->lock);
3179         data_sinfo->bytes_delalloc += bytes;
3180
3181         /*
3182          * we are adding a delalloc extent without calling
3183          * btrfs_check_data_free_space first.  This happens on a weird
3184          * writepage condition, but shouldn't hurt our accounting
3185          */
3186         if (unlikely(bytes > BTRFS_I(inode)->reserved_bytes)) {
3187                 data_sinfo->bytes_may_use -= BTRFS_I(inode)->reserved_bytes;
3188                 BTRFS_I(inode)->reserved_bytes = 0;
3189         } else {
3190                 data_sinfo->bytes_may_use -= bytes;
3191                 BTRFS_I(inode)->reserved_bytes -= bytes;
3192         }
3193
3194         spin_unlock(&data_sinfo->lock);
3195 }
3196
3197 /* called when we are clearing a delalloc extent from the inode's io_tree */
3198 void btrfs_delalloc_free_space(struct btrfs_root *root, struct inode *inode,
3199                               u64 bytes)
3200 {
3201         struct btrfs_space_info *info;
3202
3203         info = BTRFS_I(inode)->space_info;
3204
3205         spin_lock(&info->lock);
3206         info->bytes_delalloc -= bytes;
3207         spin_unlock(&info->lock);
3208 }
3209
3210 static void force_metadata_allocation(struct btrfs_fs_info *info)
3211 {
3212         struct list_head *head = &info->space_info;
3213         struct btrfs_space_info *found;
3214
3215         rcu_read_lock();
3216         list_for_each_entry_rcu(found, head, list) {
3217                 if (found->flags & BTRFS_BLOCK_GROUP_METADATA)
3218                         found->force_alloc = 1;
3219         }
3220         rcu_read_unlock();
3221 }
3222
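/*
 * only worth allocating a new chunk once the writable space is within
 * 256MB of full and more than 80% of it is already used or reserved.
 */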
3223 static int should_alloc_chunk(struct btrfs_space_info *sinfo,
3224                               u64 alloc_bytes)
3225 {
3226         u64 num_bytes = sinfo->total_bytes - sinfo->bytes_readonly;
3227
3228         if (sinfo->bytes_used + sinfo->bytes_reserved +
3229             alloc_bytes + 256 * 1024 * 1024 < num_bytes)
3230                 return 0;
3231
3232         if (sinfo->bytes_used + sinfo->bytes_reserved +
3233             alloc_bytes < div_factor(num_bytes, 8))
3234                 return 0;
3235
3236         return 1;
3237 }
3238
3239 static int do_chunk_alloc(struct btrfs_trans_handle *trans,
3240                           struct btrfs_root *extent_root, u64 alloc_bytes,
3241                           u64 flags, int force)
3242 {
3243         struct btrfs_space_info *space_info;
3244         struct btrfs_fs_info *fs_info = extent_root->fs_info;
3245         int ret = 0;
3246
3247         mutex_lock(&fs_info->chunk_mutex);
3248
3249         flags = btrfs_reduce_alloc_profile(extent_root, flags);
3250
3251         space_info = __find_space_info(extent_root->fs_info, flags);
3252         if (!space_info) {
3253                 ret = update_space_info(extent_root->fs_info, flags,
3254                                         0, 0, &space_info);
3255                 BUG_ON(ret);
3256         }
3257         BUG_ON(!space_info);
3258
3259         spin_lock(&space_info->lock);
3260         if (space_info->force_alloc)
3261                 force = 1;
3262         if (space_info->full) {
3263                 spin_unlock(&space_info->lock);
3264                 goto out;
3265         }
3266
3267         if (!force && !should_alloc_chunk(space_info, alloc_bytes)) {
3268                 spin_unlock(&space_info->lock);
3269                 goto out;
3270         }
3271         spin_unlock(&space_info->lock);
3272
3273         /*
3274          * if we're doing a data chunk, go ahead and make sure that
3275          * we keep a reasonable number of metadata chunks allocated in the
3276          * FS as well.
3277          */
3278         if (flags & BTRFS_BLOCK_GROUP_DATA && fs_info->metadata_ratio) {
3279                 fs_info->data_chunk_allocations++;
3280                 if (!(fs_info->data_chunk_allocations %
3281                       fs_info->metadata_ratio))
3282                         force_metadata_allocation(fs_info);
3283         }
3284
3285         ret = btrfs_alloc_chunk(trans, extent_root, flags);
3286         spin_lock(&space_info->lock);
3287         if (ret)
3288                 space_info->full = 1;
3289         else
3290                 ret = 1;
3291         space_info->force_alloc = 0;
3292         spin_unlock(&space_info->lock);
3293 out:
3294         mutex_unlock(&extent_root->fs_info->chunk_mutex);
3295         return ret;
3296 }
3297
3298 static int maybe_allocate_chunk(struct btrfs_trans_handle *trans,
3299                                 struct btrfs_root *root,
3300                                 struct btrfs_space_info *sinfo, u64 num_bytes)
3301 {
3302         int ret;
3303         int end_trans = 0;
3304
3305         if (sinfo->full)
3306                 return 0;
3307
3308         spin_lock(&sinfo->lock);
3309         ret = should_alloc_chunk(sinfo, num_bytes + 2 * 1024 * 1024);
3310         spin_unlock(&sinfo->lock);
3311         if (!ret)
3312                 return 0;
3313
3314         if (!trans) {
3315                 trans = btrfs_join_transaction(root, 1);
3316                 BUG_ON(IS_ERR(trans));
3317                 end_trans = 1;
3318         }
3319
3320         ret = do_chunk_alloc(trans, root->fs_info->extent_root,
3321                              num_bytes + 2 * 1024 * 1024,
3322                              get_alloc_profile(root, sinfo->flags), 0);
3323
3324         if (end_trans)
3325                 btrfs_end_transaction(trans, root);
3326
3327         return ret == 1 ? 1 : 0;
3328 }
3329
3330 /*
3331  * shrink metadata reservation for delalloc
3332  */
3333 static int shrink_delalloc(struct btrfs_trans_handle *trans,
3334                            struct btrfs_root *root,
3335                            struct btrfs_space_info *sinfo, u64 to_reclaim)
3336 {
3337         u64 reserved;
3338         u64 max_reclaim;
3339         u64 reclaimed = 0;
3340         int pause = 1;
3341         int ret;
3342
3343         spin_lock(&sinfo->lock);
3344         reserved = sinfo->bytes_delalloc;
3345         spin_unlock(&sinfo->lock);
3346
3347         if (reserved == 0)
3348                 return 0;
3349
3350         max_reclaim = min(reserved, to_reclaim);
3351
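        /*
         * kick writeback on delalloc inodes one at a time; when
         * btrfs_start_one_delalloc_inode finds nothing to flush, back
         * off exponentially, capped at HZ/10 per sleep.
         */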
3352         while (1) {
3353                 ret = btrfs_start_one_delalloc_inode(root, trans ? 1 : 0);
3354                 if (!ret) {
3355                         __set_current_state(TASK_INTERRUPTIBLE);
3356                         schedule_timeout(pause);
3357                         pause <<= 1;
3358                         if (pause > HZ / 10)
3359                                 pause = HZ / 10;
3360                 } else {
3361                         pause = 1;
3362                 }
3363
3364                 spin_lock(&sinfo->lock);
3365                 if (reserved > sinfo->bytes_delalloc)
3366                         reclaimed = reserved - sinfo->bytes_delalloc;
3367                 reserved = sinfo->bytes_delalloc;
3368                 spin_unlock(&sinfo->lock);
3369
3370                 if (reserved == 0 || reclaimed >= max_reclaim)
3371                         break;
3372
3373                 if (trans && trans->transaction->blocked)
3374                         return -EAGAIN;
3375         }
3376         return reclaimed >= to_reclaim;
3377 }
3378
3379 static int should_retry_reserve(struct btrfs_trans_handle *trans,
3380                                 struct btrfs_root *root,
3381                                 struct btrfs_block_rsv *block_rsv,
3382                                 u64 num_bytes, int *retries)
3383 {
3384         struct btrfs_space_info *space_info = block_rsv->space_info;
3385         int ret;
3386
3387         if ((*retries) > 2)
3388                 return -ENOSPC;
3389
3390         ret = maybe_allocate_chunk(trans, root, space_info, num_bytes);
3391         if (ret)
3392                 return 1;
3393
3394         if (trans && trans->transaction->in_commit)
3395                 return -ENOSPC;
3396
3397         ret = shrink_delalloc(trans, root, space_info, num_bytes);
3398         if (ret)
3399                 return ret;
3400
3401         spin_lock(&space_info->lock);
3402         if (space_info->bytes_pinned < num_bytes)
3403                 ret = 1;
3404         spin_unlock(&space_info->lock);
3405         if (ret)
3406                 return -ENOSPC;
3407
3408         (*retries)++;
3409
3410         if (trans)
3411                 return -EAGAIN;
3412
3413         trans = btrfs_join_transaction(root, 1);
3414         BUG_ON(IS_ERR(trans));
3415         ret = btrfs_commit_transaction(trans, root);
3416         BUG_ON(ret);
3417
3418         return 1;
3419 }
3420
3421 static int reserve_metadata_bytes(struct btrfs_block_rsv *block_rsv,
3422                                   u64 num_bytes)
3423 {
3424         struct btrfs_space_info *space_info = block_rsv->space_info;
3425         u64 unused;
3426         int ret = -ENOSPC;
3427
3428         spin_lock(&space_info->lock);
3429         unused = space_info->bytes_used + space_info->bytes_reserved +
3430                  space_info->bytes_pinned + space_info->bytes_readonly;
3431
3432         if (unused < space_info->total_bytes)
3433                 unused = space_info->total_bytes - unused;
3434         else
3435                 unused = 0;
3436
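        /*
         * a priority >= 10 reservation may take any unused space;
         * lower priorities may only grow until they hold roughly
         * priority/10 of the space visible to them (their current
         * reservation plus whatever is unused).
         */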
3437         if (unused >= num_bytes) {
3438                 if (block_rsv->priority >= 10) {
3439                         space_info->bytes_reserved += num_bytes;
3440                         ret = 0;
3441                 } else {
3442                         if ((unused + block_rsv->reserved) *
3443                             block_rsv->priority >=
3444                             (num_bytes + block_rsv->reserved) * 10) {
3445                                 space_info->bytes_reserved += num_bytes;
3446                                 ret = 0;
3447                         }
3448                 }
3449         }
3450         spin_unlock(&space_info->lock);
3451
3452         return ret;
3453 }
3454
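/*
 * pick the reservation this root should draw from: snapshottable
 * (ref_cows) roots use the transaction's rsv, internal roots use
 * their own, and a missing rsv falls back to the empty placeholder.
 */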
3455 static struct btrfs_block_rsv *get_block_rsv(struct btrfs_trans_handle *trans,
3456                                              struct btrfs_root *root)
3457 {
3458         struct btrfs_block_rsv *block_rsv;
3459         if (root->ref_cows)
3460                 block_rsv = trans->block_rsv;
3461         else
3462                 block_rsv = root->block_rsv;
3463
3464         if (!block_rsv)
3465                 block_rsv = &root->fs_info->empty_block_rsv;
3466
3467         return block_rsv;
3468 }
3469
3470 static int block_rsv_use_bytes(struct btrfs_block_rsv *block_rsv,
3471                                u64 num_bytes)
3472 {
3473         int ret = -ENOSPC;
3474         spin_lock(&block_rsv->lock);
3475         if (block_rsv->reserved >= num_bytes) {
3476                 block_rsv->reserved -= num_bytes;
3477                 if (block_rsv->reserved < block_rsv->size)
3478                         block_rsv->full = 0;
3479                 ret = 0;
3480         }
3481         spin_unlock(&block_rsv->lock);
3482         return ret;
3483 }
3484
3485 static void block_rsv_add_bytes(struct btrfs_block_rsv *block_rsv,
3486                                 u64 num_bytes, int update_size)
3487 {
3488         spin_lock(&block_rsv->lock);
3489         block_rsv->reserved += num_bytes;
3490         if (update_size)
3491                 block_rsv->size += num_bytes;
3492         else if (block_rsv->reserved >= block_rsv->size)
3493                 block_rsv->full = 1;
3494         spin_unlock(&block_rsv->lock);
3495 }
3496
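/*
 * shrink the rsv by num_bytes ((u64)-1 empties it).  any reserved
 * bytes above the new size spill into @dest, or back into the
 * space_info's pool when there is no destination rsv.
 */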
3497 void block_rsv_release_bytes(struct btrfs_block_rsv *block_rsv,
3498                              struct btrfs_block_rsv *dest, u64 num_bytes)
3499 {
3500         struct btrfs_space_info *space_info = block_rsv->space_info;
3501
3502         spin_lock(&block_rsv->lock);
3503         if (num_bytes == (u64)-1)
3504                 num_bytes = block_rsv->size;
3505         block_rsv->size -= num_bytes;
3506         if (block_rsv->reserved >= block_rsv->size) {
3507                 num_bytes = block_rsv->reserved - block_rsv->size;
3508                 block_rsv->reserved = block_rsv->size;
3509                 block_rsv->full = 1;
3510         } else {
3511                 num_bytes = 0;
3512         }
3513         spin_unlock(&block_rsv->lock);
3514
3515         if (num_bytes > 0) {
3516                 if (dest) {
3517                         block_rsv_add_bytes(dest, num_bytes, 0);
3518                 } else {
3519                         spin_lock(&space_info->lock);
3520                         space_info->bytes_reserved -= num_bytes;
3521                         spin_unlock(&space_info->lock);
3522                 }
3523         }
3524 }
3525
3526 static int block_rsv_migrate_bytes(struct btrfs_block_rsv *src,
3527                                    struct btrfs_block_rsv *dst, u64 num_bytes)
3528 {
3529         int ret;
3530
3531         ret = block_rsv_use_bytes(src, num_bytes);
3532         if (ret)
3533                 return ret;
3534
3535         block_rsv_add_bytes(dst, num_bytes, 1);
3536         return 0;
3537 }
3538
3539 void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv)
3540 {
3541         memset(rsv, 0, sizeof(*rsv));
3542         spin_lock_init(&rsv->lock);
3543         atomic_set(&rsv->usage, 1);
3544         rsv->priority = 6;
3545         INIT_LIST_HEAD(&rsv->list);
3546 }
3547
3548 struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root)
3549 {
3550         struct btrfs_block_rsv *block_rsv;
3551         struct btrfs_fs_info *fs_info = root->fs_info;
3552         u64 alloc_target;
3553
3554         block_rsv = kmalloc(sizeof(*block_rsv), GFP_NOFS);
3555         if (!block_rsv)
3556                 return NULL;
3557
3558         btrfs_init_block_rsv(block_rsv);
3559
3560         alloc_target = btrfs_get_alloc_profile(root, 0);
3561         block_rsv->space_info = __find_space_info(fs_info,
3562                                                   BTRFS_BLOCK_GROUP_METADATA);
3563
3564         return block_rsv;
3565 }
3566
3567 void btrfs_free_block_rsv(struct btrfs_root *root,
3568                           struct btrfs_block_rsv *rsv)
3569 {
3570         if (rsv && atomic_dec_and_test(&rsv->usage)) {
3571                 btrfs_block_rsv_release(root, rsv, (u64)-1);
3572                 if (!rsv->durable)
3573                         kfree(rsv);
3574         }
3575 }
3576
3577 /*
3578  * make the block_rsv struct able to capture freed space.
3579  * the captured space will be re-added to the block_rsv struct
3580  * after transaction commit
3581  */
3582 void btrfs_add_durable_block_rsv(struct btrfs_fs_info *fs_info,
3583                                  struct btrfs_block_rsv *block_rsv)
3584 {
3585         block_rsv->durable = 1;
3586         mutex_lock(&fs_info->durable_block_rsv_mutex);
3587         list_add_tail(&block_rsv->list, &fs_info->durable_block_rsv_list);
3588         mutex_unlock(&fs_info->durable_block_rsv_mutex);
3589 }
3590
3591 int btrfs_block_rsv_add(struct btrfs_trans_handle *trans,
3592                         struct btrfs_root *root,
3593                         struct btrfs_block_rsv *block_rsv,
3594                         u64 num_bytes, int *retries)
3595 {
3596         int ret;
3597
3598         if (num_bytes == 0)
3599                 return 0;
3600 again:
3601         ret = reserve_metadata_bytes(block_rsv, num_bytes);
3602         if (!ret) {
3603                 block_rsv_add_bytes(block_rsv, num_bytes, 1);
3604                 return 0;
3605         }
3606
3607         ret = should_retry_reserve(trans, root, block_rsv, num_bytes, retries);
3608         if (ret > 0)
3609                 goto again;
3610
3611         return ret;
3612 }
3613
3614 int btrfs_block_rsv_check(struct btrfs_trans_handle *trans,
3615                           struct btrfs_root *root,
3616                           struct btrfs_block_rsv *block_rsv,
3617                           u64 min_reserved, int min_factor)
3618 {
3619         u64 num_bytes = 0;
3620         int commit_trans = 0;
3621         int ret = -ENOSPC;
3622
3623         if (!block_rsv)
3624                 return 0;
3625
3626         spin_lock(&block_rsv->lock);
3627         if (min_factor > 0)
3628                 num_bytes = div_factor(block_rsv->size, min_factor);
3629         if (min_reserved > num_bytes)
3630                 num_bytes = min_reserved;
3631
3632         if (block_rsv->reserved >= num_bytes) {
3633                 ret = 0;
3634         } else {
3635                 num_bytes -= block_rsv->reserved;
3636                 if (block_rsv->durable &&
3637                     block_rsv->freed[0] + block_rsv->freed[1] >= num_bytes)
3638                         commit_trans = 1;
3639         }
3640         spin_unlock(&block_rsv->lock);
3641         if (!ret)
3642                 return 0;
3643
3644         if (block_rsv->refill_used) {
3645                 ret = reserve_metadata_bytes(block_rsv, num_bytes);
3646                 if (!ret) {
3647                         block_rsv_add_bytes(block_rsv, num_bytes, 0);
3648                         return 0;
3649                 }
3650         }
3651
3652         if (commit_trans) {
3653                 if (trans)
3654                         return -EAGAIN;
3655
3656                 trans = btrfs_join_transaction(root, 1);
3657                 BUG_ON(IS_ERR(trans));
3658                 ret = btrfs_commit_transaction(trans, root);
3659                 return 0;
3660         }
3661
3662         WARN_ON(1);
3663         printk(KERN_INFO "block_rsv size %llu reserved %llu freed %llu %llu\n",
3664                 block_rsv->size, block_rsv->reserved,
3665                 block_rsv->freed[0], block_rsv->freed[1]);
3666
3667         return -ENOSPC;
3668 }
3669
3670 int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src_rsv,
3671                             struct btrfs_block_rsv *dst_rsv,
3672                             u64 num_bytes)
3673 {
3674         return block_rsv_migrate_bytes(src_rsv, dst_rsv, num_bytes);
3675 }
3676
3677 void btrfs_block_rsv_release(struct btrfs_root *root,
3678                              struct btrfs_block_rsv *block_rsv,
3679                              u64 num_bytes)
3680 {
3681         struct btrfs_block_rsv *global_rsv = &root->fs_info->global_block_rsv;
3682         if (global_rsv->full || global_rsv == block_rsv ||
3683             block_rsv->space_info != global_rsv->space_info)
3684                 global_rsv = NULL;
3685         block_rsv_release_bytes(block_rsv, global_rsv, num_bytes);
3686 }
3687
3688 static void init_global_block_rsv(struct btrfs_fs_info *fs_info)
3689 {
3690         struct btrfs_space_info *space_info;
3691
3692         space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_SYSTEM);
3693         fs_info->chunk_block_rsv.space_info = space_info;
3694         fs_info->chunk_block_rsv.priority = 10;
3695
3696         space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);
3697         fs_info->trans_block_rsv.space_info = space_info;
3698         fs_info->empty_block_rsv.space_info = space_info;
3699         fs_info->empty_block_rsv.priority = 10;
3700
3701         fs_info->chunk_root->block_rsv = &fs_info->chunk_block_rsv;
3702 }
3703
3704 static u64 calc_trans_metadata_size(struct btrfs_root *root, int num_items)
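/*
 * worst-case metadata cost of touching num_items items: a full tree
 * path (one leaf plus a node for every remaining level) per item,
 * tripled as split/cow slack.  E.g. with 4K leaves and nodes and
 * BTRFS_MAX_LEVEL == 8 (illustrative values), one item costs
 * (4K + 7 * 4K) * 3 = 96K.
 */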
3705 {
3706         return (root->leafsize + root->nodesize * (BTRFS_MAX_LEVEL - 1)) *
3707                 3 * num_items;
3708 }
3709
3710 int btrfs_trans_reserve_metadata(struct btrfs_trans_handle *trans,
3711                                  struct btrfs_root *root,
3712                                  int num_items, int *retries)
3713 {
3714         u64 num_bytes;
3715         int ret;
3716
3717         if (num_items == 0 || root->fs_info->chunk_root == root)
3718                 return 0;
3719
3720         num_bytes = calc_trans_metadata_size(root, num_items);
3721         ret = btrfs_block_rsv_add(trans, root, &root->fs_info->trans_block_rsv,
3722                                   num_bytes, retries);
3723         if (!ret) {
3724                 trans->bytes_reserved += num_bytes;
3725                 trans->block_rsv = &root->fs_info->trans_block_rsv;
3726         }
3727         return ret;
3728 }
3729
3730 void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans,
3731                                   struct btrfs_root *root)
3732 {
3733         if (!trans->bytes_reserved)
3734                 return;
3735
3736         BUG_ON(trans->block_rsv != &root->fs_info->trans_block_rsv);
3737         btrfs_block_rsv_release(root, trans->block_rsv,
3738                                 trans->bytes_reserved);
3739         trans->bytes_reserved = 0;
3740 }
3741
3742 int btrfs_snap_reserve_metadata(struct btrfs_trans_handle *trans,
3743                                 struct btrfs_pending_snapshot *pending)
3744 {
3745         struct btrfs_root *root = pending->root;
3746         struct btrfs_block_rsv *src_rsv = get_block_rsv(trans, root);
3747         struct btrfs_block_rsv *dst_rsv = &pending->block_rsv;
3748         /*
3749          * two for root back/forward refs, two for directory entries
3750          * and one for root of the snapshot.
3751          */
3752         u64 num_bytes = calc_trans_metadata_size(root, 5);
3753         dst_rsv->space_info = src_rsv->space_info;
3754         return block_rsv_migrate_bytes(src_rsv, dst_rsv, num_bytes);
3755 }
3756
3757 static int update_block_group(struct btrfs_trans_handle *trans,
3758                               struct btrfs_root *root,
3759                               u64 bytenr, u64 num_bytes, int alloc)
3760 {
3761         struct btrfs_block_group_cache *cache;
3762         struct btrfs_fs_info *info = root->fs_info;
3763         int factor;
3764         u64 total = num_bytes;
3765         u64 old_val;
3766         u64 byte_in_group;
3767
3768         /* block accounting for super block */
3769         spin_lock(&info->delalloc_lock);
3770         old_val = btrfs_super_bytes_used(&info->super_copy);
3771         if (alloc)
3772                 old_val += num_bytes;
3773         else
3774                 old_val -= num_bytes;
3775         btrfs_set_super_bytes_used(&info->super_copy, old_val);
3776         spin_unlock(&info->delalloc_lock);
3777
3778         while (total) {
3779                 cache = btrfs_lookup_block_group(info, bytenr);
3780                 if (!cache)
3781                         return -1;
3782                 if (cache->flags & (BTRFS_BLOCK_GROUP_DUP |
3783                                     BTRFS_BLOCK_GROUP_RAID1 |
3784                                     BTRFS_BLOCK_GROUP_RAID10))
3785                         factor = 2;
3786                 else
3787                         factor = 1;
3788                 byte_in_group = bytenr - cache->key.objectid;
3789                 WARN_ON(byte_in_group > cache->key.offset);
3790
3791                 spin_lock(&cache->space_info->lock);
3792                 spin_lock(&cache->lock);
3793                 cache->dirty = 1;
3794                 old_val = btrfs_block_group_used(&cache->item);
3795                 num_bytes = min(total, cache->key.offset - byte_in_group);
3796                 if (alloc) {
3797                         old_val += num_bytes;
3798                         btrfs_set_block_group_used(&cache->item, old_val);
3799                         cache->reserved -= num_bytes;
3800                         cache->space_info->bytes_reserved -= num_bytes;
3801                         cache->space_info->bytes_used += num_bytes;
3802                         cache->space_info->disk_used += num_bytes * factor;
3803                         spin_unlock(&cache->lock);
3804                         spin_unlock(&cache->space_info->lock);
3805                 } else {
3806                         old_val -= num_bytes;
3807                         btrfs_set_block_group_used(&cache->item, old_val);
3808                         cache->pinned += num_bytes;
3809                         cache->space_info->bytes_pinned += num_bytes;
3810                         cache->space_info->bytes_used -= num_bytes;
3811                         cache->space_info->disk_used -= num_bytes * factor;
3812                         spin_unlock(&cache->lock);
3813                         spin_unlock(&cache->space_info->lock);
3814
3815                         set_extent_dirty(info->pinned_extents,
3816                                          bytenr, bytenr + num_bytes - 1,
3817                                          GFP_NOFS | __GFP_NOFAIL);
3818                 }
3819                 btrfs_put_block_group(cache);
3820                 total -= num_bytes;
3821                 bytenr += num_bytes;
3822         }
3823         return 0;
3824 }
3825
3826 static u64 first_logical_byte(struct btrfs_root *root, u64 search_start)
3827 {
3828         struct btrfs_block_group_cache *cache;
3829         u64 bytenr;
3830
3831         cache = btrfs_lookup_first_block_group(root->fs_info, search_start);
3832         if (!cache)
3833                 return 0;
3834
3835         bytenr = cache->key.objectid;
3836         btrfs_put_block_group(cache);
3837
3838         return bytenr;
3839 }
3840
3841 static int pin_down_extent(struct btrfs_root *root,
3842                            struct btrfs_block_group_cache *cache,
3843                            u64 bytenr, u64 num_bytes, int reserved)
3844 {
3845         spin_lock(&cache->space_info->lock);
3846         spin_lock(&cache->lock);
3847         cache->pinned += num_bytes;
3848         cache->space_info->bytes_pinned += num_bytes;
3849         if (reserved) {
3850                 cache->reserved -= num_bytes;
3851                 cache->space_info->bytes_reserved -= num_bytes;
3852         }
3853         spin_unlock(&cache->lock);
3854         spin_unlock(&cache->space_info->lock);
3855
3856         set_extent_dirty(root->fs_info->pinned_extents, bytenr,
3857                          bytenr + num_bytes - 1, GFP_NOFS | __GFP_NOFAIL);
3858         return 0;
3859 }
3860
3861 /*
3862  * this function must be called within a transaction
3863  */
3864 int btrfs_pin_extent(struct btrfs_root *root,
3865                      u64 bytenr, u64 num_bytes, int reserved)
3866 {
3867         struct btrfs_block_group_cache *cache;
3868
3869         cache = btrfs_lookup_block_group(root->fs_info, bytenr);
3870         BUG_ON(!cache);
3871
3872         pin_down_extent(root, cache, bytenr, num_bytes, reserved);
3873
3874         btrfs_put_block_group(cache);
3875         return 0;
3876 }
3877
3878 /*
3879  * update the size of reserved extents.  this function may return -EAGAIN
3880  * (when the block group is read-only) if 'reserve' is true or 'sinfo' is false.
3881  */
3882 static int update_reserved_bytes(struct btrfs_block_group_cache *cache,
3883                                  u64 num_bytes, int reserve, int sinfo)
3884 {
3885         int ret = 0;
3886         if (sinfo) {
3887                 struct btrfs_space_info *space_info = cache->space_info;
3888                 spin_lock(&space_info->lock);
3889                 spin_lock(&cache->lock);
3890                 if (reserve) {
3891                         if (cache->ro) {
3892                                 ret = -EAGAIN;
3893                         } else {
3894                                 cache->reserved += num_bytes;
3895                                 space_info->bytes_reserved += num_bytes;
3896                         }
3897                 } else {
3898                         if (cache->ro)
3899                                 space_info->bytes_readonly += num_bytes;
3900                         cache->reserved -= num_bytes;
3901                         space_info->bytes_reserved -= num_bytes;
3902                 }
3903                 spin_unlock(&cache->lock);
3904                 spin_unlock(&space_info->lock);
3905         } else {
3906                 spin_lock(&cache->lock);
3907                 if (cache->ro) {
3908                         ret = -EAGAIN;
3909                 } else {
3910                         if (reserve)
3911                                 cache->reserved += num_bytes;
3912                         else
3913                                 cache->reserved -= num_bytes;
3914                 }
3915                 spin_unlock(&cache->lock);
3916         }
3917         return ret;
3918 }
3919
3920 int btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans,
3921                                 struct btrfs_root *root)
3922 {
3923         struct btrfs_fs_info *fs_info = root->fs_info;
3924         struct btrfs_caching_control *next;
3925         struct btrfs_caching_control *caching_ctl;
3926         struct btrfs_block_group_cache *cache;
3927
3928         down_write(&fs_info->extent_commit_sem);
3929
3930         list_for_each_entry_safe(caching_ctl, next,
3931                                  &fs_info->caching_block_groups, list) {
3932                 cache = caching_ctl->block_group;
3933                 if (block_group_cache_done(cache)) {
3934                         cache->last_byte_to_unpin = (u64)-1;
3935                         list_del_init(&caching_ctl->list);
3936                         put_caching_control(caching_ctl);
3937                 } else {
3938                         cache->last_byte_to_unpin = caching_ctl->progress;
3939                 }
3940         }
3941
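        /*
         * flip which freed_extents tree collects new pins.  the tree
         * we are switching away from now holds only extents pinned by
         * the committing transaction; btrfs_finish_extent_commit will
         * unpin from it once the commit is on disk.
         */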
3942         if (fs_info->pinned_extents == &fs_info->freed_extents[0])
3943                 fs_info->pinned_extents = &fs_info->freed_extents[1];
3944         else
3945                 fs_info->pinned_extents = &fs_info->freed_extents[0];
3946
3947         up_write(&fs_info->extent_commit_sem);
3948         return 0;
3949 }
3950
3951 static int unpin_extent_range(struct btrfs_root *root, u64 start, u64 end)
3952 {
3953         struct btrfs_fs_info *fs_info = root->fs_info;
3954         struct btrfs_block_group_cache *cache = NULL;
3955         u64 len;
3956
3957         while (start <= end) {
3958                 if (!cache ||
3959                     start >= cache->key.objectid + cache->key.offset) {
3960                         if (cache)
3961                                 btrfs_put_block_group(cache);
3962                         cache = btrfs_lookup_block_group(fs_info, start);
3963                         BUG_ON(!cache);
3964                 }
3965
3966                 len = cache->key.objectid + cache->key.offset - start;
3967                 len = min(len, end + 1 - start);
3968
3969                 if (start < cache->last_byte_to_unpin) {
3970                         len = min(len, cache->last_byte_to_unpin - start);
3971                         btrfs_add_free_space(cache, start, len);
3972                 }
3973
3974                 start += len;
3975
3976                 spin_lock(&cache->space_info->lock);
3977                 spin_lock(&cache->lock);
3978                 cache->pinned -= len;
3979                 cache->space_info->bytes_pinned -= len;
3980                 if (cache->ro) {
3981                         cache->space_info->bytes_readonly += len;
3982                 } else if (cache->reserved_pinned > 0) {
3983                         len = min(len, cache->reserved_pinned);
3984                         cache->reserved_pinned -= len;
3985                         cache->space_info->bytes_reserved += len;
3986                 }
3987                 spin_unlock(&cache->lock);
3988                 spin_unlock(&cache->space_info->lock);
3989         }
3990
3991         if (cache)
3992                 btrfs_put_block_group(cache);
3993         return 0;
3994 }
3995
3996 int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
3997                                struct btrfs_root *root)
3998 {
3999         struct btrfs_fs_info *fs_info = root->fs_info;
4000         struct extent_io_tree *unpin;
4001         struct btrfs_block_rsv *block_rsv;
4002         struct btrfs_block_rsv *next_rsv;
4003         u64 start;
4004         u64 end;
4005         int idx;
4006         int ret;
4007
4008         if (fs_info->pinned_extents == &fs_info->freed_extents[0])
4009                 unpin = &fs_info->freed_extents[1];
4010         else
4011                 unpin = &fs_info->freed_extents[0];
4012
4013         while (1) {
4014                 ret = find_first_extent_bit(unpin, 0, &start, &end,
4015                                             EXTENT_DIRTY);
4016                 if (ret)
4017                         break;
4018
4019                 ret = btrfs_discard_extent(root, start, end + 1 - start);
4020
4021                 clear_extent_dirty(unpin, start, end, GFP_NOFS);
4022                 unpin_extent_range(root, start, end);
4023                 cond_resched();
4024         }
4025
4026         mutex_lock(&fs_info->durable_block_rsv_mutex);
4027         list_for_each_entry_safe(block_rsv, next_rsv,
4028                                  &fs_info->durable_block_rsv_list, list) {
4029
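                /*
                 * freed[] is double buffered by transaction parity:
                 * the slot for this transid holds space freed during
                 * the transaction that just committed, which is now
                 * safe to hand back to the reservation.
                 */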
4030                 idx = trans->transid & 0x1;
4031                 if (block_rsv->freed[idx] > 0) {
4032                         block_rsv_add_bytes(block_rsv,
4033                                             block_rsv->freed[idx], 0);
4034                         block_rsv->freed[idx] = 0;
4035                 }
4036                 if (atomic_read(&block_rsv->usage) == 0) {
4037                         btrfs_block_rsv_release(root, block_rsv, (u64)-1);
4038
4039                         if (block_rsv->freed[0] == 0 &&
4040                             block_rsv->freed[1] == 0) {
4041                                 list_del_init(&block_rsv->list);
4042                                 kfree(block_rsv);
4043                         }
4044                 } else {
4045                         btrfs_block_rsv_release(root, block_rsv, 0);
4046                 }
4047         }
4048         mutex_unlock(&fs_info->durable_block_rsv_mutex);
4049
4050         return 0;
4051 }
4052
4053 static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
4054                                 struct btrfs_root *root,
4055                                 u64 bytenr, u64 num_bytes, u64 parent,
4056                                 u64 root_objectid, u64 owner_objectid,
4057                                 u64 owner_offset, int refs_to_drop,
4058                                 struct btrfs_delayed_extent_op *extent_op)
4059 {
4060         struct btrfs_key key;
4061         struct btrfs_path *path;
4062         struct btrfs_fs_info *info = root->fs_info;
4063         struct btrfs_root *extent_root = info->extent_root;
4064         struct extent_buffer *leaf;
4065         struct btrfs_extent_item *ei;
4066         struct btrfs_extent_inline_ref *iref;
4067         int ret;
4068         int is_data;
4069         int extent_slot = 0;
4070         int found_extent = 0;
4071         int num_to_del = 1;
4072         u32 item_size;
4073         u64 refs;
4074
4075         path = btrfs_alloc_path();
4076         if (!path)
4077                 return -ENOMEM;
4078
4079         path->reada = 1;
4080         path->leave_spinning = 1;
4081
4082         is_data = owner_objectid >= BTRFS_FIRST_FREE_OBJECTID;
4083         BUG_ON(!is_data && refs_to_drop != 1);
4084
4085         ret = lookup_extent_backref(trans, extent_root, path, &iref,
4086                                     bytenr, num_bytes, parent,
4087                                     root_objectid, owner_objectid,
4088                                     owner_offset);
4089         if (ret == 0) {
4090                 extent_slot = path->slots[0];
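                /*
                 * the backref can sit a few slots past the extent item
                 * itself; walk backwards (at most 5 slots) in the same
                 * leaf looking for the matching EXTENT_ITEM key.
                 */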
4091                 while (extent_slot >= 0) {
4092                         btrfs_item_key_to_cpu(path->nodes[0], &key,
4093                                               extent_slot);
4094                         if (key.objectid != bytenr)
4095                                 break;
4096                         if (key.type == BTRFS_EXTENT_ITEM_KEY &&
4097                             key.offset == num_bytes) {
4098                                 found_extent = 1;
4099                                 break;
4100                         }
4101                         if (path->slots[0] - extent_slot > 5)
4102                                 break;
4103                         extent_slot--;
4104                 }
4105 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
4106                 item_size = btrfs_item_size_nr(path->nodes[0], extent_slot);
4107                 if (found_extent && item_size < sizeof(*ei))
4108                         found_extent = 0;
4109 #endif
4110                 if (!found_extent) {
4111                         BUG_ON(iref);
4112                         ret = remove_extent_backref(trans, extent_root, path,
4113                                                     NULL, refs_to_drop,
4114                                                     is_data);
4115                         BUG_ON(ret);
4116                         btrfs_release_path(extent_root, path);
4117                         path->leave_spinning = 1;
4118
4119                         key.objectid = bytenr;
4120                         key.type = BTRFS_EXTENT_ITEM_KEY;
4121                         key.offset = num_bytes;
4122
4123                         ret = btrfs_search_slot(trans, extent_root,
4124                                                 &key, path, -1, 1);
4125                         if (ret) {
4126                                 printk(KERN_ERR "umm, got %d back from search"
4127                                        ", was looking for %llu\n", ret,
4128                                        (unsigned long long)bytenr);
4129                                 btrfs_print_leaf(extent_root, path->nodes[0]);
4130                         }
4131                         BUG_ON(ret);
4132                         extent_slot = path->slots[0];
4133                 }
4134         } else {
4135                 btrfs_print_leaf(extent_root, path->nodes[0]);
4136                 WARN_ON(1);
4137                 printk(KERN_ERR "btrfs unable to find ref byte nr %llu "
4138                        "parent %llu root %llu owner %llu offset %llu\n",
4139                        (unsigned long long)bytenr,
4140                        (unsigned long long)parent,
4141                        (unsigned long long)root_objectid,
4142                        (unsigned long long)owner_objectid,
4143                        (unsigned long long)owner_offset);
4144         }
4145
4146         leaf = path->nodes[0];
4147         item_size = btrfs_item_size_nr(leaf, extent_slot);
4148 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
4149         if (item_size < sizeof(*ei)) {
4150                 BUG_ON(found_extent || extent_slot != path->slots[0]);
4151                 ret = convert_extent_item_v0(trans, extent_root, path,
4152                                              owner_objectid, 0);
4153                 BUG_ON(ret < 0);
4154
4155                 btrfs_release_path(extent_root, path);
4156                 path->leave_spinning = 1;
4157
4158                 key.objectid = bytenr;
4159                 key.type = BTRFS_EXTENT_ITEM_KEY;
4160                 key.offset = num_bytes;
4161
4162                 ret = btrfs_search_slot(trans, extent_root, &key, path,
4163                                         -1, 1);
4164                 if (ret) {
4165                         printk(KERN_ERR "umm, got %d back from search"
4166                                ", was looking for %llu\n", ret,
4167                                (unsigned long long)bytenr);
4168                         btrfs_print_leaf(extent_root, path->nodes[0]);
4169                 }
4170                 BUG_ON(ret);
4171                 extent_slot = path->slots[0];
4172                 leaf = path->nodes[0];
4173                 item_size = btrfs_item_size_nr(leaf, extent_slot);
4174         }
4175 #endif
4176         BUG_ON(item_size < sizeof(*ei));
4177         ei = btrfs_item_ptr(leaf, extent_slot,
4178                             struct btrfs_extent_item);
4179         if (owner_objectid < BTRFS_FIRST_FREE_OBJECTID) {
4180                 struct btrfs_tree_block_info *bi;
4181                 BUG_ON(item_size < sizeof(*ei) + sizeof(*bi));
4182                 bi = (struct btrfs_tree_block_info *)(ei + 1);
4183                 WARN_ON(owner_objectid != btrfs_tree_block_level(leaf, bi));
4184         }
4185
4186         refs = btrfs_extent_refs(leaf, ei);
4187         BUG_ON(refs < refs_to_drop);
4188         refs -= refs_to_drop;
4189
4190         if (refs > 0) {
4191                 if (extent_op)
4192                         __run_delayed_extent_op(extent_op, leaf, ei);
4193                 /*
4194                  * In the case of an inline back ref, the reference count
4195                  * will be updated by remove_extent_backref
4196                  */
4197                 if (iref) {
4198                         BUG_ON(!found_extent);
4199                 } else {
4200                         btrfs_set_extent_refs(leaf, ei, refs);
4201                         btrfs_mark_buffer_dirty(leaf);
4202                 }
4203                 if (found_extent) {
4204                         ret = remove_extent_backref(trans, extent_root, path,
4205                                                     iref, refs_to_drop,
4206                                                     is_data);
4207                         BUG_ON(ret);
4208                 }
4209         } else {
4210                 if (found_extent) {
4211                         BUG_ON(is_data && refs_to_drop !=
4212                                extent_data_ref_count(root, path, iref));
4213                         if (iref) {
4214                                 BUG_ON(path->slots[0] != extent_slot);
4215                         } else {
4216                                 BUG_ON(path->slots[0] != extent_slot + 1);
4217                                 path->slots[0] = extent_slot;
4218                                 num_to_del = 2;
4219                         }
4220                 }
4221
4222                 ret = btrfs_del_items(trans, extent_root, path, path->slots[0],
4223                                       num_to_del);
4224                 BUG_ON(ret);
4225                 btrfs_release_path(extent_root, path);
4226
4227                 if (is_data) {
4228                         ret = btrfs_del_csums(trans, root, bytenr, num_bytes);
4229                         BUG_ON(ret);
4230                 } else {
4231                         invalidate_mapping_pages(info->btree_inode->i_mapping,
4232                              bytenr >> PAGE_CACHE_SHIFT,
4233                              (bytenr + num_bytes - 1) >> PAGE_CACHE_SHIFT);
4234                 }
4235
4236                 ret = update_block_group(trans, root, bytenr, num_bytes, 0);
4237                 BUG_ON(ret);
4238         }
4239         btrfs_free_path(path);
4240         return ret;
4241 }
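/*
 * Illustrative sketch (not part of the original source): the
 * num_to_del == 2 case above covers the last backref living in its
 * own item immediately after the EXTENT_ITEM in the same leaf, e.g.
 *
 *   [ ... | EXTENT_ITEM bytenr | EXTENT_DATA_REF bytenr | ... ]
 *           ^ extent_slot        ^ path->slots[0]
 *
 * so both items are removed with a single btrfs_del_items() call.
 */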
4242
4243 /*
4244  * when we free a block, it is possible (and likely) that we free the last
4245  * delayed ref for that extent as well.  This searches the delayed ref tree for
4246  * a given extent, and if there are no other delayed refs to be processed, it
4247  * removes it from the tree.
4248  */
4249 static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans,
4250                                       struct btrfs_root *root, u64 bytenr)
4251 {
4252         struct btrfs_delayed_ref_head *head;
4253         struct btrfs_delayed_ref_root *delayed_refs;
4254         struct btrfs_delayed_ref_node *ref;
4255         struct rb_node *node;
4256         int ret = 0;
4257
4258         delayed_refs = &trans->transaction->delayed_refs;
4259         spin_lock(&delayed_refs->lock);
4260         head = btrfs_find_delayed_ref_head(trans, bytenr);
4261         if (!head)
4262                 goto out;
4263
4264         node = rb_prev(&head->node.rb_node);
4265         if (!node)
4266                 goto out;
4267
4268         ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node);
4269
4270         /* there are still entries for this ref, we can't drop it */
4271         if (ref->bytenr == bytenr)
4272                 goto out;
4273
4274         if (head->extent_op) {
4275                 if (!head->must_insert_reserved)
4276                         goto out;
4277                 kfree(head->extent_op);
4278                 head->extent_op = NULL;
4279         }
4280
4281         /*
4282          * waiting for the lock here would deadlock.  If someone else has it
4283          * locked they are already in the process of dropping it anyway
4284          */
4285         if (!mutex_trylock(&head->mutex))
4286                 goto out;
4287
4288         /*
4289          * at this point we have a head with no other entries.  Go
4290          * ahead and process it.
4291          */
4292         head->node.in_tree = 0;
4293         rb_erase(&head->node.rb_node, &delayed_refs->root);
4294
4295         delayed_refs->num_entries--;
4296
4297         /*
4298          * we don't take a ref on the node because we're removing it from the
4299          * tree, so we just steal the ref the tree was holding.
4300          */
4301         delayed_refs->num_heads--;
4302         if (list_empty(&head->cluster))
4303                 delayed_refs->num_heads_ready--;
4304
4305         list_del_init(&head->cluster);
4306         spin_unlock(&delayed_refs->lock);
4307
4308         BUG_ON(head->extent_op);
4309         if (head->must_insert_reserved)
4310                 ret = 1;
4311
4312         mutex_unlock(&head->mutex);
4313         btrfs_put_delayed_ref(&head->node);
4314         return ret;
4315 out:
4316         spin_unlock(&delayed_refs->lock);
4317         return 0;
4318 }
4319
4320 void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
4321                            struct btrfs_root *root,
4322                            struct extent_buffer *buf,
4323                            u64 parent, int last_ref)
4324 {
4325         struct btrfs_block_rsv *block_rsv;
4326         struct btrfs_block_group_cache *cache = NULL;
4327         int ret;
4328
4329         if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) {
4330                 ret = btrfs_add_delayed_tree_ref(trans, buf->start, buf->len,
4331                                                 parent, root->root_key.objectid,
4332                                                 btrfs_header_level(buf),
4333                                                 BTRFS_DROP_DELAYED_REF, NULL);
4334                 BUG_ON(ret);
4335         }
4336
4337         if (!last_ref)
4338                 return;
4339
4340         block_rsv = get_block_rsv(trans, root);
4341         cache = btrfs_lookup_block_group(root->fs_info, buf->start);
4342         BUG_ON(block_rsv->space_info != cache->space_info);
4343
4344         if (btrfs_header_generation(buf) == trans->transid) {
4345                 if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) {
4346                         ret = check_ref_cleanup(trans, root, buf->start);
4347                         if (!ret)
4348                                 goto pin;
4349                 }
4350
4351                 if (btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) {
4352                         pin_down_extent(root, cache, buf->start, buf->len, 1);
4353                         goto pin;
4354                 }
4355
4356                 WARN_ON(test_bit(EXTENT_BUFFER_DIRTY, &buf->bflags));
4357
4358                 btrfs_add_free_space(cache, buf->start, buf->len);
4359                 ret = update_reserved_bytes(cache, buf->len, 0, 0);
4360                 if (ret == -EAGAIN) {
4361                         /* block group became read-only */
4362                         update_reserved_bytes(cache, buf->len, 0, 1);
4363                         goto out;
4364                 }
4365
4366                 ret = 1;
4367                 spin_lock(&block_rsv->lock);
4368                 if (block_rsv->reserved < block_rsv->size) {
4369                         block_rsv->reserved += buf->len;
4370                         ret = 0;
4371                 }
4372                 spin_unlock(&block_rsv->lock);
4373
4374                 if (ret) {
4375                         spin_lock(&cache->space_info->lock);
4376                         cache->space_info->bytes_reserved -= buf->len;
4377                         spin_unlock(&cache->space_info->lock);
4378                 }
4379                 goto out;
4380         }
4381 pin:
4382         if (block_rsv->durable && !cache->ro) {
4383                 ret = 0;
4384                 spin_lock(&cache->lock);
4385                 if (!cache->ro) {
4386                         cache->reserved_pinned += buf->len;
4387                         ret = 1;
4388                 }
4389                 spin_unlock(&cache->lock);
4390
4391                 if (ret) {
4392                         spin_lock(&block_rsv->lock);
4393                         block_rsv->freed[trans->transid & 0x1] += buf->len;
4394                         spin_unlock(&block_rsv->lock);
4395                 }
4396         }
4397 out:
4398         btrfs_put_block_group(cache);
4399 }
4400
4401 int btrfs_free_extent(struct btrfs_trans_handle *trans,
4402                       struct btrfs_root *root,
4403                       u64 bytenr, u64 num_bytes, u64 parent,
4404                       u64 root_objectid, u64 owner, u64 offset)
4405 {
4406         int ret;
4407
4408         /*
4409          * tree log blocks never actually go into the extent allocation
4410          * tree, just update pinning info and exit early.
4411          */
4412         if (root_objectid == BTRFS_TREE_LOG_OBJECTID) {
4413                 WARN_ON(owner >= BTRFS_FIRST_FREE_OBJECTID);
4414                 /* unlocks the pinned mutex */
4415                 btrfs_pin_extent(root, bytenr, num_bytes, 1);
4416                 ret = 0;
4417         } else if (owner < BTRFS_FIRST_FREE_OBJECTID) {
4418                 ret = btrfs_add_delayed_tree_ref(trans, bytenr, num_bytes,
4419                                         parent, root_objectid, (int)owner,
4420                                         BTRFS_DROP_DELAYED_REF, NULL);
4421                 BUG_ON(ret);
4422         } else {
4423                 ret = btrfs_add_delayed_data_ref(trans, bytenr, num_bytes,
4424                                         parent, root_objectid, owner,
4425                                         offset, BTRFS_DROP_DELAYED_REF, NULL);
4426                 BUG_ON(ret);
4427         }
4428         return ret;
4429 }
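/*
 * Summary sketch of the dispatch above: tree log blocks are pinned
 * directly and never enter the extent allocation tree; a metadata
 * block (owner below BTRFS_FIRST_FREE_OBJECTID, where owner is the
 * tree level) queues a delayed tree ref drop; anything else queues a
 * delayed data ref drop.  The real extent tree update happens later,
 * in __btrfs_free_extent() above, when the delayed refs are run.
 */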
4430
4431 static u64 stripe_align(struct btrfs_root *root, u64 val)
4432 {
4433         u64 mask = ((u64)root->stripesize - 1);
4434         u64 ret = (val + mask) & ~mask;
4435         return ret;
4436 }
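/*
 * Worked example (hypothetical stripesize of 64K, i.e. 0x10000):
 *
 *   mask = 0xffff
 *   stripe_align(root, 0x12345) == (0x12345 + 0xffff) & ~0xffff
 *                               == 0x20000
 *
 * so 74565 rounds up to the next 65536-byte stripe boundary at
 * 131072.  This only works because stripesize is a power of two.
 */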
4437
4438 /*
4439  * when we wait for progress in the block group caching, it's because
4440  * our allocation attempt failed at least once.  So, we must sleep
4441  * and let some progress happen before we try again.
4442  *
4443  * This function will sleep at least once waiting for new free space to
4444  * show up, and then it will check the block group free space numbers
4445  * for our min num_bytes.  Another option is to have it go ahead
4446  * and look in the rbtree for a free extent of a given size, but this
4447  * is a good start.
4448  */
4449 static noinline int
4450 wait_block_group_cache_progress(struct btrfs_block_group_cache *cache,
4451                                 u64 num_bytes)
4452 {
4453         struct btrfs_caching_control *caching_ctl;
4454         DEFINE_WAIT(wait);
4455
4456         caching_ctl = get_caching_control(cache);
4457         if (!caching_ctl)
4458                 return 0;
4459
4460         wait_event(caching_ctl->wait, block_group_cache_done(cache) ||
4461                    (cache->free_space >= num_bytes));
4462
4463         put_caching_control(caching_ctl);
4464         return 0;
4465 }
4466
4467 static noinline int
4468 wait_block_group_cache_done(struct btrfs_block_group_cache *cache)
4469 {
4470         struct btrfs_caching_control *caching_ctl;
4471         DEFINE_WAIT(wait);
4472
4473         caching_ctl = get_caching_control(cache);
4474         if (!caching_ctl)
4475                 return 0;
4476
4477         wait_event(caching_ctl->wait, block_group_cache_done(cache));
4478
4479         put_caching_control(caching_ctl);
4480         return 0;
4481 }
4482
4483 static int get_block_group_index(struct btrfs_block_group_cache *cache)
4484 {
4485         int index;
4486         if (cache->flags & BTRFS_BLOCK_GROUP_RAID10)
4487                 index = 0;
4488         else if (cache->flags & BTRFS_BLOCK_GROUP_RAID1)
4489                 index = 1;
4490         else if (cache->flags & BTRFS_BLOCK_GROUP_DUP)
4491                 index = 2;
4492         else if (cache->flags & BTRFS_BLOCK_GROUP_RAID0)
4493                 index = 3;
4494         else
4495                 index = 4;
4496         return index;
4497 }
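/*
 * Sketch of the mapping above: the index selects one of the
 * BTRFS_NR_RAID_TYPES per-type lists in space_info->block_groups[],
 *
 *   RAID10 -> 0, RAID1 -> 1, DUP -> 2, RAID0 -> 3, single -> 4
 *
 * and find_free_extent() below walks block_groups[index], bumping
 * the index only after a whole list has been searched without luck.
 */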
4498
4499 enum btrfs_loop_type {
4500         LOOP_FIND_IDEAL = 0,
4501         LOOP_CACHING_NOWAIT = 1,
4502         LOOP_CACHING_WAIT = 2,
4503         LOOP_ALLOC_CHUNK = 3,
4504         LOOP_NO_EMPTY_SIZE = 4,
4505 };
4506
4507 /*
4508  * walks the btree of allocated extents and finds a hole of a given size.
4509  * The key ins is changed to record the hole:
4510  * ins->objectid == block start
4511  * ins->flags == BTRFS_EXTENT_ITEM_KEY
4512  * ins->offset == number of blocks
4513  * Any available blocks before search_start are skipped.
4514  */
4515 static noinline int find_free_extent(struct btrfs_trans_handle *trans,
4516                                      struct btrfs_root *orig_root,
4517                                      u64 num_bytes, u64 empty_size,
4518                                      u64 search_start, u64 search_end,
4519                                      u64 hint_byte, struct btrfs_key *ins,
4520                                      int data)
4521 {
4522         int ret = 0;
4523         struct btrfs_root *root = orig_root->fs_info->extent_root;
4524         struct btrfs_free_cluster *last_ptr = NULL;
4525         struct btrfs_block_group_cache *block_group = NULL;
4526         int empty_cluster = 2 * 1024 * 1024;
4527         int allowed_chunk_alloc = 0;
4528         int done_chunk_alloc = 0;
4529         struct btrfs_space_info *space_info;
4530         int last_ptr_loop = 0;
4531         int loop = 0;
4532         int index = 0;
4533         bool found_uncached_bg = false;
4534         bool failed_cluster_refill = false;
4535         bool failed_alloc = false;
4536         u64 ideal_cache_percent = 0;
4537         u64 ideal_cache_offset = 0;
4538
4539         WARN_ON(num_bytes < root->sectorsize);
4540         btrfs_set_key_type(ins, BTRFS_EXTENT_ITEM_KEY);
4541         ins->objectid = 0;
4542         ins->offset = 0;
4543
4544         space_info = __find_space_info(root->fs_info, data);
4545         if (!space_info) {
4546                 printk(KERN_ERR "No space info for %d\n", data);
4547                 return -ENOSPC;
4548         }
4549
4550         if (orig_root->ref_cows || empty_size)
4551                 allowed_chunk_alloc = 1;
4552
4553         if (data & BTRFS_BLOCK_GROUP_METADATA) {
4554                 last_ptr = &root->fs_info->meta_alloc_cluster;
4555                 if (!btrfs_test_opt(root, SSD))
4556                         empty_cluster = 64 * 1024;
4557         }
4558
4559         if ((data & BTRFS_BLOCK_GROUP_DATA) && btrfs_test_opt(root, SSD)) {
4560                 last_ptr = &root->fs_info->data_alloc_cluster;
4561         }
4562
4563         if (last_ptr) {
4564                 spin_lock(&last_ptr->lock);
4565                 if (last_ptr->block_group)
4566                         hint_byte = last_ptr->window_start;
4567                 spin_unlock(&last_ptr->lock);
4568         }
4569
4570         search_start = max(search_start, first_logical_byte(root, 0));
4571         search_start = max(search_start, hint_byte);
4572
4573         if (!last_ptr)
4574                 empty_cluster = 0;
4575
4576         if (search_start == hint_byte) {
4577 ideal_cache:
4578                 block_group = btrfs_lookup_block_group(root->fs_info,
4579                                                        search_start);
4580                 /*
4581                  * we don't want to use the block group if it doesn't match our
4582                  * allocation bits, or if it's not cached.
4583                  *
4584                  * However if we are re-searching with an ideal block group
4585                  * picked out then we don't care that the block group is cached.
4586                  */
4587                 if (block_group && block_group_bits(block_group, data) &&
4588                     (block_group->cached != BTRFS_CACHE_NO ||
4589                      search_start == ideal_cache_offset)) {
4590                         down_read(&space_info->groups_sem);
4591                         if (list_empty(&block_group->list) ||
4592                             block_group->ro) {
4593                                 /*
4594                                  * someone is removing this block group,
4595                                  * we can't jump into the have_block_group
4596                                  * target because our list pointers are not
4597                                  * valid
4598                                  */
4599                                 btrfs_put_block_group(block_group);
4600                                 up_read(&space_info->groups_sem);
4601                         } else {
4602                                 index = get_block_group_index(block_group);
4603                                 goto have_block_group;
4604                         }
4605                 } else if (block_group) {
4606                         btrfs_put_block_group(block_group);
4607                 }
4608         }
4609 search:
4610         down_read(&space_info->groups_sem);
4611         list_for_each_entry(block_group, &space_info->block_groups[index],
4612                             list) {
4613                 u64 offset;
4614                 int cached;
4615
4616                 btrfs_get_block_group(block_group);
4617                 search_start = block_group->key.objectid;
4618
4619 have_block_group:
4620                 if (unlikely(block_group->cached == BTRFS_CACHE_NO)) {
4621                         u64 free_percent;
4622
4623                         free_percent = btrfs_block_group_used(&block_group->item);
4624                         free_percent *= 100;
4625                         free_percent = div64_u64(free_percent,
4626                                                  block_group->key.offset);
4627                         free_percent = 100 - free_percent;
4628                         if (free_percent > ideal_cache_percent &&
4629                             likely(!block_group->ro)) {
4630                                 ideal_cache_offset = block_group->key.objectid;
4631                                 ideal_cache_percent = free_percent;
4632                         }
4633
4634                         /*
4635                          * We only want to start kthread caching if we are at
4636                          * the point where we will wait for caching to make
4637                          * progress, or if our ideal search is over and we've
4638                          * found somebody to start caching.
4639                          */
4640                         if (loop > LOOP_CACHING_NOWAIT ||
4641                             (loop > LOOP_FIND_IDEAL &&
4642                              atomic_read(&space_info->caching_threads) < 2)) {
4643                                 ret = cache_block_group(block_group);
4644                                 BUG_ON(ret);
4645                         }
4646                         found_uncached_bg = true;
4647
4648                         /*
4649                          * If this pass only searches cached block groups,
4650                          * try the next group.
4651                          */
4652                         if (loop == LOOP_FIND_IDEAL)
4653                                 goto loop;
4654                 }
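                /*
                 * Worked example for free_percent above (made-up
                 * sizes): a 1GB block group with 768MB used gives
                 * free_percent = 100 - (768 * 100 / 1024) = 25; the
                 * group with the highest free_percent seen so far is
                 * remembered as the ideal caching candidate.
                 */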
4655
4656                 cached = block_group_cache_done(block_group);
4657                 if (unlikely(!cached))
4658                         found_uncached_bg = true;
4659
4660                 if (unlikely(block_group->ro))
4661                         goto loop;
4662
4663                 /*
4664                  * Ok, we want to try the cluster allocator, so look
4665                  * there first, unless we are on LOOP_NO_EMPTY_SIZE.
4666                  * By that point we have tried the cluster allocator
4667                  * plenty of times without finding anything, so we are
4668                  * likely far too fragmented for clustering to help.
4669                  * Just skip it and let the allocator find whatever
4670                  * block it can.
4671                  */
4672                 if (last_ptr && loop < LOOP_NO_EMPTY_SIZE) {
4673                         /*
4674                          * the refill lock keeps out other
4675                          * people trying to start a new cluster
4676                          */
4677                         spin_lock(&last_ptr->refill_lock);
4678                         if (last_ptr->block_group &&
4679                             (last_ptr->block_group->ro ||
4680                             !block_group_bits(last_ptr->block_group, data))) {
4681                                 offset = 0;
4682                                 goto refill_cluster;
4683                         }
4684
4685                         offset = btrfs_alloc_from_cluster(block_group, last_ptr,
4686                                                  num_bytes, search_start);
4687                         if (offset) {
4688                                 /* we have a block, we're done */
4689                                 spin_unlock(&last_ptr->refill_lock);
4690                                 goto checks;
4691                         }
4692
4693                         spin_lock(&last_ptr->lock);
4694                         /*
4695                          * whoops, this cluster doesn't actually point to
4696                          * this block group.  Get a ref on the block
4697                          * group it does point to and try again
4698                          */
4699                         if (!last_ptr_loop && last_ptr->block_group &&
4700                             last_ptr->block_group != block_group) {
4701
4702                                 btrfs_put_block_group(block_group);
4703                                 block_group = last_ptr->block_group;
4704                                 btrfs_get_block_group(block_group);
4705                                 spin_unlock(&last_ptr->lock);
4706                                 spin_unlock(&last_ptr->refill_lock);
4707
4708                                 last_ptr_loop = 1;
4709                                 search_start = block_group->key.objectid;
4710                                 /*
4711                                  * we know this block group is properly
4712                                  * in the list because
4713                                  * btrfs_remove_block_group drops the
4714                                  * cluster before it removes the block
4715                                  * group from the list
4716                                  */
4717                                 goto have_block_group;
4718                         }
4719                         spin_unlock(&last_ptr->lock);
4720 refill_cluster:
4721                         /*
4722                          * this cluster didn't work out, free it and
4723                          * start over
4724                          */
4725                         btrfs_return_cluster_to_free_space(NULL, last_ptr);
4726
4727                         last_ptr_loop = 0;
4728
4729                         /* allocate a cluster in this block group */
4730                         ret = btrfs_find_space_cluster(trans, root,
4731                                                block_group, last_ptr,
4732                                                offset, num_bytes,
4733                                                empty_cluster + empty_size);
4734                         if (ret == 0) {
4735                                 /*
4736                                  * now pull our allocation out of this
4737                                  * cluster
4738                                  */
4739                                 offset = btrfs_alloc_from_cluster(block_group,
4740                                                   last_ptr, num_bytes,
4741                                                   search_start);
4742                                 if (offset) {
4743                                         /* we found one, proceed */
4744                                         spin_unlock(&last_ptr->refill_lock);
4745                                         goto checks;
4746                                 }
4747                         } else if (!cached && loop > LOOP_CACHING_NOWAIT
4748                                    && !failed_cluster_refill) {
4749                                 spin_unlock(&last_ptr->refill_lock);
4750
4751                                 failed_cluster_refill = true;
4752                                 wait_block_group_cache_progress(block_group,
4753                                        num_bytes + empty_cluster + empty_size);
4754                                 goto have_block_group;
4755                         }
4756
4757                         /*
4758                          * at this point we either didn't find a cluster
4759                          * or we weren't able to allocate a block from our
4760                          * cluster.  Free the cluster we've been trying
4761                          * to use, and go to the next block group
4762                          */
4763                         btrfs_return_cluster_to_free_space(NULL, last_ptr);
4764                         spin_unlock(&last_ptr->refill_lock);
4765                         goto loop;
4766                 }
4767
4768                 offset = btrfs_find_space_for_alloc(block_group, search_start,
4769                                                     num_bytes, empty_size);
4770                 /*
4771                  * If we didn't find a chunk, and we haven't failed on this
4772                  * block group before, and this block group is in the middle of
4773                  * caching and we are ok with waiting, then go ahead and wait
4774                  * for progress to be made, and set failed_alloc to true.
4775                  *
4776                  * If failed_alloc is true then we've already waited on this
4777                  * block group once and should move on to the next block group.
4778                  */
4779                 if (!offset && !failed_alloc && !cached &&
4780                     loop > LOOP_CACHING_NOWAIT) {
4781                         wait_block_group_cache_progress(block_group,
4782                                                 num_bytes + empty_size);
4783                         failed_alloc = true;
4784                         goto have_block_group;
4785                 } else if (!offset) {
4786                         goto loop;
4787                 }
4788 checks:
4789                 search_start = stripe_align(root, offset);
4790                 /* move on to the next group */
4791                 if (search_start + num_bytes >= search_end) {
4792                         btrfs_add_free_space(block_group, offset, num_bytes);
4793                         goto loop;
4794                 }
4795
4796                 /* move on to the next group */
4797                 if (search_start + num_bytes >
4798                     block_group->key.objectid + block_group->key.offset) {
4799                         btrfs_add_free_space(block_group, offset, num_bytes);
4800                         goto loop;
4801                 }
4802
4811                 ret = update_reserved_bytes(block_group, num_bytes, 1,
4812                                             (data & BTRFS_BLOCK_GROUP_DATA));
4813                 if (ret == -EAGAIN) {
4814                         btrfs_add_free_space(block_group, offset, num_bytes);
4815                         goto loop;
4816                 }
4817
4818                 /* we are all good, let's return */
4819                 ins->objectid = search_start;
4820                 ins->offset = num_bytes;
4821
4822                 if (offset < search_start)
4823                         btrfs_add_free_space(block_group, offset,
4824                                              search_start - offset);
4825                 BUG_ON(offset > search_start);
4826                 break;
4827 loop:
4828                 failed_cluster_refill = false;
4829                 failed_alloc = false;
4830                 BUG_ON(index != get_block_group_index(block_group));
4831                 btrfs_put_block_group(block_group);
4832         }
4833         up_read(&space_info->groups_sem);
4834
4835         if (!ins->objectid && ++index < BTRFS_NR_RAID_TYPES)
4836                 goto search;
4837
4838         /* LOOP_FIND_IDEAL, only search caching/cached bg's, and don't wait
4839          *                      for them to make caching progress.  Also
4840          *                      determine the best possible bg to cache
4841          * LOOP_CACHING_NOWAIT, search partially cached block groups, kicking
4842          *                      caching kthreads as we move along
4843          * LOOP_CACHING_WAIT, search everything, and wait if our bg is caching
4844          * LOOP_ALLOC_CHUNK, force a chunk allocation and try again
4845          * LOOP_NO_EMPTY_SIZE, set empty_size and empty_cluster to 0 and try
4846          *                      again
4847          */
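        /*
         * Rough sketch of the progression this produces for a single
         * allocation that keeps failing (assuming uncached bg's exist
         * and chunk allocation is allowed):
         *
         *   FIND_IDEAL -> CACHING_NOWAIT -> CACHING_WAIT
         *              -> ALLOC_CHUNK -> NO_EMPTY_SIZE -> -ENOSPC
         */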
4848         if (!ins->objectid && loop < LOOP_NO_EMPTY_SIZE &&
4849             (found_uncached_bg || empty_size || empty_cluster ||
4850              allowed_chunk_alloc)) {
4851                 index = 0;
4852                 if (loop == LOOP_FIND_IDEAL && found_uncached_bg) {
4853                         found_uncached_bg = false;
4854                         loop++;
4855                         if (!ideal_cache_percent &&
4856                             atomic_read(&space_info->caching_threads))
4857                                 goto search;
4858
4859                         /*
4860                          * One of the following two things has happened so far
4861                          *
4862                          * 1) We found an ideal block group for caching that
4863                          * is mostly full and will cache quickly, so we might
4864                          * as well wait for it.
4865                          *
4866                          * 2) We searched for cached only and we didn't find
4867                          * anything, and we didn't start any caching kthreads
4868                          * either, so chances are we will loop through and
4869                          * start a couple caching kthreads, and then come back
4870                          * around and just wait for them.  This will be slower
4871                          * because we will have 2 caching kthreads reading at
4872                          * the same time when we could have just started one
4873                          * and waited for it to get far enough to give us an
4874                          * allocation, so go ahead and go to the wait caching
4875                          * loop.
4876                          */
4877                         loop = LOOP_CACHING_WAIT;
4878                         search_start = ideal_cache_offset;
4879                         ideal_cache_percent = 0;
4880                         goto ideal_cache;
4881                 } else if (loop == LOOP_FIND_IDEAL) {
4882                         /*
4883                          * Didn't find an uncached bg, wait on anything we find
4884                          * next.
4885                          */
4886                         loop = LOOP_CACHING_WAIT;
4887                         goto search;
4888                 }
4889
4890                 if (loop < LOOP_CACHING_WAIT) {
4891                         loop++;
4892                         goto search;
4893                 }
4894
4895                 if (loop == LOOP_ALLOC_CHUNK) {
4896                         empty_size = 0;
4897                         empty_cluster = 0;
4898                 }
4899
4900                 if (allowed_chunk_alloc) {
4901                         ret = do_chunk_alloc(trans, root, num_bytes +
4902                                              2 * 1024 * 1024, data, 1);
4903                         allowed_chunk_alloc = 0;
4904                         done_chunk_alloc = 1;
4905                 } else if (!done_chunk_alloc) {
4906                         space_info->force_alloc = 1;
4907                 }
4908
4909                 if (loop < LOOP_NO_EMPTY_SIZE) {
4910                         loop++;
4911                         goto search;
4912                 }
4913                 ret = -ENOSPC;
4914         } else if (!ins->objectid) {
4915                 ret = -ENOSPC;
4916         }
4917
4918         /* we found what we needed */
4919         if (ins->objectid) {
4920                 if (!(data & BTRFS_BLOCK_GROUP_DATA))
4921                         trans->block_group = block_group->key.objectid;
4922
4923                 btrfs_put_block_group(block_group);
4924                 ret = 0;
4925         }
4926
4927         return ret;
4928 }
4929
4930 static void dump_space_info(struct btrfs_space_info *info, u64 bytes,
4931                             int dump_block_groups)
4932 {
4933         struct btrfs_block_group_cache *cache;
4934         int index = 0;
4935
4936         spin_lock(&info->lock);
4937         printk(KERN_INFO "space_info has %llu free, is %sfull\n",
4938                (unsigned long long)(info->total_bytes - info->bytes_used -
4939                                     info->bytes_pinned - info->bytes_reserved -
4940                                     info->bytes_super),
4941                (info->full) ? "" : "not ");
4942         printk(KERN_INFO "space_info total=%llu, pinned=%llu, delalloc=%llu,"
4943                " may_use=%llu, used=%llu, root=%llu, super=%llu, reserved=%llu"
4944                "\n",
4945                (unsigned long long)info->total_bytes,
4946                (unsigned long long)info->bytes_pinned,
4947                (unsigned long long)info->bytes_delalloc,
4948                (unsigned long long)info->bytes_may_use,
4949                (unsigned long long)info->bytes_used,
4950                (unsigned long long)info->bytes_root,
4951                (unsigned long long)info->bytes_super,
4952                (unsigned long long)info->bytes_reserved);
4953         spin_unlock(&info->lock);
4954
4955         if (!dump_block_groups)
4956                 return;
4957
4958         down_read(&info->groups_sem);
4959 again:
4960         list_for_each_entry(cache, &info->block_groups[index], list) {
4961                 spin_lock(&cache->lock);
4962                 printk(KERN_INFO "block group %llu has %llu bytes, %llu used "
4963                        "%llu pinned %llu reserved\n",
4964                        (unsigned long long)cache->key.objectid,
4965                        (unsigned long long)cache->key.offset,
4966                        (unsigned long long)btrfs_block_group_used(&cache->item),
4967                        (unsigned long long)cache->pinned,
4968                        (unsigned long long)cache->reserved);
4969                 btrfs_dump_free_space(cache, bytes);
4970                 spin_unlock(&cache->lock);
4971         }
4972         if (++index < BTRFS_NR_RAID_TYPES)
4973                 goto again;
4974         up_read(&info->groups_sem);
4975 }
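/*
 * Example of the first line printed above, with hypothetical values:
 *
 *   space_info has 1048576 free, is not full
 *
 * where "free" is total_bytes minus the used, pinned, reserved and
 * super bytes, exactly as computed in the printk above.
 */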
4976
4977 int btrfs_reserve_extent(struct btrfs_trans_handle *trans,
4978                          struct btrfs_root *root,
4979                          u64 num_bytes, u64 min_alloc_size,
4980                          u64 empty_size, u64 hint_byte,
4981                          u64 search_end, struct btrfs_key *ins,
4982                          u64 data)
4983 {
4984         int ret;
4985         u64 search_start = 0;
4986
4987         data = btrfs_get_alloc_profile(root, data);
4988 again:
4989         /*
4990          * the only place that sets empty_size is btrfs_realloc_node, which
4991          * is not called recursively on allocations
4992          */
4993         if (empty_size || root->ref_cows)
4994                 ret = do_chunk_alloc(trans, root->fs_info->extent_root,
4995                                      num_bytes + 2 * 1024 * 1024, data, 0);
4996
4997         WARN_ON(num_bytes < root->sectorsize);
4998         ret = find_free_extent(trans, root, num_bytes, empty_size,
4999                                search_start, search_end, hint_byte,
5000                                ins, data);
5001
5002         if (ret == -ENOSPC && num_bytes > min_alloc_size) {
5003                 num_bytes = num_bytes >> 1;
5004                 num_bytes = num_bytes & ~(root->sectorsize - 1);
5005                 num_bytes = max(num_bytes, min_alloc_size);
5006                 do_chunk_alloc(trans, root->fs_info->extent_root,
5007                                num_bytes, data, 1);
5008                 goto again;
5009         }
5010         if (ret == -ENOSPC) {
5011                 struct btrfs_space_info *sinfo;
5012
5013                 sinfo = __find_space_info(root->fs_info, data);
5014                 printk(KERN_ERR "btrfs allocation failed flags %llu, "
5015                        "wanted %llu\n", (unsigned long long)data,
5016                        (unsigned long long)num_bytes);
5017                 dump_space_info(sinfo, num_bytes, 1);
5018         }
5019
5020         return ret;
5021 }
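/*
 * Worked example of the -ENOSPC fallback above (made-up sizes, 4K
 * sectorsize): a 1MB request with a min_alloc_size of 256K retries
 * at 512K and then at 256K.  At that point num_bytes is no longer
 * greater than min_alloc_size, so the next -ENOSPC is final and is
 * reported through dump_space_info().
 */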
5022
5023 int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len)
5024 {
5025         struct btrfs_block_group_cache *cache;
5026         int ret = 0;
5027
5028         cache = btrfs_lookup_block_group(root->fs_info, start);
5029         if (!cache) {
5030                 printk(KERN_ERR "Unable to find block group for %llu\n",
5031                        (unsigned long long)start);
5032                 return -ENOSPC;
5033         }
5034
5035         ret = btrfs_discard_extent(root, start, len);
5036
5037         btrfs_add_free_space(cache, start, len);
5038         update_reserved_bytes(cache, len, 0, 1);
5039         btrfs_put_block_group(cache);
5040
5041         return ret;
5042 }
5043
5044 static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
5045                                       struct btrfs_root *root,
5046                                       u64 parent, u64 root_objectid,
5047                                       u64 flags, u64 owner, u64 offset,
5048                                       struct btrfs_key *ins, int ref_mod)
5049 {
5050         int ret;
5051         struct btrfs_fs_info *fs_info = root->fs_info;
5052         struct btrfs_extent_item *extent_item;
5053         struct btrfs_extent_inline_ref *iref;
5054         struct btrfs_path *path;
5055         struct extent_buffer *leaf;
5056         int type;
5057         u32 size;
5058
5059         if (parent > 0)
5060                 type = BTRFS_SHARED_DATA_REF_KEY;
5061         else
5062                 type = BTRFS_EXTENT_DATA_REF_KEY;
5063
5064         size = sizeof(*extent_item) + btrfs_extent_inline_ref_size(type);
5065
5066         path = btrfs_alloc_path();
5067         BUG_ON(!path);
5068
5069         path->leave_spinning = 1;
5070         ret = btrfs_insert_empty_item(trans, fs_info->extent_root, path,
5071                                       ins, size);
5072         BUG_ON(ret);
5073
5074         leaf = path->nodes[0];
5075         extent_item = btrfs_item_ptr(leaf, path->slots[0],
5076                                      struct btrfs_extent_item);
5077         btrfs_set_extent_refs(leaf, extent_item, ref_mod);
5078         btrfs_set_extent_generation(leaf, extent_item, trans->transid);
5079         btrfs_set_extent_flags(leaf, extent_item,
5080                                flags | BTRFS_EXTENT_FLAG_DATA);
5081
5082         iref = (struct btrfs_extent_inline_ref *)(extent_item + 1);
5083         btrfs_set_extent_inline_ref_type(leaf, iref, type);
5084         if (parent > 0) {
5085                 struct btrfs_shared_data_ref *ref;
5086                 ref = (struct btrfs_shared_data_ref *)(iref + 1);
5087                 btrfs_set_extent_inline_ref_offset(leaf, iref, parent);
5088                 btrfs_set_shared_data_ref_count(leaf, ref, ref_mod);
5089         } else {
5090                 struct btrfs_extent_data_ref *ref;
5091                 ref = (struct btrfs_extent_data_ref *)(&iref->offset);
5092                 btrfs_set_extent_data_ref_root(leaf, ref, root_objectid);
5093                 btrfs_set_extent_data_ref_objectid(leaf, ref, owner);
5094                 btrfs_set_extent_data_ref_offset(leaf, ref, offset);
5095                 btrfs_set_extent_data_ref_count(leaf, ref, ref_mod);
5096         }
5097
5098         btrfs_mark_buffer_dirty(path->nodes[0]);
5099         btrfs_free_path(path);
5100
5101         ret = update_block_group(trans, root, ins->objectid, ins->offset, 1);
5102         if (ret) {
5103                 printk(KERN_ERR "btrfs update block group failed for %llu "
5104                        "%llu\n", (unsigned long long)ins->objectid,
5105                        (unsigned long long)ins->offset);
5106                 BUG();
5107         }
5108         return ret;
5109 }
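/*
 * Layout sketch of the item inserted above.  Keyed case (parent == 0):
 *
 *   btrfs_extent_item | inline ref header | btrfs_extent_data_ref
 *                                           (root, objectid, offset,
 *                                            count)
 *
 * Shared case (parent > 0): the tail is a btrfs_shared_data_ref
 * holding only the count, with the parent stored in the inline ref
 * offset field.
 */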
5110
5111 static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
5112                                      struct btrfs_root *root,
5113                                      u64 parent, u64 root_objectid,
5114                                      u64 flags, struct btrfs_disk_key *key,
5115                                      int level, struct btrfs_key *ins)
5116 {
5117         int ret;
5118         struct btrfs_fs_info *fs_info = root->fs_info;
5119         struct btrfs_extent_item *extent_item;
5120         struct btrfs_tree_block_info *block_info;
5121         struct btrfs_extent_inline_ref *iref;
5122         struct btrfs_path *path;
5123         struct extent_buffer *leaf;
5124         u32 size = sizeof(*extent_item) + sizeof(*block_info) + sizeof(*iref);
5125
5126         path = btrfs_alloc_path();
5127         BUG_ON(!path);
5128
5129         path->leave_spinning = 1;
5130         ret = btrfs_insert_empty_item(trans, fs_info->extent_root, path,
5131                                       ins, size);
5132         BUG_ON(ret);
5133
5134         leaf = path->nodes[0];
5135         extent_item = btrfs_item_ptr(leaf, path->slots[0],
5136                                      struct btrfs_extent_item);
5137         btrfs_set_extent_refs(leaf, extent_item, 1);
5138         btrfs_set_extent_generation(leaf, extent_item, trans->transid);
5139         btrfs_set_extent_flags(leaf, extent_item,
5140                                flags | BTRFS_EXTENT_FLAG_TREE_BLOCK);
5141         block_info = (struct btrfs_tree_block_info *)(extent_item + 1);
5142
5143         btrfs_set_tree_block_key(leaf, block_info, key);
5144         btrfs_set_tree_block_level(leaf, block_info, level);
5145
5146         iref = (struct btrfs_extent_inline_ref *)(block_info + 1);
5147         if (parent > 0) {
5148                 BUG_ON(!(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF));
5149                 btrfs_set_extent_inline_ref_type(leaf, iref,
5150                                                  BTRFS_SHARED_BLOCK_REF_KEY);
5151                 btrfs_set_extent_inline_ref_offset(leaf, iref, parent);
5152         } else {
5153                 btrfs_set_extent_inline_ref_type(leaf, iref,
5154                                                  BTRFS_TREE_BLOCK_REF_KEY);
5155                 btrfs_set_extent_inline_ref_offset(leaf, iref, root_objectid);
5156         }
5157
5158         btrfs_mark_buffer_dirty(leaf);
5159         btrfs_free_path(path);
5160
5161         ret = update_block_group(trans, root, ins->objectid, ins->offset, 1);
5162         if (ret) {
5163                 printk(KERN_ERR "btrfs update block group failed for %llu "
5164                        "%llu\n", (unsigned long long)ins->objectid,
5165                        (unsigned long long)ins->offset);
5166                 BUG();
5167         }
5168         return ret;
5169 }
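/*
 * Layout sketch for the tree block case above:
 *
 *   btrfs_extent_item | btrfs_tree_block_info (key, level) | inline
 *   SHARED_BLOCK or TREE_BLOCK backref
 *
 * matching the item size computed from sizeof(*extent_item) +
 * sizeof(*block_info) + sizeof(*iref).
 */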
5170
5171 int btrfs_alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
5172                                      struct btrfs_root *root,
5173                                      u64 root_objectid, u64 owner,
5174                                      u64 offset, struct btrfs_key *ins)
5175 {
5176         int ret;
5177
5178         BUG_ON(root_objectid == BTRFS_TREE_LOG_OBJECTID);
5179
5180         ret = btrfs_add_delayed_data_ref(trans, ins->objectid, ins->offset,
5181                                          0, root_objectid, owner, offset,
5182                                          BTRFS_ADD_DELAYED_EXTENT, NULL);
5183         return ret;
5184 }
5185
5186 /*
5187  * this is used by the tree logging recovery code.  It records that
5188  * an extent has been allocated and makes sure to clear the free
5189  * space cache bits as well
5190  */
5191 int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans,
5192                                    struct btrfs_root *root,
5193                                    u64 root_objectid, u64 owner, u64 offset,
5194                                    struct btrfs_key *ins)
5195 {
5196         int ret;
5197         struct btrfs_block_group_cache *block_group;
5198         struct btrfs_caching_control *caching_ctl;
5199         u64 start = ins->objectid;
5200         u64 num_bytes = ins->offset;
5201
5202         block_group = btrfs_lookup_block_group(root->fs_info, ins->objectid);
5203         cache_block_group(block_group);
5204         caching_ctl = get_caching_control(block_group);
5205
5206         if (!caching_ctl) {
5207                 BUG_ON(!block_group_cache_done(block_group));
5208                 ret = btrfs_remove_free_space(block_group, start, num_bytes);
5209                 BUG_ON(ret);
5210         } else {
5211                 mutex_lock(&caching_ctl->mutex);
5212
5213                 if (start >= caching_ctl->progress) {
5214                         ret = add_excluded_extent(root, start, num_bytes);
5215                         BUG_ON(ret);
5216                 } else if (start + num_bytes <= caching_ctl->progress) {
5217                         ret = btrfs_remove_free_space(block_group,
5218                                                       start, num_bytes);
5219                         BUG_ON(ret);
5220                 } else {
5221                         num_bytes = caching_ctl->progress - start;
5222                         ret = btrfs_remove_free_space(block_group,
5223                                                       start, num_bytes);
5224                         BUG_ON(ret);
5225
5226                         start = caching_ctl->progress;
5227                         num_bytes = ins->objectid + ins->offset -
5228                                     caching_ctl->progress;
5229                         ret = add_excluded_extent(root, start, num_bytes);
5230                         BUG_ON(ret);
5231                 }
5232
5233                 mutex_unlock(&caching_ctl->mutex);
5234                 put_caching_control(caching_ctl);
5235         }
5236
5237         ret = update_reserved_bytes(block_group, ins->offset, 1, 1);
5238         BUG_ON(ret);
5239         btrfs_put_block_group(block_group);
5240         ret = alloc_reserved_file_extent(trans, root, 0, root_objectid,
5241                                          0, owner, offset, ins, 1);
5242         return ret;
5243 }
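/*
 * Worked example for the caching_ctl->progress cases above (made-up
 * byte offsets): with progress at 100M, a logged extent covering
 * [90M, 110M) straddles it, so [90M, 100M) is removed from the free
 * space cache while [100M, 110M) is added as an excluded range for
 * the caching thread to skip when it reaches it.
 */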
5244
5245 struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans,
5246                                             struct btrfs_root *root,
5247                                             u64 bytenr, u32 blocksize,
5248                                             int level)
5249 {
5250         struct extent_buffer *buf;
5251
5252         buf = btrfs_find_create_tree_block(root, bytenr, blocksize);
5253         if (!buf)
5254                 return ERR_PTR(-ENOMEM);
5255         btrfs_set_header_generation(buf, trans->transid);
5256         btrfs_set_buffer_lockdep_class(buf, level);
5257         btrfs_tree_lock(buf);
5258         clean_tree_block(trans, root, buf);
5259
5260         btrfs_set_lock_blocking(buf);
5261         btrfs_set_buffer_uptodate(buf);
5262
5263         if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID) {
5264                 /*
5265                  * we allow two log transactions at a time, so use different
5266                  * EXTENT bits to differentiate dirty pages.
5267                  */
5268                 if (root->log_transid % 2 == 0)
5269                         set_extent_dirty(&root->dirty_log_pages, buf->start,
5270                                         buf->start + buf->len - 1, GFP_NOFS);
5271                 else
5272                         set_extent_new(&root->dirty_log_pages, buf->start,
5273                                         buf->start + buf->len - 1, GFP_NOFS);
5274         } else {
5275                 set_extent_dirty(&trans->transaction->dirty_pages, buf->start,
5276                          buf->start + buf->len - 1, GFP_NOFS);
5277         }
5278         trans->blocks_used++;
5279         /* this returns a buffer locked for blocking */
5280         return buf;
5281 }
5282
5283 static struct btrfs_block_rsv *
5284 use_block_rsv(struct btrfs_trans_handle *trans,
5285               struct btrfs_root *root, u32 blocksize)
5286 {
5287         struct btrfs_block_rsv *block_rsv;
5288         int ret;
5289
5290         block_rsv = get_block_rsv(trans, root);
5291
5292         if (block_rsv->size == 0) {
5293                 ret = reserve_metadata_bytes(block_rsv, blocksize);
5294                 if (ret)
5295                         return ERR_PTR(ret);
5296                 return block_rsv;
5297         }
5298
5299         ret = block_rsv_use_bytes(block_rsv, blocksize);
5300         if (!ret)
5301                 return block_rsv;
5302
5303         WARN_ON(1);
5304         printk(KERN_INFO"block_rsv size %llu reserved %llu freed %llu %llu\n",
5305                 block_rsv->size, block_rsv->reserved,
5306                 block_rsv->freed[0], block_rsv->freed[1]);
5307
5308         return ERR_PTR(-ENOSPC);
5309 }
5310
5311 static void unuse_block_rsv(struct btrfs_block_rsv *block_rsv, u32 blocksize)
5312 {
5313         block_rsv_add_bytes(block_rsv, blocksize, 0);
5314         block_rsv_release_bytes(block_rsv, NULL, 0);
5315 }
5316
5317 /*
5318  * finds a free extent and does all the dirty work required for allocation.
5319  * returns the key for the extent through ins, and a tree buffer for
5320  * the first block of the extent through buf.
5321  *
5322  * returns the tree buffer or an ERR_PTR on failure.
5323  */
5324 struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
5325                                         struct btrfs_root *root, u32 blocksize,
5326                                         u64 parent, u64 root_objectid,
5327                                         struct btrfs_disk_key *key, int level,
5328                                         u64 hint, u64 empty_size)
5329 {
5330         struct btrfs_key ins;
5331         struct btrfs_block_rsv *block_rsv;
5332         struct extent_buffer *buf;
5333         u64 flags = 0;
5334         int ret;
5335
5336
5337         block_rsv = use_block_rsv(trans, root, blocksize);
5338         if (IS_ERR(block_rsv))
5339                 return ERR_CAST(block_rsv);
5340
5341         ret = btrfs_reserve_extent(trans, root, blocksize, blocksize,
5342                                    empty_size, hint, (u64)-1, &ins, 0);
5343         if (ret) {
5344                 unuse_block_rsv(block_rsv, blocksize);
5345                 return ERR_PTR(ret);
5346         }
5347
5348         buf = btrfs_init_new_buffer(trans, root, ins.objectid,
5349                                     blocksize, level);
5350         BUG_ON(IS_ERR(buf));
5351
5352         if (root_objectid == BTRFS_TREE_RELOC_OBJECTID) {
5353                 if (parent == 0)
5354                         parent = ins.objectid;
5355                 flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF;
5356         } else
5357                 BUG_ON(parent > 0);
5358
5359         if (root_objectid != BTRFS_TREE_LOG_OBJECTID) {
5360                 struct btrfs_delayed_extent_op *extent_op;
5361                 extent_op = kmalloc(sizeof(*extent_op), GFP_NOFS);
5362                 BUG_ON(!extent_op);
5363                 if (key)
5364                         memcpy(&extent_op->key, key, sizeof(extent_op->key));
5365                 else
5366                         memset(&extent_op->key, 0, sizeof(extent_op->key));
5367                 extent_op->flags_to_set = flags;
5368                 extent_op->update_key = 1;
5369                 extent_op->update_flags = 1;
5370                 extent_op->is_data = 0;
5371
5372                 ret = btrfs_add_delayed_tree_ref(trans, ins.objectid,
5373                                         ins.offset, parent, root_objectid,
5374                                         level, BTRFS_ADD_DELAYED_EXTENT,
5375                                         extent_op);
5376                 BUG_ON(ret);
5377         }
5378         return buf;
5379 }
5380
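     /*
      * walk_control carries the per-walk state for dropping a snapshot:
      * cached extent refcounts and flags for each level, the current stage
      * (see DROP_REFERENCE/UPDATE_BACKREF below), and the state of the
      * readahead window.
      */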
5381 struct walk_control {
5382         u64 refs[BTRFS_MAX_LEVEL];
5383         u64 flags[BTRFS_MAX_LEVEL];
5384         struct btrfs_key update_progress;
5385         int stage;
5386         int level;
5387         int shared_level;
5388         int update_ref;
5389         int keep_locks;
5390         int reada_slot;
5391         int reada_count;
5392 };
5393
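     /*
      * DROP_REFERENCE: drop our reference on each block as we walk down.
      * UPDATE_BACKREF: convert a shared subtree to full backrefs before
      * the walk goes back to dropping references.
      */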
5394 #define DROP_REFERENCE  1
5395 #define UPDATE_BACKREF  2
5396
5397 static noinline void reada_walk_down(struct btrfs_trans_handle *trans,
5398                                      struct btrfs_root *root,
5399                                      struct walk_control *wc,
5400                                      struct btrfs_path *path)
5401 {
5402         u64 bytenr;
5403         u64 generation;
5404         u64 refs;
5405         u64 flags;
5406         u64 last = 0;
5407         u32 nritems;
5408         u32 blocksize;
5409         struct btrfs_key key;
5410         struct extent_buffer *eb;
5411         int ret;
5412         int slot;
5413         int nread = 0;
5414
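             /*
              * adapt the readahead window: shrink it while processing lags
              * behind the last readahead position, otherwise grow it,
              * capped at one full node of pointers.
              */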
5415         if (path->slots[wc->level] < wc->reada_slot) {
5416                 wc->reada_count = wc->reada_count * 2 / 3;
5417                 wc->reada_count = max(wc->reada_count, 2);
5418         } else {
5419                 wc->reada_count = wc->reada_count * 3 / 2;
5420                 wc->reada_count = min_t(int, wc->reada_count,
5421                                         BTRFS_NODEPTRS_PER_BLOCK(root));
5422         }
5423
5424         eb = path->nodes[wc->level];
5425         nritems = btrfs_header_nritems(eb);
5426         blocksize = btrfs_level_size(root, wc->level - 1);
5427
5428         for (slot = path->slots[wc->level]; slot < nritems; slot++) {
5429                 if (nread >= wc->reada_count)
5430                         break;
5431
5432                 cond_resched();
5433                 bytenr = btrfs_node_blockptr(eb, slot);
5434                 generation = btrfs_node_ptr_generation(eb, slot);
5435
5436                 if (slot == path->slots[wc->level])
5437                         goto reada;
5438
5439                 if (wc->stage == UPDATE_BACKREF &&
5440                     generation <= root->root_key.offset)
5441                         continue;
5442
5443                 /* we don't lock the tree block; it's OK to be racy here */
5444                 ret = btrfs_lookup_extent_info(trans, root, bytenr, blocksize,
5445                                                &refs, &flags);
5446                 BUG_ON(ret);
5447                 BUG_ON(refs == 0);
5448
5449                 if (wc->stage == DROP_REFERENCE) {
5450                         if (refs == 1)
5451                                 goto reada;
5452
5453                         if (wc->level == 1 &&
5454                             (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF))
5455                                 continue;
5456                         if (!wc->update_ref ||
5457                             generation <= root->root_key.offset)
5458                                 continue;
5459                         btrfs_node_key_to_cpu(eb, &key, slot);
5460                         ret = btrfs_comp_cpu_keys(&key,
5461                                                   &wc->update_progress);
5462                         if (ret < 0)
5463                                 continue;
5464                 } else {
5465                         if (wc->level == 1 &&
5466                             (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF))
5467                                 continue;
5468                 }
5469 reada:
5470                 ret = readahead_tree_block(root, bytenr, blocksize,
5471                                            generation);
5472                 if (ret)
5473                         break;
5474                 last = bytenr + blocksize;
5475                 nread++;
5476         }
5477         wc->reada_slot = slot;
5478 }
5479
5480 /*
5481  * helper to process tree block while walking down the tree.
5482  *
5483  * when wc->stage == UPDATE_BACKREF, this function updates
5484  * back refs for pointers in the block.
5485  *
5486  * NOTE: return value 1 means we should stop walking down.
5487  */
5488 static noinline int walk_down_proc(struct btrfs_trans_handle *trans,
5489                                    struct btrfs_root *root,
5490                                    struct btrfs_path *path,
5491                                    struct walk_control *wc, int lookup_info)
5492 {
5493         int level = wc->level;
5494         struct extent_buffer *eb = path->nodes[level];
5495         u64 flag = BTRFS_BLOCK_FLAG_FULL_BACKREF;
5496         int ret;
5497
5498         if (wc->stage == UPDATE_BACKREF &&
5499             btrfs_header_owner(eb) != root->root_key.objectid)
5500                 return 1;
5501
5502         /*
5503          * when the reference count of a tree block is 1, it won't increase
5504          * again. once the full backref flag is set, we never clear it.
5505          */
5506         if (lookup_info &&
5507             ((wc->stage == DROP_REFERENCE && wc->refs[level] != 1) ||
5508              (wc->stage == UPDATE_BACKREF && !(wc->flags[level] & flag)))) {
5509                 BUG_ON(!path->locks[level]);
5510                 ret = btrfs_lookup_extent_info(trans, root,
5511                                                eb->start, eb->len,
5512                                                &wc->refs[level],
5513                                                &wc->flags[level]);
5514                 BUG_ON(ret);
5515                 BUG_ON(wc->refs[level] == 0);
5516         }
5517
5518         if (wc->stage == DROP_REFERENCE) {
5519                 if (wc->refs[level] > 1)
5520                         return 1;
5521
5522                 if (path->locks[level] && !wc->keep_locks) {
5523                         btrfs_tree_unlock(eb);
5524                         path->locks[level] = 0;
5525                 }
5526                 return 0;
5527         }
5528
5529         /* wc->stage == UPDATE_BACKREF */
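             /*
              * the pointers in this block still use implicit backrefs: add
              * a full backref for each of them, drop the implicit one, and
              * set the FULL_BACKREF flag on the extent item.
              */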
5530         if (!(wc->flags[level] & flag)) {
5531                 BUG_ON(!path->locks[level]);
5532                 ret = btrfs_inc_ref(trans, root, eb, 1);
5533                 BUG_ON(ret);
5534                 ret = btrfs_dec_ref(trans, root, eb, 0);
5535                 BUG_ON(ret);
5536                 ret = btrfs_set_disk_extent_flags(trans, root, eb->start,
5537                                                   eb->len, flag, 0);
5538                 BUG_ON(ret);
5539                 wc->flags[level] |= flag;
5540         }
5541
5542         /*
5543          * the block is shared by multiple trees, so it's not good to
5544          * keep the tree lock
5545          */
5546         if (path->locks[level] && level > 0) {
5547                 btrfs_tree_unlock(eb);
5548                 path->locks[level] = 0;
5549         }
5550         return 0;
5551 }
5552
5553 /*
5554  * helper to process tree block pointer.
5555  *
5556  * when wc->stage == DROP_REFERENCE, this function checks
5557  * reference count of the block pointed to. if the block
5558  * is shared and we need update back refs for the subtree
5559  * rooted at the block, this function changes wc->stage to
5560  * UPDATE_BACKREF. if the block is shared and there is no
5561  * need to update backrefs, this function drops the reference
5562  * to the block.
5563  *
5564  * NOTE: return value 1 means we should stop walking down.
5565  */
5566 static noinline int do_walk_down(struct btrfs_trans_handle *trans,
5567                                  struct btrfs_root *root,
5568                                  struct btrfs_path *path,
5569                                  struct walk_control *wc, int *lookup_info)
5570 {
5571         u64 bytenr;
5572         u64 generation;
5573         u64 parent;
5574         u32 blocksize;
5575         struct btrfs_key key;
5576         struct extent_buffer *next;
5577         int level = wc->level;
5578         int reada = 0;
5579         int ret = 0;
5580
5581         generation = btrfs_node_ptr_generation(path->nodes[level],
5582                                                path->slots[level]);
5583         /*
5584          * if the lower level block was created before the snapshot
5585          * was created, we know there is no need to update back refs
5586          * for the subtree
5587          */
5588         if (wc->stage == UPDATE_BACKREF &&
5589             generation <= root->root_key.offset) {
5590                 *lookup_info = 1;
5591                 return 1;
5592         }
5593
5594         bytenr = btrfs_node_blockptr(path->nodes[level], path->slots[level]);
5595         blocksize = btrfs_level_size(root, level - 1);
5596
5597         next = btrfs_find_tree_block(root, bytenr, blocksize);
5598         if (!next) {
5599                 next = btrfs_find_create_tree_block(root, bytenr, blocksize);
5600                 if (!next)
5601                         return -ENOMEM;
5602                 reada = 1;
5603         }
5604         btrfs_tree_lock(next);
5605         btrfs_set_lock_blocking(next);
5606
5607         ret = btrfs_lookup_extent_info(trans, root, bytenr, blocksize,
5608                                        &wc->refs[level - 1],
5609                                        &wc->flags[level - 1]);
5610         BUG_ON(ret);
5611         BUG_ON(wc->refs[level - 1] == 0);
5612         *lookup_info = 0;
5613
5614         if (wc->stage == DROP_REFERENCE) {
5615                 if (wc->refs[level - 1] > 1) {
5616                         if (level == 1 &&
5617                             (wc->flags[0] & BTRFS_BLOCK_FLAG_FULL_BACKREF))
5618                                 goto skip;
5619
5620                         if (!wc->update_ref ||
5621                             generation <= root->root_key.offset)
5622                                 goto skip;
5623
5624                         btrfs_node_key_to_cpu(path->nodes[level], &key,
5625                                               path->slots[level]);
5626                         ret = btrfs_comp_cpu_keys(&key, &wc->update_progress);
5627                         if (ret < 0)
5628                                 goto skip;
5629
5630                         wc->stage = UPDATE_BACKREF;
5631                         wc->shared_level = level - 1;
5632                 }
5633         } else {
5634                 if (level == 1 &&
5635                     (wc->flags[0] & BTRFS_BLOCK_FLAG_FULL_BACKREF))
5636                         goto skip;
5637         }
5638
5639         if (!btrfs_buffer_uptodate(next, generation)) {
5640                 btrfs_tree_unlock(next);
5641                 free_extent_buffer(next);
5642                 next = NULL;
5643                 *lookup_info = 1;
5644         }
5645
5646         if (!next) {
5647                 if (reada && level == 1)
5648                         reada_walk_down(trans, root, wc, path);
5649                 next = read_tree_block(root, bytenr, blocksize, generation);
5650                 btrfs_tree_lock(next);
5651                 btrfs_set_lock_blocking(next);
5652         }
5653
5654         level--;
5655         BUG_ON(level != btrfs_header_level(next));
5656         path->nodes[level] = next;
5657         path->slots[level] = 0;
5658         path->locks[level] = 1;
5659         wc->level = level;
5660         if (wc->level == 1)
5661                 wc->reada_slot = 0;
5662         return 0;
5663 skip:
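             /*
              * we're not walking into the child block. if we're dropping
              * references, release the one this node holds on the child.
              */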
5664         wc->refs[level - 1] = 0;
5665         wc->flags[level - 1] = 0;
5666         if (wc->stage == DROP_REFERENCE) {
5667                 if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF) {
5668                         parent = path->nodes[level]->start;
5669                 } else {
5670                         BUG_ON(root->root_key.objectid !=
5671                                btrfs_header_owner(path->nodes[level]));
5672                         parent = 0;
5673                 }
5674
5675                 ret = btrfs_free_extent(trans, root, bytenr, blocksize, parent,
5676                                         root->root_key.objectid, level - 1, 0);
5677                 BUG_ON(ret);
5678         }
5679         btrfs_tree_unlock(next);
5680         free_extent_buffer(next);
5681         *lookup_info = 1;
5682         return 1;
5683 }
5684
5685 /*
5686  * helper to process tree block while walking up the tree.
5687  *
5688  * when wc->stage == DROP_REFERENCE, this function drops
5689  * reference count on the block.
5690  *
5691  * when wc->stage == UPDATE_BACKREF, this function changes
5692  * wc->stage back to DROP_REFERENCE if we changed wc->stage
5693  * to UPDATE_BACKREF previously while processing the block.
5694  *
5695  * NOTE: return value 1 means we should stop walking up.
5696  */
5697 static noinline int walk_up_proc(struct btrfs_trans_handle *trans,
5698                                  struct btrfs_root *root,
5699                                  struct btrfs_path *path,
5700                                  struct walk_control *wc)
5701 {
5702         int ret;
5703         int level = wc->level;
5704         struct extent_buffer *eb = path->nodes[level];
5705         u64 parent = 0;
5706
5707         if (wc->stage == UPDATE_BACKREF) {
5708                 BUG_ON(wc->shared_level < level);
5709                 if (level < wc->shared_level)
5710                         goto out;
5711
5712                 ret = find_next_key(path, level + 1, &wc->update_progress);
5713                 if (ret > 0)
5714                         wc->update_ref = 0;
5715
5716                 wc->stage = DROP_REFERENCE;
5717                 wc->shared_level = -1;
5718                 path->slots[level] = 0;
5719
5720                 /*
5721                  * check the reference count again if the block isn't locked.
5722                  * we should start walking down the tree again if the
5723                  * reference count is one.
5724                  */
5725                 if (!path->locks[level]) {
5726                         BUG_ON(level == 0);
5727                         btrfs_tree_lock(eb);
5728                         btrfs_set_lock_blocking(eb);
5729                         path->locks[level] = 1;
5730
5731                         ret = btrfs_lookup_extent_info(trans, root,
5732                                                        eb->start, eb->len,
5733                                                        &wc->refs[level],
5734                                                        &wc->flags[level]);
5735                         BUG_ON(ret);
5736                         BUG_ON(wc->refs[level] == 0);
5737                         if (wc->refs[level] == 1) {
5738                                 btrfs_tree_unlock(eb);
5739                                 path->locks[level] = 0;
5740                                 return 1;
5741                         }
5742                 }
5743         }
5744
5745         /* wc->stage == DROP_REFERENCE */
5746         BUG_ON(wc->refs[level] > 1 && !path->locks[level]);
5747
5748         if (wc->refs[level] == 1) {
5749                 if (level == 0) {
5750                         if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF)
5751                                 ret = btrfs_dec_ref(trans, root, eb, 1);
5752                         else
5753                                 ret = btrfs_dec_ref(trans, root, eb, 0);
5754                         BUG_ON(ret);
5755                 }
5756                 /* make the block-locked assertion in clean_tree_block happy */
5757                 if (!path->locks[level] &&
5758                     btrfs_header_generation(eb) == trans->transid) {
5759                         btrfs_tree_lock(eb);
5760                         btrfs_set_lock_blocking(eb);
5761                         path->locks[level] = 1;
5762                 }
5763                 clean_tree_block(trans, root, eb);
5764         }
5765
5766         if (eb == root->node) {
5767                 if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF)
5768                         parent = eb->start;
5769                 else
5770                         BUG_ON(root->root_key.objectid !=
5771                                btrfs_header_owner(eb));
5772         } else {
5773                 if (wc->flags[level + 1] & BTRFS_BLOCK_FLAG_FULL_BACKREF)
5774                         parent = path->nodes[level + 1]->start;
5775                 else
5776                         BUG_ON(root->root_key.objectid !=
5777                                btrfs_header_owner(path->nodes[level + 1]));
5778         }
5779
5780         btrfs_free_tree_block(trans, root, eb, parent, wc->refs[level] == 1);
5781 out:
5782         wc->refs[level] = 0;
5783         wc->flags[level] = 0;
5784         return 0;
5785 }
5786
5787 static noinline int walk_down_tree(struct btrfs_trans_handle *trans,
5788                                    struct btrfs_root *root,
5789                                    struct btrfs_path *path,
5790                                    struct walk_control *wc)
5791 {
5792         int level = wc->level;
5793         int lookup_info = 1;
5794         int ret;
5795
5796         while (level >= 0) {
5797                 ret = walk_down_proc(trans, root, path, wc, lookup_info);
5798                 if (ret > 0)
5799                         break;
5800
5801                 if (level == 0)
5802                         break;
5803
5804                 if (path->slots[level] >=
5805                     btrfs_header_nritems(path->nodes[level]))
5806                         break;
5807
5808                 ret = do_walk_down(trans, root, path, wc, &lookup_info);
5809                 if (ret > 0) {
5810                         path->slots[level]++;
5811                         continue;
5812                 } else if (ret < 0)
5813                         return ret;
5814                 level = wc->level;
5815         }
5816         return 0;
5817 }
5818
5819 static noinline int walk_up_tree(struct btrfs_trans_handle *trans,
5820                                  struct btrfs_root *root,
5821                                  struct btrfs_path *path,
5822                                  struct walk_control *wc, int max_level)
5823 {
5824         int level = wc->level;
5825         int ret;
5826
5827         path->slots[level] = btrfs_header_nritems(path->nodes[level]);
5828         while (level < max_level && path->nodes[level]) {
5829                 wc->level = level;
5830                 if (path->slots[level] + 1 <
5831                     btrfs_header_nritems(path->nodes[level])) {
5832                         path->slots[level]++;
5833                         return 0;
5834                 } else {
5835                         ret = walk_up_proc(trans, root, path, wc);
5836                         if (ret > 0)
5837                                 return 0;
5838
5839                         if (path->locks[level]) {
5840                                 btrfs_tree_unlock(path->nodes[level]);
5841                                 path->locks[level] = 0;
5842                         }
5843                         free_extent_buffer(path->nodes[level]);
5844                         path->nodes[level] = NULL;
5845                         level++;
5846                 }
5847         }
5848         return 1;
5849 }
5850
5851 /*
5852  * drop a subvolume tree.
5853  *
5854  * this function traverses the tree freeing any blocks that are only
5855  * referenced by the tree.
5856  *
5857  * when a shared tree block is found, this function decreases its
5858  * reference count by one. if update_ref is true, this function
5859  * also makes sure backrefs for the shared block and all lower level
5860  * blocks are properly updated.
5861  */
5862 int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref)
5863 {
5864         struct btrfs_path *path;
5865         struct btrfs_trans_handle *trans;
5866         struct btrfs_root *tree_root = root->fs_info->tree_root;
5867         struct btrfs_root_item *root_item = &root->root_item;
5868         struct walk_control *wc;
5869         struct btrfs_key key;
5870         int err = 0;
5871         int ret;
5872         int level;
5873
5874         path = btrfs_alloc_path();
5875         BUG_ON(!path);
5876
5877         wc = kzalloc(sizeof(*wc), GFP_NOFS);
5878         BUG_ON(!wc);
5879
5880         trans = btrfs_start_transaction(tree_root, 0);
             BUG_ON(IS_ERR(trans));
5881
5882         if (btrfs_disk_key_objectid(&root_item->drop_progress) == 0) {
5883                 level = btrfs_header_level(root->node);
5884                 path->nodes[level] = btrfs_lock_root_node(root);
5885                 btrfs_set_lock_blocking(path->nodes[level]);
5886                 path->slots[level] = 0;
5887                 path->locks[level] = 1;
5888                 memset(&wc->update_progress, 0,
5889                        sizeof(wc->update_progress));
5890         } else {
5891                 btrfs_disk_key_to_cpu(&key, &root_item->drop_progress);
5892                 memcpy(&wc->update_progress, &key,
5893                        sizeof(wc->update_progress));
5894
5895                 level = root_item->drop_level;
5896                 BUG_ON(level == 0);
5897                 path->lowest_level = level;
5898                 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
5899                 path->lowest_level = 0;
5900                 if (ret < 0) {
5901                         err = ret;
5902                         goto out;
5903                 }
5904                 WARN_ON(ret > 0);
5905
5906                 /*
5907                  * unlock our path; this is safe because only this
5908                  * function is allowed to delete this snapshot
5909                  */
5910                 btrfs_unlock_up_safe(path, 0);
5911
5912                 level = btrfs_header_level(root->node);
5913                 while (1) {
5914                         btrfs_tree_lock(path->nodes[level]);
5915                         btrfs_set_lock_blocking(path->nodes[level]);
5916
5917                         ret = btrfs_lookup_extent_info(trans, root,
5918                                                 path->nodes[level]->start,
5919                                                 path->nodes[level]->len,
5920                                                 &wc->refs[level],
5921                                                 &wc->flags[level]);
5922                         BUG_ON(ret);
5923                         BUG_ON(wc->refs[level] == 0);
5924
5925                         if (level == root_item->drop_level)
5926                                 break;
5927
5928                         btrfs_tree_unlock(path->nodes[level]);
5929                         WARN_ON(wc->refs[level] != 1);
5930                         level--;
5931                 }
5932         }
5933
5934         wc->level = level;
5935         wc->shared_level = -1;
5936         wc->stage = DROP_REFERENCE;
5937         wc->update_ref = update_ref;
5938         wc->keep_locks = 0;
5939         wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(root);
5940
5941         while (1) {
5942                 ret = walk_down_tree(trans, root, path, wc);
5943                 if (ret < 0) {
5944                         err = ret;
5945                         break;
5946                 }
5947
5948                 ret = walk_up_tree(trans, root, path, wc, BTRFS_MAX_LEVEL);
5949                 if (ret < 0) {
5950                         err = ret;
5951                         break;
5952                 }
5953
5954                 if (ret > 0) {
5955                         BUG_ON(wc->stage != DROP_REFERENCE);
5956                         break;
5957                 }
5958
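                     /*
                      * record how far the walk has gotten in the root item
                      * so the drop can resume from here after a restart.
                      */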
5959                 if (wc->stage == DROP_REFERENCE) {
5960                         level = wc->level;
5961                         btrfs_node_key(path->nodes[level],
5962                                        &root_item->drop_progress,
5963                                        path->slots[level]);
5964                         root_item->drop_level = level;
5965                 }
5966
5967                 BUG_ON(wc->level == 0);
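                     /*
                      * a transaction commit is in progress or delayed refs
                      * are being flushed: write out our progress, then
                      * restart with a fresh transaction handle.
                      */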
5968                 if (trans->transaction->in_commit ||
5969                     trans->transaction->delayed_refs.flushing) {
5970                         ret = btrfs_update_root(trans, tree_root,
5971                                                 &root->root_key,
5972                                                 root_item);
5973                         BUG_ON(ret);
5974
5975                         btrfs_end_transaction(trans, tree_root);
5976                         trans = btrfs_start_transaction(tree_root, 0);
5977                         if (IS_ERR(trans))
5978                                 return PTR_ERR(trans);
5979                 } else {
5980                         unsigned long update;
5981                         update = trans->delayed_ref_updates;
5982                         trans->delayed_ref_updates = 0;
5983                         if (update)
5984                                 btrfs_run_delayed_refs(trans, tree_root,
5985                                                        update);
5986                 }
5987         }
5988         btrfs_release_path(root, path);
5989         BUG_ON(err);
5990
5991         ret = btrfs_del_root(trans, tree_root, &root->root_key);
5992         BUG_ON(ret);
5993
5994         if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) {
5995                 ret = btrfs_find_last_root(tree_root, root->root_key.objectid,
5996                                            NULL, NULL);
5997                 BUG_ON(ret < 0);
5998                 if (ret > 0) {
5999                         ret = btrfs_del_orphan_item(trans, tree_root,
6000                                                     root->root_key.objectid);
6001                         BUG_ON(ret);
6002                 }
6003         }
6004
6005         if (root->in_radix) {
6006                 btrfs_free_fs_root(tree_root->fs_info, root);
6007         } else {
6008                 free_extent_buffer(root->node);
6009                 free_extent_buffer(root->commit_root);
6010                 kfree(root);
6011         }
6012 out:
6013         btrfs_end_transaction(trans, tree_root);
6014         kfree(wc);
6015         btrfs_free_path(path);
6016         return err;
6017 }
6018
6019 /*
6020  * drop subtree rooted at tree block 'node'.
6021  *
6022  * NOTE: this function will unlock and release tree block 'node'
6023  */
6024 int btrfs_drop_subtree(struct btrfs_trans_handle *trans,
6025                         struct btrfs_root *root,
6026                         struct extent_buffer *node,
6027                         struct extent_buffer *parent)
6028 {
6029         struct btrfs_path *path;
6030         struct walk_control *wc;
6031         int level;
6032         int parent_level;
6033         int ret = 0;
6034         int wret;
6035
6036         BUG_ON(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID);
6037
6038         path = btrfs_alloc_path();
6039         BUG_ON(!path);
6040
6041         wc = kzalloc(sizeof(*wc), GFP_NOFS);
6042         BUG_ON(!wc);
6043
6044         btrfs_assert_tree_locked(parent);
6045         parent_level = btrfs_header_level(parent);
6046         extent_buffer_get(parent);
6047         path->nodes[parent_level] = parent;
6048         path->slots[parent_level] = btrfs_header_nritems(parent);
6049
6050         btrfs_assert_tree_locked(node);
6051         level = btrfs_header_level(node);
6052         path->nodes[level] = node;
6053         path->slots[level] = 0;
6054         path->locks[level] = 1;
6055
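             /*
              * seed the walk state for 'parent': reloc trees keep a single
              * reference with full backrefs on the blocks they own.
              */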
6056         wc->refs[parent_level] = 1;
6057         wc->flags[parent_level] = BTRFS_BLOCK_FLAG_FULL_BACKREF;
6058         wc->level = level;
6059         wc->shared_level = -1;
6060         wc->stage = DROP_REFERENCE;
6061         wc->update_ref = 0;
6062         wc->keep_locks = 1;
6063         wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(root);
6064
6065         while (1) {
6066                 wret = walk_down_tree(trans, root, path, wc);
6067                 if (wret < 0) {
6068                         ret = wret;
6069                         break;
6070                 }
6071
6072                 wret = walk_up_tree(trans, root, path, wc, parent_level);
6073                 if (wret < 0)
6074                         ret = wret;
6075                 if (wret != 0)
6076                         break;
6077         }
6078
6079         kfree(wc);
6080         btrfs_free_path(path);
6081         return ret;
6082 }
6083
6084 #if 0
6085 static unsigned long calc_ra(unsigned long start, unsigned long last,
6086                              unsigned long nr)
6087 {
6088         return min(last, start + nr - 1);
6089 }
6090
6091 static noinline int relocate_inode_pages(struct inode *inode, u64 start,
6092                                          u64 len)
6093 {
6094         u64 page_start;
6095         u64 page_end;
6096         unsigned long first_index;
6097         unsigned long last_index;
6098         unsigned long i;
6099         struct page *page;
6100         struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
6101         struct file_ra_state *ra;
6102         struct btrfs_ordered_extent *ordered;
6103         unsigned int total_read = 0;
6104         unsigned int total_dirty = 0;
6105         int ret = 0;
6106
6107         ra = kzalloc(sizeof(*ra), GFP_NOFS);
             if (!ra)
                     return -ENOMEM;
6108
6109         mutex_lock(&inode->i_mutex);
6110         first_index = start >> PAGE_CACHE_SHIFT;
6111         last_index = (start + len - 1) >> PAGE_CACHE_SHIFT;
6112
6113         /* make sure the dirty trick played by the caller works */
6114         ret = invalidate_inode_pages2_range(inode->i_mapping,
6115                                             first_index, last_index);
6116         if (ret)
6117                 goto out_unlock;
6118
6119         file_ra_state_init(ra, inode->i_mapping);
6120
6121         for (i = first_index; i <= last_index; i++) {
6122                 if (total_read % ra->ra_pages == 0) {
6123                         btrfs_force_ra(inode->i_mapping, ra, NULL, i,
6124                                        calc_ra(i, last_index, ra->ra_pages));
6125                 }
6126                 total_read++;
6127 again:
6128                 if (((u64)i << PAGE_CACHE_SHIFT) > i_size_read(inode))
6129                         BUG();
6130                 page = grab_cache_page(inode->i_mapping, i);
6131                 if (!page) {
6132                         ret = -ENOMEM;
6133                         goto out_unlock;
6134                 }
6135                 if (!PageUptodate(page)) {
6136                         btrfs_readpage(NULL, page);
6137                         lock_page(page);
6138                         if (!PageUptodate(page)) {
6139                                 unlock_page(page);
6140                                 page_cache_release(page);
6141                                 ret = -EIO;
6142                                 goto out_unlock;
6143                         }
6144                 }
6145                 wait_on_page_writeback(page);
6146
6147                 page_start = (u64)page->index << PAGE_CACHE_SHIFT;
6148                 page_end = page_start + PAGE_CACHE_SIZE - 1;
6149                 lock_extent(io_tree, page_start, page_end, GFP_NOFS);
6150
6151                 ordered = btrfs_lookup_ordered_extent(inode, page_start);
6152                 if (ordered) {
6153                         unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
6154                         unlock_page(page);
6155                         page_cache_release(page);
6156                         btrfs_start_ordered_extent(inode, ordered, 1);
6157                         btrfs_put_ordered_extent(ordered);
6158                         goto again;
6159                 }
6160                 set_page_extent_mapped(page);
6161
6162                 if (i == first_index)
6163                         set_extent_bits(io_tree, page_start, page_end,
6164                                         EXTENT_BOUNDARY, GFP_NOFS);
6165                 btrfs_set_extent_delalloc(inode, page_start, page_end);
6166
6167                 set_page_dirty(page);
6168                 total_dirty++;
6169
6170                 unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
6171                 unlock_page(page);
6172                 page_cache_release(page);
6173         }
6174
6175 out_unlock:
6176         kfree(ra);
6177         mutex_unlock(&inode->i_mutex);
6178         balance_dirty_pages_ratelimited_nr(inode->i_mapping, total_dirty);
6179         return ret;
6180 }
6181
6182 static noinline int relocate_data_extent(struct inode *reloc_inode,
6183                                          struct btrfs_key *extent_key,
6184                                          u64 offset)
6185 {
6186         struct btrfs_root *root = BTRFS_I(reloc_inode)->root;
6187         struct extent_map_tree *em_tree = &BTRFS_I(reloc_inode)->extent_tree;
6188         struct extent_map *em;
6189         u64 start = extent_key->objectid - offset;
6190         u64 end = start + extent_key->offset - 1;
6191
6192         em = alloc_extent_map(GFP_NOFS);
6193         BUG_ON(!em || IS_ERR(em));
6194
6195         em->start = start;
6196         em->len = extent_key->offset;
6197         em->block_len = extent_key->offset;
6198         em->block_start = extent_key->objectid;
6199         em->bdev = root->fs_info->fs_devices->latest_bdev;
6200         set_bit(EXTENT_FLAG_PINNED, &em->flags);
6201
6202         /* set up an extent map to cheat btrfs_readpage */
6203         lock_extent(&BTRFS_I(reloc_inode)->io_tree, start, end, GFP_NOFS);
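             /*
              * add_extent_mapping returns -EEXIST while old mappings overlap
              * the range; drop them until the pinned mapping goes in.
              */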
6204         while (1) {
6205                 int ret;
6206                 write_lock(&em_tree->lock);
6207                 ret = add_extent_mapping(em_tree, em);
6208                 write_unlock(&em_tree->lock);
6209                 if (ret != -EEXIST) {
6210                         free_extent_map(em);
6211                         break;
6212                 }
6213                 btrfs_drop_extent_cache(reloc_inode, start, end, 0);
6214         }
6215         unlock_extent(&BTRFS_I(reloc_inode)->io_tree, start, end, GFP_NOFS);
6216
6217         return relocate_inode_pages(reloc_inode, start, extent_key->offset);
6218 }
6219
6220 struct btrfs_ref_path {
6221         u64 extent_start;
6222         u64 nodes[BTRFS_MAX_LEVEL];
6223         u64 root_objectid;
6224         u64 root_generation;
6225         u64 owner_objectid;
6226         u32 num_refs;
6227         int lowest_level;
6228         int current_level;
6229         int shared_level;
6230
6231         struct btrfs_key node_keys[BTRFS_MAX_LEVEL];
6232         u64 new_nodes[BTRFS_MAX_LEVEL];
6233 };
6234
6235 struct disk_extent {
6236         u64 ram_bytes;
6237         u64 disk_bytenr;
6238         u64 disk_num_bytes;
6239         u64 offset;
6240         u64 num_bytes;
6241         u8 compression;
6242         u8 encryption;
6243         u16 other_encoding;
6244 };
6245
6246 static int is_cowonly_root(u64 root_objectid)
6247 {
6248         if (root_objectid == BTRFS_ROOT_TREE_OBJECTID ||
6249             root_objectid == BTRFS_EXTENT_TREE_OBJECTID ||
6250             root_objectid == BTRFS_CHUNK_TREE_OBJECTID ||
6251             root_objectid == BTRFS_DEV_TREE_OBJECTID ||
6252             root_objectid == BTRFS_TREE_LOG_OBJECTID ||
6253             root_objectid == BTRFS_CSUM_TREE_OBJECTID)
6254                 return 1;
6255         return 0;
6256 }
6257
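     /*
      * walk one step along the backref chain of ref_path->extent_start,
      * recording the chain of tree nodes from the extent up toward its
      * root. returns 0 when a root is reached, 1 when there are no more
      * reference paths, and a negative errno on error.
      */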
6258 static noinline int __next_ref_path(struct btrfs_trans_handle *trans,
6259                                     struct btrfs_root *extent_root,
6260                                     struct btrfs_ref_path *ref_path,
6261                                     int first_time)
6262 {
6263         struct extent_buffer *leaf;
6264         struct btrfs_path *path;
6265         struct btrfs_extent_ref *ref;
6266         struct btrfs_key key;
6267         struct btrfs_key found_key;
6268         u64 bytenr;
6269         u32 nritems;
6270         int level;
6271         int ret = 1;
6272
6273         path = btrfs_alloc_path();
6274         if (!path)
6275                 return -ENOMEM;
6276
6277         if (first_time) {
6278                 ref_path->lowest_level = -1;
6279                 ref_path->current_level = -1;
6280                 ref_path->shared_level = -1;
6281                 goto walk_up;
6282         }
6283 walk_down:
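             /*
              * step down from the current level looking for the next backref
              * (key offset greater than the old parent) at each node.
              */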
6284         level = ref_path->current_level - 1;
6285         while (level >= -1) {
6286                 u64 parent;
6287                 if (level < ref_path->lowest_level)
6288                         break;
6289
6290                 if (level >= 0)
6291                         bytenr = ref_path->nodes[level];
6292                 else
6293                         bytenr = ref_path->extent_start;
6294                 BUG_ON(bytenr == 0);
6295
6296                 parent = ref_path->nodes[level + 1];
6297                 ref_path->nodes[level + 1] = 0;
6298                 ref_path->current_level = level;
6299                 BUG_ON(parent == 0);
6300
6301                 key.objectid = bytenr;
6302                 key.offset = parent + 1;
6303                 key.type = BTRFS_EXTENT_REF_KEY;
6304
6305                 ret = btrfs_search_slot(trans, extent_root, &key, path, 0, 0);
6306                 if (ret < 0)
6307                         goto out;
6308                 BUG_ON(ret == 0);
6309
6310                 leaf = path->nodes[0];
6311                 nritems = btrfs_header_nritems(leaf);
6312                 if (path->slots[0] >= nritems) {
6313                         ret = btrfs_next_leaf(extent_root, path);
6314                         if (ret < 0)
6315                                 goto out;
6316                         if (ret > 0)
6317                                 goto next;
6318                         leaf = path->nodes[0];
6319                 }
6320
6321                 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
6322                 if (found_key.objectid == bytenr &&
6323                     found_key.type == BTRFS_EXTENT_REF_KEY) {
6324                         if (level < ref_path->shared_level)
6325                                 ref_path->shared_level = level;
6326                         goto found;
6327                 }
6328 next:
6329                 level--;
6330                 btrfs_release_path(extent_root, path);
6331                 cond_resched();
6332         }
6333         /* reached lowest level */
6334         ret = 1;
6335         goto out;
6336 walk_up:
6337         level = ref_path->current_level;
6338         while (level < BTRFS_MAX_LEVEL - 1) {
6339                 u64 ref_objectid;
6340
6341                 if (level >= 0)
6342                         bytenr = ref_path->nodes[level];
6343                 else
6344                         bytenr = ref_path->extent_start;
6345
6346                 BUG_ON(bytenr == 0);
6347
6348                 key.objectid = bytenr;
6349                 key.offset = 0;
6350                 key.type = BTRFS_EXTENT_REF_KEY;
6351
6352                 ret = btrfs_search_slot(trans, extent_root, &key, path, 0, 0);
6353                 if (ret < 0)
6354                         goto out;
6355
6356                 leaf = path->nodes[0];
6357                 nritems = btrfs_header_nritems(leaf);
6358                 if (path->slots[0] >= nritems) {
6359                         ret = btrfs_next_leaf(extent_root, path);
6360                         if (ret < 0)
6361                                 goto out;
6362                         if (ret > 0) {
6363                                 /* the extent was freed by someone */
6364                                 if (ref_path->lowest_level == level)
6365                                         goto out;
6366                                 btrfs_release_path(extent_root, path);
6367                                 goto walk_down;
6368                         }
6369                         leaf = path->nodes[0];
6370                 }
6371
6372                 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
6373                 if (found_key.objectid != bytenr ||
6374                                 found_key.type != BTRFS_EXTENT_REF_KEY) {
6375                         /* the extent was freed by someone */
6376                         if (ref_path->lowest_level == level) {
6377                                 ret = 1;
6378                                 goto out;
6379                         }
6380                         btrfs_release_path(extent_root, path);
6381                         goto walk_down;
6382                 }
6383 found:
6384                 ref = btrfs_item_ptr(leaf, path->slots[0],
6385                                 struct btrfs_extent_ref);
6386                 ref_objectid = btrfs_ref_objectid(leaf, ref);
6387                 if (ref_objectid < BTRFS_FIRST_FREE_OBJECTID) {
6388                         if (first_time) {
6389                                 level = (int)ref_objectid;
6390                                 BUG_ON(level >= BTRFS_MAX_LEVEL);
6391                                 ref_path->lowest_level = level;
6392                                 ref_path->current_level = level;
6393                                 ref_path->nodes[level] = bytenr;
6394                         } else {
6395                                 WARN_ON(ref_objectid != level);
6396                         }
6397                 } else {
6398                         WARN_ON(level != -1);
6399                 }
6400                 first_time = 0;
6401
6402                 if (ref_path->lowest_level == level) {
6403                         ref_path->owner_objectid = ref_objectid;
6404                         ref_path->num_refs = btrfs_ref_num_refs(leaf, ref);
6405                 }
6406
6407                 /*
6408                  * the block is a tree root or the block isn't in a
6409                  * reference counted tree.
6410                  */
6411                 if (found_key.objectid == found_key.offset ||
6412                     is_cowonly_root(btrfs_ref_root(leaf, ref))) {
6413                         ref_path->root_objectid = btrfs_ref_root(leaf, ref);
6414                         ref_path->root_generation =
6415                                 btrfs_ref_generation(leaf, ref);
6416                         if (level < 0) {
6417                                 /* special reference from the tree log */
6418                                 ref_path->nodes[0] = found_key.offset;
6419                                 ref_path->current_level = 0;
6420                         }
6421                         ret = 0;
6422                         goto out;
6423                 }
6424
6425                 level++;
6426                 BUG_ON(ref_path->nodes[level] != 0);
6427                 ref_path->nodes[level] = found_key.offset;
6428                 ref_path->current_level = level;
6429
6430                 /*
6431                  * the reference was created in the running transaction, so
6432                  * there's no need to continue walking up.
6433                  */
6434                 if (btrfs_ref_generation(leaf, ref) == trans->transid) {
6435                         ref_path->root_objectid = btrfs_ref_root(leaf, ref);
6436                         ref_path->root_generation =
6437                                 btrfs_ref_generation(leaf, ref);
6438                         ret = 0;
6439                         goto out;
6440                 }
6441
6442                 btrfs_release_path(extent_root, path);
6443                 cond_resched();
6444         }
6445         /* reached max tree level, but no tree root found. */
6446         BUG();
6447 out:
6448         btrfs_free_path(path);
6449         return ret;
6450 }
6451
6452 static int btrfs_first_ref_path(struct btrfs_trans_handle *trans,
6453                                 struct btrfs_root *extent_root,
6454                                 struct btrfs_ref_path *ref_path,
6455                                 u64 extent_start)
6456 {
6457         memset(ref_path, 0, sizeof(*ref_path));
6458         ref_path->extent_start = extent_start;
6459
6460         return __next_ref_path(trans, extent_root, ref_path, 1);
6461 }
6462
6463 static int btrfs_next_ref_path(struct btrfs_trans_handle *trans,
6464                                struct btrfs_root *extent_root,
6465                                struct btrfs_ref_path *ref_path)
6466 {
6467         return __next_ref_path(trans, extent_root, ref_path, 0);
6468 }
6469
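     /*
      * collect the file extents in the relocation inode that cover the range
      * of 'extent_key', so the caller can repoint old references at the new
      * locations.
      */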
6470 static noinline int get_new_locations(struct inode *reloc_inode,
6471                                       struct btrfs_key *extent_key,
6472                                       u64 offset, int no_fragment,
6473                                       struct disk_extent **extents,
6474                                       int *nr_extents)
6475 {
6476         struct btrfs_root *root = BTRFS_I(reloc_inode)->root;
6477         struct btrfs_path *path;
6478         struct btrfs_file_extent_item *fi;
6479         struct extent_buffer *leaf;
6480         struct disk_extent *exts = *extents;
6481         struct btrfs_key found_key;
6482         u64 cur_pos;
6483         u64 last_byte;
6484         u32 nritems;
6485         int nr = 0;
6486         int max = *nr_extents;
6487         int ret;
6488
6489         WARN_ON(!no_fragment && *extents);
6490         if (!exts) {
6491                 max = 1;
6492                 exts = kmalloc(sizeof(*exts) * max, GFP_NOFS);
6493                 if (!exts)
6494                         return -ENOMEM;
6495         }
6496
6497         path = btrfs_alloc_path();
6498         BUG_ON(!path);
6499
6500         cur_pos = extent_key->objectid - offset;
6501         last_byte = extent_key->objectid + extent_key->offset;
6502         ret = btrfs_lookup_file_extent(NULL, root, path, reloc_inode->i_ino,
6503                                        cur_pos, 0);
6504         if (ret < 0)
6505                 goto out;
6506         if (ret > 0) {
6507                 ret = -ENOENT;
6508                 goto out;
6509         }
6510
6511         while (1) {
6512                 leaf = path->nodes[0];
6513                 nritems = btrfs_header_nritems(leaf);
6514                 if (path->slots[0] >= nritems) {
6515                         ret = btrfs_next_leaf(root, path);
6516                         if (ret < 0)
6517                                 goto out;
6518                         if (ret > 0)
6519                                 break;
6520                         leaf = path->nodes[0];
6521                 }
6522
6523                 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
6524                 if (found_key.offset != cur_pos ||
6525                     found_key.type != BTRFS_EXTENT_DATA_KEY ||
6526                     found_key.objectid != reloc_inode->i_ino)
6527                         break;
6528
6529                 fi = btrfs_item_ptr(leaf, path->slots[0],
6530                                     struct btrfs_file_extent_item);
6531                 if (btrfs_file_extent_type(leaf, fi) !=
6532                     BTRFS_FILE_EXTENT_REG ||
6533                     btrfs_file_extent_disk_bytenr(leaf, fi) == 0)
6534                         break;
6535
6536                 if (nr == max) {
6537                         struct disk_extent *old = exts;
6538                         max *= 2;
6539                         exts = kzalloc(sizeof(*exts) * max, GFP_NOFS);
                             if (!exts) {
                                     exts = old;
                                     ret = -ENOMEM;
                                     goto out;
                             }
6540                         memcpy(exts, old, sizeof(*exts) * nr);
6541                         if (old != *extents)
6542                                 kfree(old);
6543                 }
6544
6545                 exts[nr].disk_bytenr =
6546                         btrfs_file_extent_disk_bytenr(leaf, fi);
6547                 exts[nr].disk_num_bytes =
6548                         btrfs_file_extent_disk_num_bytes(leaf, fi);
6549                 exts[nr].offset = btrfs_file_extent_offset(leaf, fi);
6550                 exts[nr].num_bytes = btrfs_file_extent_num_bytes(leaf, fi);
6551                 exts[nr].ram_bytes = btrfs_file_extent_ram_bytes(leaf, fi);
6552                 exts[nr].compression = btrfs_file_extent_compression(leaf, fi);
6553                 exts[nr].encryption = btrfs_file_extent_encryption(leaf, fi);
6554                 exts[nr].other_encoding = btrfs_file_extent_other_encoding(leaf,
6555                                                                            fi);
6556                 BUG_ON(exts[nr].offset > 0);
6557                 BUG_ON(exts[nr].compression || exts[nr].encryption);
6558                 BUG_ON(exts[nr].num_bytes != exts[nr].disk_num_bytes);
6559
6560                 cur_pos += exts[nr].num_bytes;
6561                 nr++;
6562
6563                 if (cur_pos + offset >= last_byte)
6564                         break;
6565
6566                 if (no_fragment) {
6567                         ret = 1;
6568                         goto out;
6569                 }
6570                 path->slots[0]++;
6571         }
6572
6573         BUG_ON(cur_pos + offset > last_byte);
6574         if (cur_pos + offset < last_byte) {
6575                 ret = -ENOENT;
6576                 goto out;
6577         }
6578         ret = 0;
6579 out:
6580         btrfs_free_path(path);
6581         if (ret) {
6582                 if (exts != *extents)
6583                         kfree(exts);
6584         } else {
6585                 *extents = exts;
6586                 *nr_extents = nr;
6587         }
6588         return ret;
6589 }
6590
6591 static noinline int replace_one_extent(struct btrfs_trans_handle *trans,
6592                                         struct btrfs_root *root,
6593                                         struct btrfs_path *path,
6594                                         struct btrfs_key *extent_key,
6595                                         struct btrfs_key *leaf_key,
6596                                         struct btrfs_ref_path *ref_path,
6597                                         struct disk_extent *new_extents,
6598                                         int nr_extents)
6599 {
6600         struct extent_buffer *leaf;
6601         struct btrfs_file_extent_item *fi;
6602         struct inode *inode = NULL;
6603         struct btrfs_key key;
6604         u64 lock_start = 0;
6605         u64 lock_end = 0;
6606         u64 num_bytes;
6607         u64 ext_offset;
6608         u64 search_end = (u64)-1;
6609         u32 nritems;
6610         int nr_scanned = 0;
6611         int extent_locked = 0;
6612         int extent_type;
6613         int ret;
6614
6615         memcpy(&key, leaf_key, sizeof(key));
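             /*
              * when the ref path names a single inode, start the search at
              * that inode's first file extent item.
              */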
6616         if (ref_path->owner_objectid != BTRFS_MULTIPLE_OBJECTIDS) {
6617                 if (key.objectid < ref_path->owner_objectid ||
6618                     (key.objectid == ref_path->owner_objectid &&
6619                      key.type < BTRFS_EXTENT_DATA_KEY)) {
6620                         key.objectid = ref_path->owner_objectid;
6621                         key.type = BTRFS_EXTENT_DATA_KEY;
6622                         key.offset = 0;
6623                 }
6624         }
6625
6626         while (1) {
6627                 ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
6628                 if (ret < 0)
6629                         goto out;
6630
6631                 leaf = path->nodes[0];
6632                 nritems = btrfs_header_nritems(leaf);
6633 next:
6634                 if (extent_locked && ret > 0) {
6635                         /*
6636                          * the file extent item was modified by someone
6637                          * before the extent got locked.
6638                          */
6639                         unlock_extent(&BTRFS_I(inode)->io_tree, lock_start,
6640                                       lock_end, GFP_NOFS);
6641                         extent_locked = 0;
6642                 }
6643
6644                 if (path->slots[0] >= nritems) {
6645                         if (++nr_scanned > 2)
6646                                 break;
6647
6648                         BUG_ON(extent_locked);
6649                         ret = btrfs_next_leaf(root, path);
6650                         if (ret < 0)
6651                                 goto out;
6652                         if (ret > 0)
6653                                 break;
6654                         leaf = path->nodes[0];
6655                         nritems = btrfs_header_nritems(leaf);
6656                 }
6657
6658                 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
6659
6660                 if (ref_path->owner_objectid != BTRFS_MULTIPLE_OBJECTIDS) {
6661                         if ((key.objectid > ref_path->owner_objectid) ||
6662                             (key.objectid == ref_path->owner_objectid &&
6663                              key.type > BTRFS_EXTENT_DATA_KEY) ||
6664                             key.offset >= search_end)
6665                                 break;
6666                 }
6667
6668                 if (inode && key.objectid != inode->i_ino) {
6669                         BUG_ON(extent_locked);
6670                         btrfs_release_path(root, path);
6671                         mutex_unlock(&inode->i_mutex);
6672                         iput(inode);
6673                         inode = NULL;
6674                         continue;
6675                 }
6676
6677                 if (key.type != BTRFS_EXTENT_DATA_KEY) {
6678                         path->slots[0]++;
6679                         ret = 1;
6680                         goto next;
6681                 }
6682                 fi = btrfs_item_ptr(leaf, path->slots[0],
6683                                     struct btrfs_file_extent_item);
6684                 extent_type = btrfs_file_extent_type(leaf, fi);
6685                 if ((extent_type != BTRFS_FILE_EXTENT_REG &&
6686                      extent_type != BTRFS_FILE_EXTENT_PREALLOC) ||
6687                     (btrfs_file_extent_disk_bytenr(leaf, fi) !=
6688                      extent_key->objectid)) {
6689                         path->slots[0]++;
6690                         ret = 1;
6691                         goto next;
6692                 }
6693
6694                 num_bytes = btrfs_file_extent_num_bytes(leaf, fi);
6695                 ext_offset = btrfs_file_extent_offset(leaf, fi);
6696
6697                 if (search_end == (u64)-1) {
6698                         search_end = key.offset - ext_offset +
6699                                 btrfs_file_extent_ram_bytes(leaf, fi);
6700                 }
6701
6702                 if (!extent_locked) {
6703                         lock_start = key.offset;
6704                         lock_end = lock_start + num_bytes - 1;
6705                 } else {
6706                         if (lock_start > key.offset ||
6707                             lock_end + 1 < key.offset + num_bytes) {
6708                                 unlock_extent(&BTRFS_I(inode)->io_tree,
6709                                               lock_start, lock_end, GFP_NOFS);
6710                                 extent_locked = 0;
6711                         }
6712                 }
6713
6714                 if (!inode) {
6715                         btrfs_release_path(root, path);
6716
6717                         inode = btrfs_iget_locked(root->fs_info->sb,
6718                                                   key.objectid, root);
6719                         if (inode->i_state & I_NEW) {
6720                                 BTRFS_I(inode)->root = root;
6721                                 BTRFS_I(inode)->location.objectid =
6722                                         key.objectid;
6723                                 BTRFS_I(inode)->location.type =
6724                                         BTRFS_INODE_ITEM_KEY;
6725                                 BTRFS_I(inode)->location.offset = 0;
6726                                 btrfs_read_locked_inode(inode);
6727                                 unlock_new_inode(inode);
6728                         }
6729                         /*
6730                          * some code calls btrfs_commit_transaction while
6731                          * holding the i_mutex, so we can't use mutex_lock
6732                          * here.
6733                          */
6734                         if (is_bad_inode(inode) ||
6735                             !mutex_trylock(&inode->i_mutex)) {
6736                                 iput(inode);
6737                                 inode = NULL;
6738                                 key.offset = (u64)-1;
6739                                 goto skip;
6740                         }
6741                 }
6742
6743                 if (!extent_locked) {
6744                         struct btrfs_ordered_extent *ordered;
6745
6746                         btrfs_release_path(root, path);
6747
6748                         lock_extent(&BTRFS_I(inode)->io_tree, lock_start,
6749                                     lock_end, GFP_NOFS);
6750                         ordered = btrfs_lookup_first_ordered_extent(inode,
6751                                                                     lock_end);
6752                         if (ordered &&
6753                             ordered->file_offset <= lock_end &&
6754                             ordered->file_offset + ordered->len > lock_start) {
6755                                 unlock_extent(&BTRFS_I(inode)->io_tree,
6756                                               lock_start, lock_end, GFP_NOFS);
6757                                 btrfs_start_ordered_extent(inode, ordered, 1);
6758                                 btrfs_put_ordered_extent(ordered);
6759                                 key.offset += num_bytes;
6760                                 goto skip;
6761                         }
6762                         if (ordered)
6763                                 btrfs_put_ordered_extent(ordered);
6764
6765                         extent_locked = 1;
6766                         continue;
6767                 }
6768
6769                 if (nr_extents == 1) {
6770                         /* update extent pointer in place */
6771                         btrfs_set_file_extent_disk_bytenr(leaf, fi,
6772                                                 new_extents[0].disk_bytenr);
6773                         btrfs_set_file_extent_disk_num_bytes(leaf, fi,
6774                                                 new_extents[0].disk_num_bytes);
6775                         btrfs_mark_buffer_dirty(leaf);
6776
6777                         btrfs_drop_extent_cache(inode, key.offset,
6778                                                 key.offset + num_bytes - 1, 0);
6779
6780                         ret = btrfs_inc_extent_ref(trans, root,
6781                                                 new_extents[0].disk_bytenr,
6782                                                 new_extents[0].disk_num_bytes,
6783                                                 leaf->start,
6784                                                 root->root_key.objectid,
6785                                                 trans->transid,
6786                                                 key.objectid);
6787                         BUG_ON(ret);
6788
6789                         ret = btrfs_free_extent(trans, root,
6790                                                 extent_key->objectid,
6791                                                 extent_key->offset,
6792                                                 leaf->start,
6793                                                 btrfs_header_owner(leaf),
6794                                                 btrfs_header_generation(leaf),
6795                                                 key.objectid, 0);
6796                         BUG_ON(ret);
6797
6798                         btrfs_release_path(root, path);
6799                         key.offset += num_bytes;
6800                 } else {
6801                         BUG_ON(1);
6802 #if 0
6803                         u64 alloc_hint;
6804                         u64 extent_len;
6805                         int i;
6806                         /*
6807                          * drop old extent pointer at first, then insert the
6808                          * new pointers one by one
6809                          */
6810                         btrfs_release_path(root, path);
6811                         ret = btrfs_drop_extents(trans, root, inode, key.offset,
6812                                                  key.offset + num_bytes,
6813                                                  key.offset, &alloc_hint);
6814                         BUG_ON(ret);
6815
6816                         for (i = 0; i < nr_extents; i++) {
6817                                 if (ext_offset >= new_extents[i].num_bytes) {
6818                                         ext_offset -= new_extents[i].num_bytes;
6819                                         continue;
6820                                 }
6821                                 extent_len = min(new_extents[i].num_bytes -
6822                                                  ext_offset, num_bytes);
6823
6824                                 ret = btrfs_insert_empty_item(trans, root,
6825                                                               path, &key,
6826                                                               sizeof(*fi));
6827                                 BUG_ON(ret);
6828
6829                                 leaf = path->nodes[0];
6830                                 fi = btrfs_item_ptr(leaf, path->slots[0],
6831                                                 struct btrfs_file_extent_item);
6832                                 btrfs_set_file_extent_generation(leaf, fi,
6833                                                         trans->transid);
6834                                 btrfs_set_file_extent_type(leaf, fi,
6835                                                         BTRFS_FILE_EXTENT_REG);
6836                                 btrfs_set_file_extent_disk_bytenr(leaf, fi,
6837                                                 new_extents[i].disk_bytenr);
6838                                 btrfs_set_file_extent_disk_num_bytes(leaf, fi,
6839                                                 new_extents[i].disk_num_bytes);
6840                                 btrfs_set_file_extent_ram_bytes(leaf, fi,
6841                                                 new_extents[i].ram_bytes);
6842
6843                                 btrfs_set_file_extent_compression(leaf, fi,
6844                                                 new_extents[i].compression);
6845                                 btrfs_set_file_extent_encryption(leaf, fi,
6846                                                 new_extents[i].encryption);
6847                                 btrfs_set_file_extent_other_encoding(leaf, fi,
6848                                                 new_extents[i].other_encoding);
6849
6850                                 btrfs_set_file_extent_num_bytes(leaf, fi,
6851                                                         extent_len);
6852                                 ext_offset += new_extents[i].offset;
6853                                 btrfs_set_file_extent_offset(leaf, fi,
6854                                                         ext_offset);
6855                                 btrfs_mark_buffer_dirty(leaf);
6856
6857                                 btrfs_drop_extent_cache(inode, key.offset,
6858                                                 key.offset + extent_len - 1, 0);
6859
6860                                 ret = btrfs_inc_extent_ref(trans, root,
6861                                                 new_extents[i].disk_bytenr,
6862                                                 new_extents[i].disk_num_bytes,
6863                                                 leaf->start,
6864                                                 root->root_key.objectid,
6865                                                 trans->transid, key.objectid);
6866                                 BUG_ON(ret);
6867                                 btrfs_release_path(root, path);
6868
6869                                 inode_add_bytes(inode, extent_len);
6870
6871                                 ext_offset = 0;
6872                                 num_bytes -= extent_len;
6873                                 key.offset += extent_len;
6874
6875                                 if (num_bytes == 0)
6876                                         break;
6877                         }
6878                         BUG_ON(i >= nr_extents);
6879 #endif
6880                 }
6881
6882                 if (extent_locked) {
6883                         unlock_extent(&BTRFS_I(inode)->io_tree, lock_start,
6884                                       lock_end, GFP_NOFS);
6885                         extent_locked = 0;
6886                 }
6887 skip:
6888                 if (ref_path->owner_objectid != BTRFS_MULTIPLE_OBJECTIDS &&
6889                     key.offset >= search_end)
6890                         break;
6891
6892                 cond_resched();
6893         }
6894         ret = 0;
6895 out:
6896         btrfs_release_path(root, path);
6897         if (inode) {
6898                 mutex_unlock(&inode->i_mutex);
6899                 if (extent_locked) {
6900                         unlock_extent(&BTRFS_I(inode)->io_tree, lock_start,
6901                                       lock_end, GFP_NOFS);
6902                 }
6903                 iput(inode);
6904         }
6905         return ret;
6906 }
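
Stripped of error handling, the nr_extents == 1 branch above follows a
reference-safe ordering: repoint the file extent item, invalidate the cached
mapping, then reference the new extent before dropping the old one. A
condensed sketch (here new stands for new_extents[0]):

        btrfs_set_file_extent_disk_bytenr(leaf, fi, new->disk_bytenr);
        btrfs_set_file_extent_disk_num_bytes(leaf, fi, new->disk_num_bytes);
        btrfs_mark_buffer_dirty(leaf);
        btrfs_drop_extent_cache(inode, key.offset,
                                key.offset + num_bytes - 1, 0);
        btrfs_inc_extent_ref(trans, root, new->disk_bytenr,
                             new->disk_num_bytes, leaf->start,
                             root->root_key.objectid, trans->transid,
                             key.objectid);        /* ref new location first */
        btrfs_free_extent(trans, root, extent_key->objectid,
                          extent_key->offset, leaf->start,
                          btrfs_header_owner(leaf),
                          btrfs_header_generation(leaf), key.objectid, 0);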
6907
6908 int btrfs_reloc_tree_cache_ref(struct btrfs_trans_handle *trans,
6909                                struct btrfs_root *root,
6910                                struct extent_buffer *buf, u64 orig_start)
6911 {
6912         int level;
6913         int ret;
6914
6915         BUG_ON(btrfs_header_generation(buf) != trans->transid);
6916         BUG_ON(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID);
6917
6918         level = btrfs_header_level(buf);
6919         if (level == 0) {
6920                 struct btrfs_leaf_ref *ref;
6921                 struct btrfs_leaf_ref *orig_ref;
6922
6923                 orig_ref = btrfs_lookup_leaf_ref(root, orig_start);
6924                 if (!orig_ref)
6925                         return -ENOENT;
6926
6927                 ref = btrfs_alloc_leaf_ref(root, orig_ref->nritems);
6928                 if (!ref) {
6929                         btrfs_free_leaf_ref(root, orig_ref);
6930                         return -ENOMEM;
6931                 }
6932
6933                 ref->nritems = orig_ref->nritems;
6934                 memcpy(ref->extents, orig_ref->extents,
6935                         sizeof(ref->extents[0]) * ref->nritems);
6936
6937                 btrfs_free_leaf_ref(root, orig_ref);
6938
6939                 ref->root_gen = trans->transid;
6940                 ref->bytenr = buf->start;
6941                 ref->owner = btrfs_header_owner(buf);
6942                 ref->generation = btrfs_header_generation(buf);
6943
6944                 ret = btrfs_add_leaf_ref(root, ref, 0);
6945                 WARN_ON(ret);
6946                 btrfs_free_leaf_ref(root, ref);
6947         }
6948         return 0;
6949 }
6950
6951 static noinline int invalidate_extent_cache(struct btrfs_root *root,
6952                                         struct extent_buffer *leaf,
6953                                         struct btrfs_block_group_cache *group,
6954                                         struct btrfs_root *target_root)
6955 {
6956         struct btrfs_key key;
6957         struct inode *inode = NULL;
6958         struct btrfs_file_extent_item *fi;
6959         struct extent_state *cached_state = NULL;
6960         u64 num_bytes;
6961         u64 skip_objectid = 0;
6962         u32 nritems;
6963         u32 i;
6964
6965         nritems = btrfs_header_nritems(leaf);
6966         for (i = 0; i < nritems; i++) {
6967                 btrfs_item_key_to_cpu(leaf, &key, i);
6968                 if (key.objectid == skip_objectid ||
6969                     key.type != BTRFS_EXTENT_DATA_KEY)
6970                         continue;
6971                 fi = btrfs_item_ptr(leaf, i, struct btrfs_file_extent_item);
6972                 if (btrfs_file_extent_type(leaf, fi) ==
6973                     BTRFS_FILE_EXTENT_INLINE)
6974                         continue;
6975                 if (btrfs_file_extent_disk_bytenr(leaf, fi) == 0)
6976                         continue;
6977                 if (!inode || inode->i_ino != key.objectid) {
6978                         iput(inode);
6979                         inode = btrfs_ilookup(target_root->fs_info->sb,
6980                                               key.objectid, target_root, 1);
6981                 }
6982                 if (!inode) {
6983                         skip_objectid = key.objectid;
6984                         continue;
6985                 }
6986                 num_bytes = btrfs_file_extent_num_bytes(leaf, fi);
6987
6988                 lock_extent_bits(&BTRFS_I(inode)->io_tree, key.offset,
6989                                  key.offset + num_bytes - 1, 0, &cached_state,
6990                                  GFP_NOFS);
6991                 btrfs_drop_extent_cache(inode, key.offset,
6992                                         key.offset + num_bytes - 1, 1);
6993                 unlock_extent_cached(&BTRFS_I(inode)->io_tree, key.offset,
6994                                      key.offset + num_bytes - 1, &cached_state,
6995                                      GFP_NOFS);
6996                 cond_resched();
6997         }
6998         iput(inode);
6999         return 0;
7000 }
7001
7002 static noinline int replace_extents_in_leaf(struct btrfs_trans_handle *trans,
7003                                         struct btrfs_root *root,
7004                                         struct extent_buffer *leaf,
7005                                         struct btrfs_block_group_cache *group,
7006                                         struct inode *reloc_inode)
7007 {
7008         struct btrfs_key key;
7009         struct btrfs_key extent_key;
7010         struct btrfs_file_extent_item *fi;
7011         struct btrfs_leaf_ref *ref;
7012         struct disk_extent *new_extent;
7013         u64 bytenr;
7014         u64 num_bytes;
7015         u32 nritems;
7016         u32 i;
7017         int ext_index;
7018         int nr_extent;
7019         int ret;
7020
7021         new_extent = kmalloc(sizeof(*new_extent), GFP_NOFS);
7022         BUG_ON(!new_extent);
7023
7024         ref = btrfs_lookup_leaf_ref(root, leaf->start);
7025         BUG_ON(!ref);
7026
7027         ext_index = -1;
7028         nritems = btrfs_header_nritems(leaf);
7029         for (i = 0; i < nritems; i++) {
7030                 btrfs_item_key_to_cpu(leaf, &key, i);
7031                 if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY)
7032                         continue;
7033                 fi = btrfs_item_ptr(leaf, i, struct btrfs_file_extent_item);
7034                 if (btrfs_file_extent_type(leaf, fi) ==
7035                     BTRFS_FILE_EXTENT_INLINE)
7036                         continue;
7037                 bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
7038                 num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi);
7039                 if (bytenr == 0)
7040                         continue;
7041
7042                 ext_index++;
7043                 if (bytenr >= group->key.objectid + group->key.offset ||
7044                     bytenr + num_bytes <= group->key.objectid)
7045                         continue;
7046
7047                 extent_key.objectid = bytenr;
7048                 extent_key.offset = num_bytes;
7049                 extent_key.type = BTRFS_EXTENT_ITEM_KEY;
7050                 nr_extent = 1;
7051                 ret = get_new_locations(reloc_inode, &extent_key,
7052                                         group->key.objectid, 1,
7053                                         &new_extent, &nr_extent);
7054                 if (ret > 0)
7055                         continue;
7056                 BUG_ON(ret < 0);
7057
7058                 BUG_ON(ref->extents[ext_index].bytenr != bytenr);
7059                 BUG_ON(ref->extents[ext_index].num_bytes != num_bytes);
7060                 ref->extents[ext_index].bytenr = new_extent->disk_bytenr;
7061                 ref->extents[ext_index].num_bytes = new_extent->disk_num_bytes;
7062
7063                 btrfs_set_file_extent_disk_bytenr(leaf, fi,
7064                                                 new_extent->disk_bytenr);
7065                 btrfs_set_file_extent_disk_num_bytes(leaf, fi,
7066                                                 new_extent->disk_num_bytes);
7067                 btrfs_mark_buffer_dirty(leaf);
7068
7069                 ret = btrfs_inc_extent_ref(trans, root,
7070                                         new_extent->disk_bytenr,
7071                                         new_extent->disk_num_bytes,
7072                                         leaf->start,
7073                                         root->root_key.objectid,
7074                                         trans->transid, key.objectid);
7075                 BUG_ON(ret);
7076
7077                 ret = btrfs_free_extent(trans, root,
7078                                         bytenr, num_bytes, leaf->start,
7079                                         btrfs_header_owner(leaf),
7080                                         btrfs_header_generation(leaf),
7081                                         key.objectid, 0);
7082                 BUG_ON(ret);
7083                 cond_resched();
7084         }
7085         kfree(new_extent);
7086         BUG_ON(ext_index + 1 != ref->nritems);
7087         btrfs_free_leaf_ref(root, ref);
7088         return 0;
7089 }
7090
7091 int btrfs_free_reloc_root(struct btrfs_trans_handle *trans,
7092                           struct btrfs_root *root)
7093 {
7094         struct btrfs_root *reloc_root;
7095         int ret;
7096
7097         if (root->reloc_root) {
7098                 reloc_root = root->reloc_root;
7099                 root->reloc_root = NULL;
7100                 list_add(&reloc_root->dead_list,
7101                          &root->fs_info->dead_reloc_roots);
7102
7103                 btrfs_set_root_bytenr(&reloc_root->root_item,
7104                                       reloc_root->node->start);
7105                 btrfs_set_root_level(&reloc_root->root_item,
7106                                      btrfs_header_level(reloc_root->node));
7107                 memset(&reloc_root->root_item.drop_progress, 0,
7108                         sizeof(struct btrfs_disk_key));
7109                 reloc_root->root_item.drop_level = 0;
7110
7111                 ret = btrfs_update_root(trans, root->fs_info->tree_root,
7112                                         &reloc_root->root_key,
7113                                         &reloc_root->root_item);
7114                 BUG_ON(ret);
7115         }
7116         return 0;
7117 }
7118
7119 int btrfs_drop_dead_reloc_roots(struct btrfs_root *root)
7120 {
7121         struct btrfs_trans_handle *trans;
7122         struct btrfs_root *reloc_root;
7123         struct btrfs_root *prev_root = NULL;
7124         struct list_head dead_roots;
7125         int ret;
7126         unsigned long nr;
7127
7128         INIT_LIST_HEAD(&dead_roots);
7129         list_splice_init(&root->fs_info->dead_reloc_roots, &dead_roots);
7130
7131         while (!list_empty(&dead_roots)) {
7132                 reloc_root = list_entry(dead_roots.prev,
7133                                         struct btrfs_root, dead_list);
7134                 list_del_init(&reloc_root->dead_list);
7135
7136                 BUG_ON(reloc_root->commit_root != NULL);
7137                 while (1) {
7138                         trans = btrfs_join_transaction(root, 1);
7139                         BUG_ON(!trans);
7140
7141                         mutex_lock(&root->fs_info->drop_mutex);
7142                         ret = btrfs_drop_snapshot(trans, reloc_root);
7143                         if (ret != -EAGAIN)
7144                                 break;
7145                         mutex_unlock(&root->fs_info->drop_mutex);
7146
7147                         nr = trans->blocks_used;
7148                         ret = btrfs_end_transaction(trans, root);
7149                         BUG_ON(ret);
7150                         btrfs_btree_balance_dirty(root, nr);
7151                 }
7152
7153                 free_extent_buffer(reloc_root->node);
7154
7155                 ret = btrfs_del_root(trans, root->fs_info->tree_root,
7156                                      &reloc_root->root_key);
7157                 BUG_ON(ret);
7158                 mutex_unlock(&root->fs_info->drop_mutex);
7159
7160                 nr = trans->blocks_used;
7161                 ret = btrfs_end_transaction(trans, root);
7162                 BUG_ON(ret);
7163                 btrfs_btree_balance_dirty(root, nr);
7164
7165                 kfree(prev_root);
7166                 prev_root = reloc_root;
7167         }
7168         if (prev_root) {
7169                 btrfs_remove_leaf_refs(prev_root, (u64)-1, 0);
7170                 kfree(prev_root);
7171         }
7172         return 0;
7173 }
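
The loop above is an instance of a recurring btrfs pattern: tear down a
potentially huge tree in transaction-sized chunks, committing progress and
throttling dirty metadata whenever the worker reports -EAGAIN. A minimal
sketch of the pattern, using a hypothetical chunked worker drop_some():

        while (1) {
                trans = btrfs_join_transaction(root, 1);
                ret = drop_some(trans, victim);     /* hypothetical worker */
                if (ret != -EAGAIN)
                        break;                      /* finished, or hard error */
                nr = trans->blocks_used;
                btrfs_end_transaction(trans, root); /* commit partial progress */
                btrfs_btree_balance_dirty(root, nr); /* throttle dirty metadata */
        }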
7174
7175 int btrfs_add_dead_reloc_root(struct btrfs_root *root)
7176 {
7177         list_add(&root->dead_list, &root->fs_info->dead_reloc_roots);
7178         return 0;
7179 }
7180
7181 int btrfs_cleanup_reloc_trees(struct btrfs_root *root)
7182 {
7183         struct btrfs_root *reloc_root;
7184         struct btrfs_trans_handle *trans;
7185         struct btrfs_key location;
7186         int found;
7187         int ret;
7188
7189         mutex_lock(&root->fs_info->tree_reloc_mutex);
7190         ret = btrfs_find_dead_roots(root, BTRFS_TREE_RELOC_OBJECTID, NULL);
7191         BUG_ON(ret);
7192         found = !list_empty(&root->fs_info->dead_reloc_roots);
7193         mutex_unlock(&root->fs_info->tree_reloc_mutex);
7194
7195         if (found) {
7196                 trans = btrfs_start_transaction(root, 1);
7197                 BUG_ON(!trans);
7198                 ret = btrfs_commit_transaction(trans, root);
7199                 BUG_ON(ret);
7200         }
7201
7202         location.objectid = BTRFS_DATA_RELOC_TREE_OBJECTID;
7203         location.offset = (u64)-1;
7204         location.type = BTRFS_ROOT_ITEM_KEY;
7205
7206         reloc_root = btrfs_read_fs_root_no_name(root->fs_info, &location);
7207         BUG_ON(!reloc_root);
7208         btrfs_orphan_cleanup(reloc_root);
7209         return 0;
7210 }
7211
7212 static noinline int init_reloc_tree(struct btrfs_trans_handle *trans,
7213                                     struct btrfs_root *root)
7214 {
7215         struct btrfs_root *reloc_root;
7216         struct extent_buffer *eb;
7217         struct btrfs_root_item *root_item;
7218         struct btrfs_key root_key;
7219         int ret;
7220
7221         BUG_ON(!root->ref_cows);
7222         if (root->reloc_root)
7223                 return 0;
7224
7225         root_item = kmalloc(sizeof(*root_item), GFP_NOFS);
7226         BUG_ON(!root_item);
7227
7228         ret = btrfs_copy_root(trans, root, root->commit_root,
7229                               &eb, BTRFS_TREE_RELOC_OBJECTID);
7230         BUG_ON(ret);
7231
7232         root_key.objectid = BTRFS_TREE_RELOC_OBJECTID;
7233         root_key.offset = root->root_key.objectid;
7234         root_key.type = BTRFS_ROOT_ITEM_KEY;
7235
7236         memcpy(root_item, &root->root_item, sizeof(*root_item));
7237         btrfs_set_root_refs(root_item, 0);
7238         btrfs_set_root_bytenr(root_item, eb->start);
7239         btrfs_set_root_level(root_item, btrfs_header_level(eb));
7240         btrfs_set_root_generation(root_item, trans->transid);
7241
7242         btrfs_tree_unlock(eb);
7243         free_extent_buffer(eb);
7244
7245         ret = btrfs_insert_root(trans, root->fs_info->tree_root,
7246                                 &root_key, root_item);
7247         BUG_ON(ret);
7248         kfree(root_item);
7249
7250         reloc_root = btrfs_read_fs_root_no_radix(root->fs_info->tree_root,
7251                                                  &root_key);
7252         BUG_ON(!reloc_root);
7253         reloc_root->last_trans = trans->transid;
7254         reloc_root->commit_root = NULL;
7255         reloc_root->ref_tree = &root->fs_info->reloc_ref_tree;
7256
7257         root->reloc_root = reloc_root;
7258         return 0;
7259 }
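
Every reloc root created above shares the objectid BTRFS_TREE_RELOC_OBJECTID
and records the subvolume it shadows in the key offset. For example, with an
illustrative subvol objectid of 257:

        struct btrfs_key root_key = {
                .objectid = BTRFS_TREE_RELOC_OBJECTID,
                .type     = BTRFS_ROOT_ITEM_KEY,
                .offset   = 257,    /* objectid of the shadowed subvol (example) */
        };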
7260
7261 /*
7262  * Core function of space balance.
7263  *
7264  * The idea is to use reloc trees to relocate tree blocks in reference
7265  * counted roots. There is one reloc tree for each subvol, and all
7266  * reloc trees share the same root key objectid. Reloc trees are snapshots
7267  * of the latest committed roots of subvols (root->commit_root).
7268  *
7269  * To relocate a tree block referenced by a subvol, there are two steps:
7270  * COW the block through the subvol's reloc tree, then update the block
7271  * pointer in the subvol to point to the new block. Since all reloc trees
7272  * share the same root key objectid, special handling for tree blocks owned
7273  * by them is easy. Once a tree block has been COWed in one reloc tree,
7274  * we can use the resulting new block directly when the same block needs
7275  * to be COWed again through another reloc tree. This way, relocated
7276  * tree blocks are shared between reloc trees, and therefore also shared
7277  * between subvols.
7278  */
7279 static noinline int relocate_one_path(struct btrfs_trans_handle *trans,
7280                                       struct btrfs_root *root,
7281                                       struct btrfs_path *path,
7282                                       struct btrfs_key *first_key,
7283                                       struct btrfs_ref_path *ref_path,
7284                                       struct btrfs_block_group_cache *group,
7285                                       struct inode *reloc_inode)
7286 {
7287         struct btrfs_root *reloc_root;
7288         struct extent_buffer *eb = NULL;
7289         struct btrfs_key *keys;
7290         u64 *nodes;
7291         int level;
7292         int shared_level;
7293         int lowest_level = 0;
7294         int ret;
7295
7296         if (ref_path->owner_objectid < BTRFS_FIRST_FREE_OBJECTID)
7297                 lowest_level = ref_path->owner_objectid;
7298
7299         if (!root->ref_cows) {
7300                 path->lowest_level = lowest_level;
7301                 ret = btrfs_search_slot(trans, root, first_key, path, 0, 1);
7302                 BUG_ON(ret < 0);
7303                 path->lowest_level = 0;
7304                 btrfs_release_path(root, path);
7305                 return 0;
7306         }
7307
7308         mutex_lock(&root->fs_info->tree_reloc_mutex);
7309         ret = init_reloc_tree(trans, root);
7310         BUG_ON(ret);
7311         reloc_root = root->reloc_root;
7312
7313         shared_level = ref_path->shared_level;
7314         ref_path->shared_level = BTRFS_MAX_LEVEL - 1;
7315
7316         keys = ref_path->node_keys;
7317         nodes = ref_path->new_nodes;
7318         memset(&keys[shared_level + 1], 0,
7319                sizeof(*keys) * (BTRFS_MAX_LEVEL - shared_level - 1));
7320         memset(&nodes[shared_level + 1], 0,
7321                sizeof(*nodes) * (BTRFS_MAX_LEVEL - shared_level - 1));
7322
7323         if (nodes[lowest_level] == 0) {
7324                 path->lowest_level = lowest_level;
7325                 ret = btrfs_search_slot(trans, reloc_root, first_key, path,
7326                                         0, 1);
7327                 BUG_ON(ret);
7328                 for (level = lowest_level; level < BTRFS_MAX_LEVEL; level++) {
7329                         eb = path->nodes[level];
7330                         if (!eb || eb == reloc_root->node)
7331                                 break;
7332                         nodes[level] = eb->start;
7333                         if (level == 0)
7334                                 btrfs_item_key_to_cpu(eb, &keys[level], 0);
7335                         else
7336                                 btrfs_node_key_to_cpu(eb, &keys[level], 0);
7337                 }
7338                 if (nodes[0] &&
7339                     ref_path->owner_objectid >= BTRFS_FIRST_FREE_OBJECTID) {
7340                         eb = path->nodes[0];
7341                         ret = replace_extents_in_leaf(trans, reloc_root, eb,
7342                                                       group, reloc_inode);
7343                         BUG_ON(ret);
7344                 }
7345                 btrfs_release_path(reloc_root, path);
7346         } else {
7347                 ret = btrfs_merge_path(trans, reloc_root, keys, nodes,
7348                                        lowest_level);
7349                 BUG_ON(ret);
7350         }
7351
7352         /*
7353          * replace tree blocks in the fs tree with tree blocks in
7354          * the reloc tree.
7355          */
7356         ret = btrfs_merge_path(trans, root, keys, nodes, lowest_level);
7357         BUG_ON(ret < 0);
7358
7359         if (ref_path->owner_objectid >= BTRFS_FIRST_FREE_OBJECTID) {
7360                 ret = btrfs_search_slot(trans, reloc_root, first_key, path,
7361                                         0, 0);
7362                 BUG_ON(ret);
7363                 extent_buffer_get(path->nodes[0]);
7364                 eb = path->nodes[0];
7365                 btrfs_release_path(reloc_root, path);
7366                 ret = invalidate_extent_cache(reloc_root, eb, group, root);
7367                 BUG_ON(ret);
7368                 free_extent_buffer(eb);
7369         }
7370
7371         mutex_unlock(&root->fs_info->tree_reloc_mutex);
7372         path->lowest_level = 0;
7373         return 0;
7374 }
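
Condensed, relocate_one_path() implements the two steps described in the
comment above it:

        /*
         * Summary sketch of relocate_one_path():
         *   1. COW the path through the reloc tree: btrfs_search_slot() on
         *      reloc_root with path->lowest_level set, recording the new
         *      block addresses in nodes[] and their first keys in keys[].
         *   2. Splice the COWed blocks into the fs tree via
         *      btrfs_merge_path(), then drop the now-stale cached mappings
         *      with invalidate_extent_cache() for data leaves.
         */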
7375
7376 static noinline int relocate_tree_block(struct btrfs_trans_handle *trans,
7377                                         struct btrfs_root *root,
7378                                         struct btrfs_path *path,
7379                                         struct btrfs_key *first_key,
7380                                         struct btrfs_ref_path *ref_path)
7381 {
7382         int ret;
7383
7384         ret = relocate_one_path(trans, root, path, first_key,
7385                                 ref_path, NULL, NULL);
7386         BUG_ON(ret);
7387
7388         return 0;
7389 }
7390
7391 static noinline int del_extent_zero(struct btrfs_trans_handle *trans,
7392                                     struct btrfs_root *extent_root,
7393                                     struct btrfs_path *path,
7394                                     struct btrfs_key *extent_key)
7395 {
7396         int ret;
7397
7398         ret = btrfs_search_slot(trans, extent_root, extent_key, path, -1, 1);
7399         if (ret)
7400                 goto out;
7401         ret = btrfs_del_item(trans, extent_root, path);
7402 out:
7403         btrfs_release_path(extent_root, path);
7404         return ret;
7405 }
7406
7407 static noinline struct btrfs_root *read_ref_root(struct btrfs_fs_info *fs_info,
7408                                                 struct btrfs_ref_path *ref_path)
7409 {
7410         struct btrfs_key root_key;
7411
7412         root_key.objectid = ref_path->root_objectid;
7413         root_key.type = BTRFS_ROOT_ITEM_KEY;
7414         if (is_cowonly_root(ref_path->root_objectid))
7415                 root_key.offset = 0;
7416         else
7417                 root_key.offset = (u64)-1;
7418
7419         return btrfs_read_fs_root_no_name(fs_info, &root_key);
7420 }
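
The offset chosen here follows the root-item lookup convention: COW-only
roots are keyed with offset 0, while reference-counted roots are looked up
with (u64)-1 so the search finds the latest root item. Illustrative keys
(the subvol objectid is an assumed example):

        /* extent tree (COW-only): { BTRFS_EXTENT_TREE_OBJECTID,
         *                           BTRFS_ROOT_ITEM_KEY, 0 }
         * subvol 257 (ref-counted): { 257, BTRFS_ROOT_ITEM_KEY, (u64)-1 }
         */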
7421
7422 static noinline int relocate_one_extent(struct btrfs_root *extent_root,
7423                                         struct btrfs_path *path,
7424                                         struct btrfs_key *extent_key,
7425                                         struct btrfs_block_group_cache *group,
7426                                         struct inode *reloc_inode, int pass)
7427 {
7428         struct btrfs_trans_handle *trans;
7429         struct btrfs_root *found_root;
7430         struct btrfs_ref_path *ref_path = NULL;
7431         struct disk_extent *new_extents = NULL;
7432         int nr_extents = 0;
7433         int loops;
7434         int ret;
7435         int level;
7436         struct btrfs_key first_key;
7437         u64 prev_block = 0;
7438
7439
7440         trans = btrfs_start_transaction(extent_root, 1);
7441         BUG_ON(!trans);
7442
7443         if (extent_key->objectid == 0) {
7444                 ret = del_extent_zero(trans, extent_root, path, extent_key);
7445                 goto out;
7446         }
7447
7448         ref_path = kmalloc(sizeof(*ref_path), GFP_NOFS);
7449         if (!ref_path) {
7450                 ret = -ENOMEM;
7451                 goto out;
7452         }
7453
7454         for (loops = 0; ; loops++) {
7455                 if (loops == 0) {
7456                         ret = btrfs_first_ref_path(trans, extent_root, ref_path,
7457                                                    extent_key->objectid);
7458                 } else {
7459                         ret = btrfs_next_ref_path(trans, extent_root, ref_path);
7460                 }
7461                 if (ret < 0)
7462                         goto out;
7463                 if (ret > 0)
7464                         break;
7465
7466                 if (ref_path->root_objectid == BTRFS_TREE_LOG_OBJECTID ||
7467                     ref_path->root_objectid == BTRFS_TREE_RELOC_OBJECTID)
7468                         continue;
7469
7470                 found_root = read_ref_root(extent_root->fs_info, ref_path);
7471                 BUG_ON(!found_root);
7472                 /*
7473                  * for reference counted trees, only process reference paths
7474                  * rooted at the latest committed root.
7475                  */
7476                 if (found_root->ref_cows &&
7477                     ref_path->root_generation != found_root->root_key.offset)
7478                         continue;
7479
7480                 if (ref_path->owner_objectid >= BTRFS_FIRST_FREE_OBJECTID) {
7481                         if (pass == 0) {
7482                                 /*
7483                                  * copy data extents to new locations
7484                                  */
7485                                 u64 group_start = group->key.objectid;
7486                                 ret = relocate_data_extent(reloc_inode,
7487                                                            extent_key,
7488                                                            group_start);
7489                                 if (ret < 0)
7490                                         goto out;
7491                                 break;
7492                         }
7493                         level = 0;
7494                 } else {
7495                         level = ref_path->owner_objectid;
7496                 }
7497
7498                 if (prev_block != ref_path->nodes[level]) {
7499                         struct extent_buffer *eb;
7500                         u64 block_start = ref_path->nodes[level];
7501                         u64 block_size = btrfs_level_size(found_root, level);
7502
7503                         eb = read_tree_block(found_root, block_start,
7504                                              block_size, 0);
7505                         btrfs_tree_lock(eb);
7506                         BUG_ON(level != btrfs_header_level(eb));
7507
7508                         if (level == 0)
7509                                 btrfs_item_key_to_cpu(eb, &first_key, 0);
7510                         else
7511                                 btrfs_node_key_to_cpu(eb, &first_key, 0);
7512
7513                         btrfs_tree_unlock(eb);
7514                         free_extent_buffer(eb);
7515                         prev_block = block_start;
7516                 }
7517
7518                 mutex_lock(&extent_root->fs_info->trans_mutex);
7519                 btrfs_record_root_in_trans(found_root);
7520                 mutex_unlock(&extent_root->fs_info->trans_mutex);
7521                 if (ref_path->owner_objectid >= BTRFS_FIRST_FREE_OBJECTID) {
7522                         /*
7523                          * try to update data extent references while
7524                          * keeping metadata shared between snapshots.
7525                          */
7526                         if (pass == 1) {
7527                                 ret = relocate_one_path(trans, found_root,
7528                                                 path, &first_key, ref_path,
7529                                                 group, reloc_inode);
7530                                 if (ret < 0)
7531                                         goto out;
7532                                 continue;
7533                         }
7534                         /*
7535                          * use fallback method to process the remaining
7536                          * references.
7537                          */
7538                         if (!new_extents) {
7539                                 u64 group_start = group->key.objectid;
7540                                 new_extents = kmalloc(sizeof(*new_extents),
7541                                                       GFP_NOFS);
7542                                 nr_extents = 1;
7543                                 ret = get_new_locations(reloc_inode,
7544                                                         extent_key,
7545                                                         group_start, 1,
7546                                                         &new_extents,
7547                                                         &nr_extents);
7548                                 if (ret)
7549                                         goto out;
7550                         }
7551                         ret = replace_one_extent(trans, found_root,
7552                                                 path, extent_key,
7553                                                 &first_key, ref_path,
7554                                                 new_extents, nr_extents);
7555                 } else {
7556                         ret = relocate_tree_block(trans, found_root, path,
7557                                                   &first_key, ref_path);
7558                 }
7559                 if (ret < 0)
7560                         goto out;
7561         }
7562         ret = 0;
7563 out:
7564         btrfs_end_transaction(trans, extent_root);
7565         kfree(new_extents);
7566         kfree(ref_path);
7567         return ret;
7568 }
7569 #endif
7570
7571 static u64 update_block_group_flags(struct btrfs_root *root, u64 flags)
7572 {
7573         u64 num_devices;
7574         u64 stripped = BTRFS_BLOCK_GROUP_RAID0 |
7575                 BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10;
7576
7577         num_devices = root->fs_info->fs_devices->rw_devices;
7578         if (num_devices == 1) {
7579                 stripped |= BTRFS_BLOCK_GROUP_DUP;
7580                 stripped = flags & ~stripped;
7581
7582                 /* turn raid0 into single device chunks */
7583                 if (flags & BTRFS_BLOCK_GROUP_RAID0)
7584                         return stripped;
7585
7586                 /* turn mirroring into duplication */
7587                 if (flags & (BTRFS_BLOCK_GROUP_RAID1 |
7588                              BTRFS_BLOCK_GROUP_RAID10))
7589                         return stripped | BTRFS_BLOCK_GROUP_DUP;
7590                 return flags;
7591         } else {
7592                 /* they already had raid on here, just return */
7593                 if (flags & stripped)
7594                         return flags;
7595
7596                 stripped |= BTRFS_BLOCK_GROUP_DUP;
7597                 stripped = flags & ~stripped;
7598
7599                 /* switch duplicated blocks with raid1 */
7600                 if (flags & BTRFS_BLOCK_GROUP_DUP)
7601                         return stripped | BTRFS_BLOCK_GROUP_RAID1;
7602
7603                 /* turn single device chunks into raid0 */
7604                 return stripped | BTRFS_BLOCK_GROUP_RAID0;
7605         }
7606         return flags;
7607 }
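
A quick illustration of these conversions (hypothetical calls; only the
profile bits are shown):

        /* With fs_devices->rw_devices == 1:
         *   update_block_group_flags(root, BTRFS_BLOCK_GROUP_RAID1) == BTRFS_BLOCK_GROUP_DUP
         *   update_block_group_flags(root, BTRFS_BLOCK_GROUP_RAID0) == 0  (single)
         *
         * With rw_devices > 1:
         *   update_block_group_flags(root, BTRFS_BLOCK_GROUP_DUP)   == BTRFS_BLOCK_GROUP_RAID1
         *   update_block_group_flags(root, 0)                       == BTRFS_BLOCK_GROUP_RAID0
         */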
7608
7609 static int set_block_group_ro(struct btrfs_block_group_cache *cache)
7610 {
7611         struct btrfs_space_info *sinfo = cache->space_info;
7612         u64 num_bytes;
7613         int ret = -ENOSPC;
7614
7615         if (cache->ro)
7616                 return 0;
7617
7618         spin_lock(&sinfo->lock);
7619         spin_lock(&cache->lock);
7620         num_bytes = cache->key.offset - cache->reserved - cache->pinned -
7621                     cache->bytes_super - btrfs_block_group_used(&cache->item);
7622
7623         if (sinfo->bytes_used + sinfo->bytes_reserved + sinfo->bytes_pinned +
7624             sinfo->bytes_may_use + sinfo->bytes_readonly +
7625             cache->reserved_pinned + num_bytes < sinfo->total_bytes) {
7626                 sinfo->bytes_readonly += num_bytes;
7627                 sinfo->bytes_reserved += cache->reserved_pinned;
7628                 cache->reserved_pinned = 0;
7629                 cache->ro = 1;
7630                 ret = 0;
7631         }
7632         spin_unlock(&cache->lock);
7633         spin_unlock(&sinfo->lock);
7634         return ret;
7635 }
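
The condition above asks whether the space_info can absorb this group's
remaining free space as read-only without running out of allocatable room.
A worked example with made-up numbers (GiB units):

        /* total_bytes = 10; bytes_used = 4, bytes_reserved = 1,
         * bytes_pinned = 1, bytes_may_use = 1, bytes_readonly = 1,
         * reserved_pinned = 0, and the group's free space num_bytes = 1:
         *
         *   4 + 1 + 1 + 1 + 1 + 0 + 1 = 9 < 10  =>  mark the group read-only.
         *
         * Had the sum reached 10, the function would return -ENOSPC and the
         * caller (btrfs_set_block_group_ro below) would allocate a new chunk
         * and retry.
         */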
7636
7637 int btrfs_set_block_group_ro(struct btrfs_root *root,
7638                              struct btrfs_block_group_cache *cache)
7639
7640 {
7641         struct btrfs_trans_handle *trans;
7642         u64 alloc_flags;
7643         int ret;
7644
7645         BUG_ON(cache->ro);
7646
7647         trans = btrfs_join_transaction(root, 1);
7648         BUG_ON(IS_ERR(trans));
7649
7650         alloc_flags = update_block_group_flags(root, cache->flags);
7651         if (alloc_flags != cache->flags)
7652                 do_chunk_alloc(trans, root, 2 * 1024 * 1024, alloc_flags, 1);
7653
7654         ret = set_block_group_ro(cache);
7655         if (!ret)
7656                 goto out;
7657         alloc_flags = get_alloc_profile(root, cache->space_info->flags);
7658         ret = do_chunk_alloc(trans, root, 2 * 1024 * 1024, alloc_flags, 1);
7659         if (ret < 0)
7660                 goto out;
7661         ret = set_block_group_ro(cache);
7662 out:
7663         btrfs_end_transaction(trans, root);
7664         return ret;
7665 }
7666
7667 int btrfs_set_block_group_rw(struct btrfs_root *root,
7668                               struct btrfs_block_group_cache *cache)
7669 {
7670         struct btrfs_space_info *sinfo = cache->space_info;
7671         u64 num_bytes;
7672
7673         BUG_ON(!cache->ro);
7674
7675         spin_lock(&sinfo->lock);
7676         spin_lock(&cache->lock);
7677         num_bytes = cache->key.offset - cache->reserved - cache->pinned -
7678                     cache->bytes_super - btrfs_block_group_used(&cache->item);
7679         sinfo->bytes_readonly -= num_bytes;
7680         cache->ro = 0;
7681         spin_unlock(&cache->lock);
7682         spin_unlock(&sinfo->lock);
7683         return 0;
7684 }
7685
7686 /*
7687  * checks to see if it's even possible to relocate this block group.
7688  *
7689  * @return - -1 if it's not a good idea to relocate this block group, 0 if it's
7690  * ok to go ahead and try.
7691  */
7692 int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr)
7693 {
7694         struct btrfs_block_group_cache *block_group;
7695         struct btrfs_space_info *space_info;
7696         struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices;
7697         struct btrfs_device *device;
7698         int full = 0;
7699         int ret = 0;
7700
7701         block_group = btrfs_lookup_block_group(root->fs_info, bytenr);
7702
7703         /* odd, couldn't find the block group, leave it alone */
7704         if (!block_group)
7705                 return -1;
7706
7707         /* no bytes used, we're good */
7708         if (!btrfs_block_group_used(&block_group->item))
7709                 goto out;
7710
7711         space_info = block_group->space_info;
7712         spin_lock(&space_info->lock);
7713
7714         full = space_info->full;
7715
7716         /*
7717          * if this is the last block group we have in this space, we can't
7718          * relocate it unless we're able to allocate a new chunk below.
7719          *
7720          * Otherwise, we need to make sure we have room in the space to handle
7721          * all of the extents from this block group.  If we can, we're good.
7722          */
7723         if ((space_info->total_bytes != block_group->key.offset) &&
7724            (space_info->bytes_used + space_info->bytes_reserved +
7725             space_info->bytes_pinned + space_info->bytes_readonly +
7726             btrfs_block_group_used(&block_group->item) <
7727             space_info->total_bytes)) {
7728                 spin_unlock(&space_info->lock);
7729                 goto out;
7730         }
7731         spin_unlock(&space_info->lock);
7732
7733         /*
7734          * ok we don't have enough space, but maybe we have free space on our
7735          * devices to allocate new chunks for relocation, so loop through our
7736          * alloc devices and guess if we have enough space.  However, if we
7737          * were marked as full, then we know there aren't enough chunks, and we
7738          * can just return.
7739          */
7740         ret = -1;
7741         if (full)
7742                 goto out;
7743
7744         mutex_lock(&root->fs_info->chunk_mutex);
7745         list_for_each_entry(device, &fs_devices->alloc_list, dev_alloc_list) {
7746                 u64 min_free = btrfs_block_group_used(&block_group->item);
7747                 u64 dev_offset, max_avail;
7748
7749                 /*
7750                  * check to make sure we can actually find a chunk with enough
7751                  * space to fit our block group in.
7752                  */
7753                 if (device->total_bytes > device->bytes_used + min_free) {
7754                         ret = find_free_dev_extent(NULL, device, min_free,
7755                                                    &dev_offset, &max_avail);
7756                         if (!ret)
7757                                 break;
7758                         ret = -1;
7759                 }
7760         }
7761         mutex_unlock(&root->fs_info->chunk_mutex);
7762 out:
7763         btrfs_put_block_group(block_group);
7764         return ret;
7765 }
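
Put differently, relocation is allowed when either the rest of the space can
already hold this group's data, or some device still has a large enough hole
for a replacement chunk. With made-up numbers:

        /* total_bytes = 100 GiB, used + reserved + pinned + readonly = 90 GiB,
         * and this block group holds 0.5 GiB of data:
         *
         *   90 + 0.5 < 100  =>  room elsewhere, return 0 (ok to relocate).
         *
         * If the space_info were full instead, the device loop would need
         * find_free_dev_extent() to locate a min_free = 0.5 GiB hole.
         */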
7766
7767 static int find_first_block_group(struct btrfs_root *root,
7768                 struct btrfs_path *path, struct btrfs_key *key)
7769 {
7770         int ret = 0;
7771         struct btrfs_key found_key;
7772         struct extent_buffer *leaf;
7773         int slot;
7774
7775         ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
7776         if (ret < 0)
7777                 goto out;
7778
7779         while (1) {
7780                 slot = path->slots[0];
7781                 leaf = path->nodes[0];
7782                 if (slot >= btrfs_header_nritems(leaf)) {
7783                         ret = btrfs_next_leaf(root, path);
7784                         if (ret == 0)
7785                                 continue;
7786                         if (ret < 0)
7787                                 goto out;
7788                         break;
7789                 }
7790                 btrfs_item_key_to_cpu(leaf, &found_key, slot);
7791
7792                 if (found_key.objectid >= key->objectid &&
7793                     found_key.type == BTRFS_BLOCK_GROUP_ITEM_KEY) {
7794                         ret = 0;
7795                         goto out;
7796                 }
7797                 path->slots[0]++;
7798         }
7799 out:
7800         return ret;
7801 }
7802
7803 int btrfs_free_block_groups(struct btrfs_fs_info *info)
7804 {
7805         struct btrfs_block_group_cache *block_group;
7806         struct btrfs_space_info *space_info;
7807         struct btrfs_caching_control *caching_ctl;
7808         struct rb_node *n;
7809
7810         down_write(&info->extent_commit_sem);
7811         while (!list_empty(&info->caching_block_groups)) {
7812                 caching_ctl = list_entry(info->caching_block_groups.next,
7813                                          struct btrfs_caching_control, list);
7814                 list_del(&caching_ctl->list);
7815                 put_caching_control(caching_ctl);
7816         }
7817         up_write(&info->extent_commit_sem);
7818
7819         spin_lock(&info->block_group_cache_lock);
7820         while ((n = rb_last(&info->block_group_cache_tree)) != NULL) {
7821                 block_group = rb_entry(n, struct btrfs_block_group_cache,
7822                                        cache_node);
7823                 rb_erase(&block_group->cache_node,
7824                          &info->block_group_cache_tree);
7825                 spin_unlock(&info->block_group_cache_lock);
7826
7827                 down_write(&block_group->space_info->groups_sem);
7828                 list_del(&block_group->list);
7829                 up_write(&block_group->space_info->groups_sem);
7830
7831                 if (block_group->cached == BTRFS_CACHE_STARTED)
7832                         wait_block_group_cache_done(block_group);
7833
7834                 btrfs_remove_free_space_cache(block_group);
7835                 btrfs_put_block_group(block_group);
7836
7837                 spin_lock(&info->block_group_cache_lock);
7838         }
7839         spin_unlock(&info->block_group_cache_lock);
7840
7841         /* now that all the block groups are freed, go through and
7842          * free all the space_info structs.  This is only called during
7843          * the final stages of unmount, and so we know nobody is
7844          * using them.  We call synchronize_rcu() once before we start,
7845          * just to be on the safe side.
7846          */
7847         synchronize_rcu();
7848
7849         while (!list_empty(&info->space_info)) {
7850                 space_info = list_entry(info->space_info.next,
7851                                         struct btrfs_space_info,
7852                                         list);
7853                 if (space_info->bytes_pinned > 0 ||
7854                     space_info->bytes_reserved > 0) {
7855                         WARN_ON(1);
7856                         dump_space_info(space_info, 0, 0);
7857                 }
7858                 list_del(&space_info->list);
7859                 kfree(space_info);
7860         }
7861         return 0;
7862 }
7863
7864 static void __link_block_group(struct btrfs_space_info *space_info,
7865                                struct btrfs_block_group_cache *cache)
7866 {
7867         int index = get_block_group_index(cache);
7868
7869         down_write(&space_info->groups_sem);
7870         list_add_tail(&cache->list, &space_info->block_groups[index]);
7871         up_write(&space_info->groups_sem);
7872 }
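
For reference when reading the read-only loops at the end of
btrfs_read_block_groups() below, the index produced by
get_block_group_index() (defined earlier in this file) maps profiles as:

        /* index 0 = RAID10, 1 = RAID1, 2 = DUP, 3 = RAID0, 4 = single */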
7873
7874 int btrfs_read_block_groups(struct btrfs_root *root)
7875 {
7876         struct btrfs_path *path;
7877         int ret;
7878         struct btrfs_block_group_cache *cache;
7879         struct btrfs_fs_info *info = root->fs_info;
7880         struct btrfs_space_info *space_info;
7881         struct btrfs_key key;
7882         struct btrfs_key found_key;
7883         struct extent_buffer *leaf;
7884
7885         root = info->extent_root;
7886         key.objectid = 0;
7887         key.offset = 0;
7888         btrfs_set_key_type(&key, BTRFS_BLOCK_GROUP_ITEM_KEY);
7889         path = btrfs_alloc_path();
7890         if (!path)
7891                 return -ENOMEM;
7892
7893         while (1) {
7894                 ret = find_first_block_group(root, path, &key);
7895                 if (ret > 0)
7896                         break;
7897                 if (ret != 0)
7898                         goto error;
7899
7900                 leaf = path->nodes[0];
7901                 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
7902                 cache = kzalloc(sizeof(*cache), GFP_NOFS);
7903                 if (!cache) {
7904                         ret = -ENOMEM;
7905                         goto error;
7906                 }
7907
7908                 atomic_set(&cache->count, 1);
7909                 spin_lock_init(&cache->lock);
7910                 spin_lock_init(&cache->tree_lock);
7911                 cache->fs_info = info;
7912                 INIT_LIST_HEAD(&cache->list);
7913                 INIT_LIST_HEAD(&cache->cluster_list);
7914
7915                 /*
7916                  * we only want to have 32k of ram per block group for keeping
7917                  * track of free space, and if we pass 1/2 of that we want to
7918                  * start converting things over to using bitmaps
7919                  */
7920                 cache->extents_thresh = ((1024 * 32) / 2) /
7921                         sizeof(struct btrfs_free_space);
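                /*
                 * e.g. if sizeof(struct btrfs_free_space) is 64 bytes on a
                 * 64-bit build, this permits (32768 / 2) / 64 = 256
                 * separately tracked free extents before converting to
                 * bitmaps.
                 */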
7922
7923                 read_extent_buffer(leaf, &cache->item,
7924                                    btrfs_item_ptr_offset(leaf, path->slots[0]),
7925                                    sizeof(cache->item));
7926                 memcpy(&cache->key, &found_key, sizeof(found_key));
7927
7928                 key.objectid = found_key.objectid + found_key.offset;
7929                 btrfs_release_path(root, path);
7930                 cache->flags = btrfs_block_group_flags(&cache->item);
7931                 cache->sectorsize = root->sectorsize;
7932
7933                 /*
7934                  * check for two cases, either we are full, and therefore
7935                  * don't need to bother with the caching work since we won't
7936                  * find any space, or we are empty, and we can just add all
7937                  * the space in and be done with it.  This saves us a lot of
7938                  * time, particularly in the full case.
7939                  */
7940                 if (found_key.offset == btrfs_block_group_used(&cache->item)) {
7941                         exclude_super_stripes(root, cache);
7942                         cache->last_byte_to_unpin = (u64)-1;
7943                         cache->cached = BTRFS_CACHE_FINISHED;
7944                         free_excluded_extents(root, cache);
7945                 } else if (btrfs_block_group_used(&cache->item) == 0) {
7946                         exclude_super_stripes(root, cache);
7947                         cache->last_byte_to_unpin = (u64)-1;
7948                         cache->cached = BTRFS_CACHE_FINISHED;
7949                         add_new_free_space(cache, root->fs_info,
7950                                            found_key.objectid,
7951                                            found_key.objectid +
7952                                            found_key.offset);
7953                         free_excluded_extents(root, cache);
7954                 }
7955
7956                 ret = update_space_info(info, cache->flags, found_key.offset,
7957                                         btrfs_block_group_used(&cache->item),
7958                                         &space_info);
7959                 BUG_ON(ret);
7960                 cache->space_info = space_info;
7961                 spin_lock(&cache->space_info->lock);
7962                 cache->space_info->bytes_readonly += cache->bytes_super;
7963                 spin_unlock(&cache->space_info->lock);
7964
7965                 __link_block_group(space_info, cache);
7966
7967                 ret = btrfs_add_block_group_cache(root->fs_info, cache);
7968                 BUG_ON(ret);
7969
7970                 set_avail_alloc_bits(root->fs_info, cache->flags);
7971                 if (btrfs_chunk_readonly(root, cache->key.objectid))
7972                         set_block_group_ro(cache);
7973         }
7974
7975         list_for_each_entry_rcu(space_info, &root->fs_info->space_info, list) {
7976                 if (!(get_alloc_profile(root, space_info->flags) &
7977                       (BTRFS_BLOCK_GROUP_RAID10 |
7978                        BTRFS_BLOCK_GROUP_RAID1 |
7979                        BTRFS_BLOCK_GROUP_DUP)))
7980                         continue;
7981                 /*
7982                  * avoid allocating from un-mirrored block groups if there are
7983                  * mirrored block groups.
7984                  */
7985                 list_for_each_entry(cache, &space_info->block_groups[3], list)
7986                         set_block_group_ro(cache);      /* RAID0 groups */
7987                 list_for_each_entry(cache, &space_info->block_groups[4], list)
7988                         set_block_group_ro(cache);      /* single (unmirrored) groups */
7989         }
7990
7991         init_global_block_rsv(info);
7992         ret = 0;
7993 error:
7994         btrfs_free_path(path);
7995         return ret;
7996 }
7997
7998 int btrfs_make_block_group(struct btrfs_trans_handle *trans,
7999                            struct btrfs_root *root, u64 bytes_used,
8000                            u64 type, u64 chunk_objectid, u64 chunk_offset,
8001                            u64 size)
8002 {
8003         int ret;
8004         struct btrfs_root *extent_root;
8005         struct btrfs_block_group_cache *cache;
8006
8007         extent_root = root->fs_info->extent_root;
8008
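        /*
         * Creating a block group cannot be replayed from the tree log, so
         * force any tree-log sync in this transaction to fall back to a
         * full commit.
         */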
8009         root->fs_info->last_trans_log_full_commit = trans->transid;
8010
8011         cache = kzalloc(sizeof(*cache), GFP_NOFS);
8012         if (!cache)
8013                 return -ENOMEM;
8014
8015         cache->key.objectid = chunk_offset;
8016         cache->key.offset = size;
8017         cache->key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
8018         cache->sectorsize = root->sectorsize;
8019
8020         /*
8021          * We only want to use 32k of RAM per block group for keeping track
8022          * of free space; if we pass 1/2 of that we want to start
8023          * converting things over to using bitmaps.
8024          */
8025         cache->extents_thresh = ((1024 * 32) / 2) /
8026                 sizeof(struct btrfs_free_space);
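        /*
         * As a worked example: if struct btrfs_free_space is 64 bytes (a
         * plausible 64-bit layout, not a guaranteed size), the threshold
         * is (32768 / 2) / 64 = 256 cached extents before free space
         * tracking converts to bitmaps.
         */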
8027         atomic_set(&cache->count, 1);
8028         spin_lock_init(&cache->lock);
8029         spin_lock_init(&cache->tree_lock);
8030         INIT_LIST_HEAD(&cache->list);
8031         INIT_LIST_HEAD(&cache->cluster_list);
8032
8033         btrfs_set_block_group_used(&cache->item, bytes_used);
8034         btrfs_set_block_group_chunk_objectid(&cache->item, chunk_objectid);
8035         cache->flags = type;
8036         btrfs_set_block_group_flags(&cache->item, type);
8037
8038         cache->last_byte_to_unpin = (u64)-1;
8039         cache->cached = BTRFS_CACHE_FINISHED;
8040         exclude_super_stripes(root, cache);
8041
8042         add_new_free_space(cache, root->fs_info, chunk_offset,
8043                            chunk_offset + size);
8044
8045         free_excluded_extents(root, cache);
8046
8047         ret = update_space_info(root->fs_info, cache->flags, size, bytes_used,
8048                                 &cache->space_info);
8049         BUG_ON(ret);
8050
8051         spin_lock(&cache->space_info->lock);
8052         cache->space_info->bytes_readonly += cache->bytes_super;
8053         spin_unlock(&cache->space_info->lock);
8054
8055         __link_block_group(cache->space_info, cache);
8056
8057         ret = btrfs_add_block_group_cache(root->fs_info, cache);
8058         BUG_ON(ret);
8059
8060         ret = btrfs_insert_item(trans, extent_root, &cache->key, &cache->item,
8061                                 sizeof(cache->item));
8062         BUG_ON(ret);
8063
8064         set_avail_alloc_bits(extent_root->fs_info, type);
8065
8066         return 0;
8067 }
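
/*
 * A typical caller is the chunk allocation path in volumes.c; roughly
 * (a sketch, argument names illustrative):
 *
 *      ret = btrfs_make_block_group(trans, extent_root, 0, type,
 *                                   BTRFS_FIRST_CHUNK_TREE_OBJECTID,
 *                                   chunk_start, chunk_size);
 *
 * A freshly allocated chunk starts with zero bytes used, and its block
 * group item is keyed by the chunk's logical start offset.
 */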
8068
8069 int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
8070                              struct btrfs_root *root, u64 group_start)
8071 {
8072         struct btrfs_path *path;
8073         struct btrfs_block_group_cache *block_group;
8074         struct btrfs_free_cluster *cluster;
8075         struct btrfs_key key;
8076         int ret;
8077
8078         root = root->fs_info->extent_root;
8079
8080         block_group = btrfs_lookup_block_group(root->fs_info, group_start);
8081         BUG_ON(!block_group);
8082         BUG_ON(!block_group->ro);
8083
8084         memcpy(&key, &block_group->key, sizeof(key));
8085
8086         /* make sure this block group isn't part of an allocation cluster */
8087         cluster = &root->fs_info->data_alloc_cluster;
8088         spin_lock(&cluster->refill_lock);
8089         btrfs_return_cluster_to_free_space(block_group, cluster);
8090         spin_unlock(&cluster->refill_lock);
8091
8092         /*
8093          * make sure this block group isn't part of a metadata
8094          * allocation cluster
8095          */
8096         cluster = &root->fs_info->meta_alloc_cluster;
8097         spin_lock(&cluster->refill_lock);
8098         btrfs_return_cluster_to_free_space(block_group, cluster);
8099         spin_unlock(&cluster->refill_lock);
8100
8101         path = btrfs_alloc_path();
8102         BUG_ON(!path);
8103
8104         spin_lock(&root->fs_info->block_group_cache_lock);
8105         rb_erase(&block_group->cache_node,
8106                  &root->fs_info->block_group_cache_tree);
8107         spin_unlock(&root->fs_info->block_group_cache_lock);
8108
8109         down_write(&block_group->space_info->groups_sem);
8110         /*
8111          * we must use list_del_init so people can check to see if they
8112          * are still on the list after taking the semaphore
8113          */
8114         list_del_init(&block_group->list);
8115         up_write(&block_group->space_info->groups_sem);
8116
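        /*
         * If the async caching thread is still scanning this group, wait
         * for it to finish before tearing down the free space cache it is
         * populating.
         */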
8117         if (block_group->cached == BTRFS_CACHE_STARTED)
8118                 wait_block_group_cache_done(block_group);
8119
8120         btrfs_remove_free_space_cache(block_group);
8121
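        /*
         * The group is read-only (asserted above), so its whole size was
         * counted in bytes_readonly; back both totals out.
         */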
8122         spin_lock(&block_group->space_info->lock);
8123         block_group->space_info->total_bytes -= block_group->key.offset;
8124         block_group->space_info->bytes_readonly -= block_group->key.offset;
8125         spin_unlock(&block_group->space_info->lock);
8126
8127         btrfs_clear_space_info_full(root->fs_info);
8128
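        /*
         * Two puts: the first drops the reference taken by
         * btrfs_lookup_block_group() above, the second drops the original
         * reference held for the block group cache rbtree that was erased
         * earlier.
         */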
8129         btrfs_put_block_group(block_group);
8130         btrfs_put_block_group(block_group);
8131
8132         ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
8133         if (ret > 0)
8134                 ret = -EIO;
8135         if (ret < 0)
8136                 goto out;
8137
8138         ret = btrfs_del_item(trans, root, path);
8139 out:
8140         btrfs_free_path(path);
8141         return ret;
8142 }