/*
 * Copyright (C) 2007 Oracle.  All rights reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public
 * License v2 as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public
 * License along with this program; if not, write to the
 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
 * Boston, MA 021110-1307, USA.
 */
#include <linux/sched.h>
#include <linux/pagemap.h>
#include <linux/writeback.h>
#include <linux/blkdev.h>
#include <linux/sort.h>
#include <linux/rcupdate.h>
#include <linux/kthread.h>
#include <linux/slab.h>
#include "compat.h"
#include "hash.h"
#include "ctree.h"
#include "disk-io.h"
#include "print-tree.h"
#include "transaction.h"
#include "volumes.h"
#include "locking.h"
#include "free-space-cache.h"

static int update_block_group(struct btrfs_trans_handle *trans,
			      struct btrfs_root *root,
			      u64 bytenr, u64 num_bytes, int alloc,
			      int mark_free);
static int update_reserved_extents(struct btrfs_block_group_cache *cache,
				   u64 num_bytes, int reserve);
static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
			       struct btrfs_root *root,
			       u64 bytenr, u64 num_bytes, u64 parent,
			       u64 root_objectid, u64 owner_objectid,
			       u64 owner_offset, int refs_to_drop,
			       struct btrfs_delayed_extent_op *extra_op);
static void __run_delayed_extent_op(struct btrfs_delayed_extent_op *extent_op,
				    struct extent_buffer *leaf,
				    struct btrfs_extent_item *ei);
static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
				      struct btrfs_root *root,
				      u64 parent, u64 root_objectid,
				      u64 flags, u64 owner, u64 offset,
				      struct btrfs_key *ins, int ref_mod);
static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
				     struct btrfs_root *root,
				     u64 parent, u64 root_objectid,
				     u64 flags, struct btrfs_disk_key *key,
				     int level, struct btrfs_key *ins);
static int do_chunk_alloc(struct btrfs_trans_handle *trans,
			  struct btrfs_root *extent_root, u64 alloc_bytes,
			  u64 flags, int force);
static int pin_down_bytes(struct btrfs_trans_handle *trans,
			  struct btrfs_root *root,
			  struct btrfs_path *path,
			  u64 bytenr, u64 num_bytes,
			  int is_data, int reserved,
			  struct extent_buffer **must_clean);
static int find_next_key(struct btrfs_path *path, int level,
			 struct btrfs_key *key);
static void dump_space_info(struct btrfs_space_info *info, u64 bytes,
			    int dump_block_groups);

static noinline int
block_group_cache_done(struct btrfs_block_group_cache *cache)
{
	smp_mb();
	return cache->cached == BTRFS_CACHE_FINISHED;
}

static int block_group_bits(struct btrfs_block_group_cache *cache, u64 bits)
{
	return (cache->flags & bits) == bits;
}

void btrfs_get_block_group(struct btrfs_block_group_cache *cache)
{
	atomic_inc(&cache->count);
}

void btrfs_put_block_group(struct btrfs_block_group_cache *cache)
{
	if (atomic_dec_and_test(&cache->count))
		kfree(cache);
}

/*
 * this adds the block group to the fs_info rb tree for the block group
 * cache
 */
static int btrfs_add_block_group_cache(struct btrfs_fs_info *info,
				struct btrfs_block_group_cache *block_group)
{
	struct rb_node **p;
	struct rb_node *parent = NULL;
	struct btrfs_block_group_cache *cache;

	spin_lock(&info->block_group_cache_lock);
	p = &info->block_group_cache_tree.rb_node;

	while (*p) {
		parent = *p;
		cache = rb_entry(parent, struct btrfs_block_group_cache,
				 cache_node);
		if (block_group->key.objectid < cache->key.objectid) {
			p = &(*p)->rb_left;
		} else if (block_group->key.objectid > cache->key.objectid) {
			p = &(*p)->rb_right;
		} else {
			spin_unlock(&info->block_group_cache_lock);
			return -EEXIST;
		}
	}

	rb_link_node(&block_group->cache_node, parent, p);
	rb_insert_color(&block_group->cache_node,
			&info->block_group_cache_tree);
	spin_unlock(&info->block_group_cache_lock);

	return 0;
}

/*
 * This will return the block group at or after bytenr if contains is 0, else
 * it will return the block group that contains the bytenr
 */
static struct btrfs_block_group_cache *
block_group_cache_tree_search(struct btrfs_fs_info *info, u64 bytenr,
			      int contains)
{
	struct btrfs_block_group_cache *cache, *ret = NULL;
	struct rb_node *n;
	u64 end, start;

	spin_lock(&info->block_group_cache_lock);
	n = info->block_group_cache_tree.rb_node;

	while (n) {
		cache = rb_entry(n, struct btrfs_block_group_cache,
				 cache_node);
		end = cache->key.objectid + cache->key.offset - 1;
		start = cache->key.objectid;

		if (bytenr < start) {
			if (!contains && (!ret || start < ret->key.objectid))
				ret = cache;
			n = n->rb_left;
		} else if (bytenr > start) {
			if (contains && bytenr <= end) {
				ret = cache;
				break;
			}
			n = n->rb_right;
		} else {
			ret = cache;
			break;
		}
	}
	if (ret)
		btrfs_get_block_group(ret);
	spin_unlock(&info->block_group_cache_lock);

	return ret;
}

static int add_excluded_extent(struct btrfs_root *root,
			       u64 start, u64 num_bytes)
{
	u64 end = start + num_bytes - 1;
	set_extent_bits(&root->fs_info->freed_extents[0],
			start, end, EXTENT_UPTODATE, GFP_NOFS);
	set_extent_bits(&root->fs_info->freed_extents[1],
			start, end, EXTENT_UPTODATE, GFP_NOFS);
	return 0;
}

static void free_excluded_extents(struct btrfs_root *root,
				  struct btrfs_block_group_cache *cache)
{
	u64 start, end;

	start = cache->key.objectid;
	end = start + cache->key.offset - 1;

	clear_extent_bits(&root->fs_info->freed_extents[0],
			  start, end, EXTENT_UPTODATE, GFP_NOFS);
	clear_extent_bits(&root->fs_info->freed_extents[1],
			  start, end, EXTENT_UPTODATE, GFP_NOFS);
}

static int exclude_super_stripes(struct btrfs_root *root,
				 struct btrfs_block_group_cache *cache)
{
	u64 bytenr;
	u64 *logical;
	int stripe_len;
	int i, nr, ret;

	if (cache->key.objectid < BTRFS_SUPER_INFO_OFFSET) {
		stripe_len = BTRFS_SUPER_INFO_OFFSET - cache->key.objectid;
		cache->bytes_super += stripe_len;
		ret = add_excluded_extent(root, cache->key.objectid,
					  stripe_len);
		BUG_ON(ret);
	}

	for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
		bytenr = btrfs_sb_offset(i);
		ret = btrfs_rmap_block(&root->fs_info->mapping_tree,
				       cache->key.objectid, bytenr,
				       0, &logical, &nr, &stripe_len);
		BUG_ON(ret);

		while (nr--) {
			cache->bytes_super += stripe_len;
			ret = add_excluded_extent(root, logical[nr],
						  stripe_len);
			BUG_ON(ret);
		}

		kfree(logical);
	}
	return 0;
}

static struct btrfs_caching_control *
get_caching_control(struct btrfs_block_group_cache *cache)
{
	struct btrfs_caching_control *ctl;

	spin_lock(&cache->lock);
	if (cache->cached != BTRFS_CACHE_STARTED) {
		spin_unlock(&cache->lock);
		return NULL;
	}

	ctl = cache->caching_ctl;
	atomic_inc(&ctl->count);
	spin_unlock(&cache->lock);
	return ctl;
}

static void put_caching_control(struct btrfs_caching_control *ctl)
{
	if (atomic_dec_and_test(&ctl->count))
		kfree(ctl);
}

/*
 * This is only called by cache_block_group. Since we could have freed
 * extents, we need to check the pinned_extents for any extents that can't
 * be used yet, because their free space will be released as soon as the
 * transaction commits.
 */
static u64 add_new_free_space(struct btrfs_block_group_cache *block_group,
			      struct btrfs_fs_info *info, u64 start, u64 end)
{
	u64 extent_start, extent_end, size, total_added = 0;
	int ret;

	while (start < end) {
		ret = find_first_extent_bit(info->pinned_extents, start,
					    &extent_start, &extent_end,
					    EXTENT_DIRTY | EXTENT_UPTODATE);
		if (ret)
			break;

		if (extent_start <= start) {
			start = extent_end + 1;
		} else if (extent_start > start && extent_start < end) {
			size = extent_start - start;
			total_added += size;
			ret = btrfs_add_free_space(block_group, start,
						   size);
			BUG_ON(ret);
			start = extent_end + 1;
		} else {
			break;
		}
	}

	if (start < end) {
		size = end - start;
		total_added += size;
		ret = btrfs_add_free_space(block_group, start, size);
		BUG_ON(ret);
	}

	return total_added;
}

static int caching_kthread(void *data)
{
	struct btrfs_block_group_cache *block_group = data;
	struct btrfs_fs_info *fs_info = block_group->fs_info;
	struct btrfs_caching_control *caching_ctl = block_group->caching_ctl;
	struct btrfs_root *extent_root = fs_info->extent_root;
	struct btrfs_path *path;
	struct extent_buffer *leaf;
	struct btrfs_key key;
	u64 total_found = 0;
	u64 last = 0;
	u32 nritems;
	int ret = 0;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	exclude_super_stripes(extent_root, block_group);
	spin_lock(&block_group->space_info->lock);
	block_group->space_info->bytes_super += block_group->bytes_super;
	spin_unlock(&block_group->space_info->lock);

	last = max_t(u64, block_group->key.objectid, BTRFS_SUPER_INFO_OFFSET);

	/*
	 * We don't want to deadlock with somebody trying to allocate a new
	 * extent for the extent root while also trying to search the extent
	 * root to add free space. So we skip locking and search the commit
	 * root, since it's read-only.
	 */
	path->skip_locking = 1;
	path->search_commit_root = 1;
	path->reada = 2;

	key.objectid = last;
	key.offset = 0;
	key.type = BTRFS_EXTENT_ITEM_KEY;
again:
	mutex_lock(&caching_ctl->mutex);
	/* need to make sure the commit_root doesn't disappear */
	down_read(&fs_info->extent_commit_sem);

	ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
	if (ret < 0)
		goto err;

	leaf = path->nodes[0];
	nritems = btrfs_header_nritems(leaf);

	while (1) {
		smp_mb();
		if (fs_info->closing > 1) {
			last = (u64)-1;
			break;
		}

		if (path->slots[0] < nritems) {
			btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
		} else {
			ret = find_next_key(path, 0, &key);
			if (ret)
				break;

			caching_ctl->progress = last;
			btrfs_release_path(extent_root, path);
			up_read(&fs_info->extent_commit_sem);
			mutex_unlock(&caching_ctl->mutex);
			if (btrfs_transaction_in_commit(fs_info))
				schedule_timeout(1);
			else
				cond_resched();
			goto again;
		}

		if (key.objectid < block_group->key.objectid) {
			path->slots[0]++;
			continue;
		}

		if (key.objectid >= block_group->key.objectid +
		    block_group->key.offset)
			break;

		if (key.type == BTRFS_EXTENT_ITEM_KEY) {
			total_found += add_new_free_space(block_group,
							  fs_info, last,
							  key.objectid);
			last = key.objectid + key.offset;

			if (total_found > (1024 * 1024 * 2)) {
				total_found = 0;
				wake_up(&caching_ctl->wait);
			}
		}
		path->slots[0]++;
	}
	ret = 0;

	total_found += add_new_free_space(block_group, fs_info, last,
					  block_group->key.objectid +
					  block_group->key.offset);
	caching_ctl->progress = (u64)-1;

	spin_lock(&block_group->lock);
	block_group->caching_ctl = NULL;
	block_group->cached = BTRFS_CACHE_FINISHED;
	spin_unlock(&block_group->lock);

err:
	btrfs_free_path(path);
	up_read(&fs_info->extent_commit_sem);

	free_excluded_extents(extent_root, block_group);

	mutex_unlock(&caching_ctl->mutex);
	wake_up(&caching_ctl->wait);

	put_caching_control(caching_ctl);
	atomic_dec(&block_group->space_info->caching_threads);
	btrfs_put_block_group(block_group);

	return 0;
}

static int cache_block_group(struct btrfs_block_group_cache *cache)
{
	struct btrfs_fs_info *fs_info = cache->fs_info;
	struct btrfs_caching_control *caching_ctl;
	struct task_struct *tsk;
	int ret = 0;

	smp_mb();
	if (cache->cached != BTRFS_CACHE_NO)
		return 0;

	caching_ctl = kzalloc(sizeof(*caching_ctl), GFP_KERNEL);
	BUG_ON(!caching_ctl);

	INIT_LIST_HEAD(&caching_ctl->list);
	mutex_init(&caching_ctl->mutex);
	init_waitqueue_head(&caching_ctl->wait);
	caching_ctl->block_group = cache;
	caching_ctl->progress = cache->key.objectid;
	/* one for caching kthread, one for caching block group list */
	atomic_set(&caching_ctl->count, 2);

	spin_lock(&cache->lock);
	if (cache->cached != BTRFS_CACHE_NO) {
		spin_unlock(&cache->lock);
		kfree(caching_ctl);
		return 0;
	}
	cache->caching_ctl = caching_ctl;
	cache->cached = BTRFS_CACHE_STARTED;
	spin_unlock(&cache->lock);

	down_write(&fs_info->extent_commit_sem);
	list_add_tail(&caching_ctl->list, &fs_info->caching_block_groups);
	up_write(&fs_info->extent_commit_sem);

	atomic_inc(&cache->space_info->caching_threads);
	btrfs_get_block_group(cache);

	tsk = kthread_run(caching_kthread, cache, "btrfs-cache-%llu\n",
			  cache->key.objectid);
	if (IS_ERR(tsk)) {
		ret = PTR_ERR(tsk);
		printk(KERN_ERR "error running thread %d\n", ret);
		BUG();
	}

	return ret;
}

/*
 * return the block group that starts at or after bytenr
 */
static struct btrfs_block_group_cache *
btrfs_lookup_first_block_group(struct btrfs_fs_info *info, u64 bytenr)
{
	struct btrfs_block_group_cache *cache;

	cache = block_group_cache_tree_search(info, bytenr, 0);

	return cache;
}

/*
 * return the block group that contains the given bytenr
 */
struct btrfs_block_group_cache *btrfs_lookup_block_group(
						 struct btrfs_fs_info *info,
						 u64 bytenr)
{
	struct btrfs_block_group_cache *cache;

	cache = block_group_cache_tree_search(info, bytenr, 1);

	return cache;
}

static struct btrfs_space_info *__find_space_info(struct btrfs_fs_info *info,
						  u64 flags)
{
	struct list_head *head = &info->space_info;
	struct btrfs_space_info *found;

	flags &= BTRFS_BLOCK_GROUP_DATA | BTRFS_BLOCK_GROUP_SYSTEM |
		 BTRFS_BLOCK_GROUP_METADATA;

	rcu_read_lock();
	list_for_each_entry_rcu(found, head, list) {
		if (found->flags == flags) {
			rcu_read_unlock();
			return found;
		}
	}
	rcu_read_unlock();
	return NULL;
}

/*
 * after adding space to the filesystem, we need to clear the full flags
 * on all the space infos.
 */
void btrfs_clear_space_info_full(struct btrfs_fs_info *info)
{
	struct list_head *head = &info->space_info;
	struct btrfs_space_info *found;

	rcu_read_lock();
	list_for_each_entry_rcu(found, head, list)
		found->full = 0;
	rcu_read_unlock();
}

static u64 div_factor(u64 num, int factor)
{
	if (factor == 10)
		return num;
	num *= factor;
	do_div(num, 10);
	return num;
}
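
/*
 * factor is expressed in tenths: for example, div_factor(1024, 9)
 * returns 921, roughly 90% of num. btrfs_find_block_group() below uses
 * this to check whether a block group's usage is under a threshold.
 */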

u64 btrfs_find_block_group(struct btrfs_root *root,
			   u64 search_start, u64 search_hint, int owner)
{
	struct btrfs_block_group_cache *cache;
	u64 used;
	u64 last = max(search_hint, search_start);
	u64 group_start = 0;
	int full_search = 0;
	int factor = 9;
	int wrapped = 0;
again:
	while (1) {
		cache = btrfs_lookup_first_block_group(root->fs_info, last);
		if (!cache)
			break;

		spin_lock(&cache->lock);
		last = cache->key.objectid + cache->key.offset;
		used = btrfs_block_group_used(&cache->item);

		if ((full_search || !cache->ro) &&
		    block_group_bits(cache, BTRFS_BLOCK_GROUP_METADATA)) {
			if (used + cache->pinned + cache->reserved <
			    div_factor(cache->key.offset, factor)) {
				group_start = cache->key.objectid;
				spin_unlock(&cache->lock);
				btrfs_put_block_group(cache);
				goto found;
			}
		}
		spin_unlock(&cache->lock);
		btrfs_put_block_group(cache);
		cond_resched();
	}
	if (!wrapped) {
		last = search_start;
		wrapped = 1;
		goto again;
	}
	if (!full_search && factor < 10) {
		last = search_start;
		full_search = 1;
		factor = 10;
		goto again;
	}
found:
	return group_start;
}

/* simple helper to search for an existing extent at a given offset */
int btrfs_lookup_extent(struct btrfs_root *root, u64 start, u64 len)
{
	int ret;
	struct btrfs_key key;
	struct btrfs_path *path;

	path = btrfs_alloc_path();
	BUG_ON(!path);
	key.objectid = start;
	key.offset = len;
	btrfs_set_key_type(&key, BTRFS_EXTENT_ITEM_KEY);
	ret = btrfs_search_slot(NULL, root->fs_info->extent_root, &key, path,
				0, 0);
	btrfs_free_path(path);
	return ret;
}

/*
 * Back reference rules.  Back refs have three main goals:
 *
 * 1) differentiate between all holders of references to an extent so that
 *    when a reference is dropped we can make sure it was a valid reference
 *    before freeing the extent.
 *
 * 2) Provide enough information to quickly find the holders of an extent
 *    if we notice a given block is corrupted or bad.
 *
 * 3) Make it easy to migrate blocks for FS shrinking or storage pool
 *    maintenance.  This is actually the same as #2, but with a slightly
 *    different use case.
 *
 * There are two kinds of back refs. The implicit back refs is optimized
 * for pointers in non-shared tree blocks. For a given pointer in a block,
 * back refs of this kind provide information about the block's owner tree
 * and the pointer's key. This information allows us to find the block by
 * b-tree searching. The full back refs is for pointers in tree blocks not
 * referenced by their owner trees. The location of the tree block is
 * recorded in the back refs. Actually the full back refs is generic, and
 * can be used in all cases the implicit back refs is used. The major
 * shortcoming of the full back refs is its overhead. Every time a tree
 * block gets COWed, we have to update back refs entries for all pointers
 * in it.
 *
 * For a newly allocated tree block, we use implicit back refs for
 * pointers in it. This means most tree related operations only involve
 * implicit back refs. For a tree block created in an old transaction, the
 * only way to drop a reference to it is to COW it. So we can detect the
 * event that a tree block loses its owner tree's reference and do the
 * back refs conversion.
 *
 * When a tree block is COWed through a tree, there are four cases:
 *
 * The reference count of the block is one and the tree is the block's
 * owner tree. Nothing to do in this case.
 *
 * The reference count of the block is one and the tree is not the
 * block's owner tree. In this case, full back refs is used for pointers
 * in the block. Remove these full back refs, add implicit back refs for
 * every pointer in the new block.
 *
 * The reference count of the block is greater than one and the tree is
 * the block's owner tree. In this case, implicit back refs is used for
 * pointers in the block. Add full back refs for every pointer in the
 * block, increase lower level extents' reference counts. The original
 * implicit back refs are entailed to the new block.
 *
 * The reference count of the block is greater than one and the tree is
 * not the block's owner tree. Add implicit back refs for every pointer in
 * the new block, increase lower level extents' reference count.
 *
 * Back Reference Key composition:
 *
 * The key objectid corresponds to the first byte in the extent.
 * The key type is used to differentiate between types of back refs.
 * There are different meanings of the key offset for different types
 * of back refs.
 *
 * File extents can be referenced by:
 *
 * - multiple snapshots, subvolumes, or different generations in one subvol
 * - different files inside a single subvolume
 * - different offsets inside a file (bookend extents in file.c)
 *
 * The extent ref structure for the implicit back refs has fields for:
 *
 * - Objectid of the subvolume root
 * - objectid of the file holding the reference
 * - original offset in the file
 * - how many bookend extents
 *
 * The key offset for the implicit back refs is the hash of the first
 * three fields.
 *
 * The extent ref structure for the full back refs has a field for:
 *
 * - number of pointers in the tree leaf
 *
 * The key offset for the full back refs is the first byte of
 * the tree leaf.
 *
 * When a file extent is allocated, the implicit back refs is used.
 * The fields are filled in:
 *
 *     (root_key.objectid, inode objectid, offset in file, 1)
 *
 * When a file extent is removed during file truncation, we find the
 * corresponding implicit back refs and check the following fields:
 *
 *     (btrfs_header_owner(leaf), inode objectid, offset in file)
 *
 * Btree extents can be referenced by:
 *
 * - Different subvolumes
 *
 * Both the implicit back refs and the full back refs for tree blocks
 * only consist of a key. The key offset for the implicit back refs is the
 * objectid of the block's owner tree. The key offset for the full back
 * refs is the first byte of the parent block.
 *
 * When implicit back refs is used, information about the lowest key and
 * level of the tree block are required. This information is stored in
 * the tree block info structure.
 */

#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
static int convert_extent_item_v0(struct btrfs_trans_handle *trans,
				  struct btrfs_root *root,
				  struct btrfs_path *path,
				  u64 owner, u32 extra_size)
{
	struct btrfs_extent_item *item;
	struct btrfs_extent_item_v0 *ei0;
	struct btrfs_extent_ref_v0 *ref0;
	struct btrfs_tree_block_info *bi;
	struct extent_buffer *leaf;
	struct btrfs_key key;
	struct btrfs_key found_key;
	u32 new_size = sizeof(*item);
	u64 refs;
	int ret;

	leaf = path->nodes[0];
	BUG_ON(btrfs_item_size_nr(leaf, path->slots[0]) != sizeof(*ei0));

	btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
	ei0 = btrfs_item_ptr(leaf, path->slots[0],
			     struct btrfs_extent_item_v0);
	refs = btrfs_extent_refs_v0(leaf, ei0);

	if (owner == (u64)-1) {
		while (1) {
			if (path->slots[0] >= btrfs_header_nritems(leaf)) {
				ret = btrfs_next_leaf(root, path);
				if (ret < 0)
					return ret;
				BUG_ON(ret > 0);
				leaf = path->nodes[0];
			}
			btrfs_item_key_to_cpu(leaf, &found_key,
					      path->slots[0]);
			BUG_ON(key.objectid != found_key.objectid);
			if (found_key.type != BTRFS_EXTENT_REF_V0_KEY) {
				path->slots[0]++;
				continue;
			}
			ref0 = btrfs_item_ptr(leaf, path->slots[0],
					      struct btrfs_extent_ref_v0);
			owner = btrfs_ref_objectid_v0(leaf, ref0);
			break;
		}
	}
	btrfs_release_path(root, path);

	if (owner < BTRFS_FIRST_FREE_OBJECTID)
		new_size += sizeof(*bi);

	new_size -= sizeof(*ei0);
	ret = btrfs_search_slot(trans, root, &key, path,
				new_size + extra_size, 1);
	if (ret < 0)
		return ret;
	BUG_ON(ret);

	ret = btrfs_extend_item(trans, root, path, new_size);
	BUG_ON(ret);

	leaf = path->nodes[0];
	item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
	btrfs_set_extent_refs(leaf, item, refs);
	/* FIXME: get real generation */
	btrfs_set_extent_generation(leaf, item, 0);
	if (owner < BTRFS_FIRST_FREE_OBJECTID) {
		btrfs_set_extent_flags(leaf, item,
				       BTRFS_EXTENT_FLAG_TREE_BLOCK |
				       BTRFS_BLOCK_FLAG_FULL_BACKREF);
		bi = (struct btrfs_tree_block_info *)(item + 1);
		/* FIXME: get first key of the block */
		memset_extent_buffer(leaf, 0, (unsigned long)bi, sizeof(*bi));
		btrfs_set_tree_block_level(leaf, bi, (int)owner);
	} else {
		btrfs_set_extent_flags(leaf, item, BTRFS_EXTENT_FLAG_DATA);
	}
	btrfs_mark_buffer_dirty(leaf);
	return 0;
}
#endif

static u64 hash_extent_data_ref(u64 root_objectid, u64 owner, u64 offset)
{
	u32 high_crc = ~(u32)0;
	u32 low_crc = ~(u32)0;
	__le64 lenum;

	lenum = cpu_to_le64(root_objectid);
	high_crc = crc32c(high_crc, &lenum, sizeof(lenum));
	lenum = cpu_to_le64(owner);
	low_crc = crc32c(low_crc, &lenum, sizeof(lenum));
	lenum = cpu_to_le64(offset);
	low_crc = crc32c(low_crc, &lenum, sizeof(lenum));

	return ((u64)high_crc << 31) ^ (u64)low_crc;
}

static u64 hash_extent_data_ref_item(struct extent_buffer *leaf,
				     struct btrfs_extent_data_ref *ref)
{
	return hash_extent_data_ref(btrfs_extent_data_ref_root(leaf, ref),
				    btrfs_extent_data_ref_objectid(leaf, ref),
				    btrfs_extent_data_ref_offset(leaf, ref));
}

static int match_extent_data_ref(struct extent_buffer *leaf,
				 struct btrfs_extent_data_ref *ref,
				 u64 root_objectid, u64 owner, u64 offset)
{
	if (btrfs_extent_data_ref_root(leaf, ref) != root_objectid ||
	    btrfs_extent_data_ref_objectid(leaf, ref) != owner ||
	    btrfs_extent_data_ref_offset(leaf, ref) != offset)
		return 0;
	return 1;
}

static noinline int lookup_extent_data_ref(struct btrfs_trans_handle *trans,
					   struct btrfs_root *root,
					   struct btrfs_path *path,
					   u64 bytenr, u64 parent,
					   u64 root_objectid,
					   u64 owner, u64 offset)
{
	struct btrfs_key key;
	struct btrfs_extent_data_ref *ref;
	struct extent_buffer *leaf;
	u32 nritems;
	int ret;
	int recow;
	int err = -ENOENT;

	key.objectid = bytenr;
	if (parent) {
		key.type = BTRFS_SHARED_DATA_REF_KEY;
		key.offset = parent;
	} else {
		key.type = BTRFS_EXTENT_DATA_REF_KEY;
		key.offset = hash_extent_data_ref(root_objectid,
						  owner, offset);
	}
again:
	recow = 0;
	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
	if (ret < 0) {
		err = ret;
		goto fail;
	}

	if (parent) {
		if (!ret)
			return 0;
#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
		key.type = BTRFS_EXTENT_REF_V0_KEY;
		btrfs_release_path(root, path);
		ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
		if (ret < 0) {
			err = ret;
			goto fail;
		}
		if (!ret)
			return 0;
#endif
		goto fail;
	}

	leaf = path->nodes[0];
	nritems = btrfs_header_nritems(leaf);
	while (1) {
		if (path->slots[0] >= nritems) {
			ret = btrfs_next_leaf(root, path);
			if (ret < 0)
				err = ret;
			if (ret)
				goto fail;

			leaf = path->nodes[0];
			nritems = btrfs_header_nritems(leaf);
			recow = 1;
		}

		btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
		if (key.objectid != bytenr ||
		    key.type != BTRFS_EXTENT_DATA_REF_KEY)
			goto fail;

		ref = btrfs_item_ptr(leaf, path->slots[0],
				     struct btrfs_extent_data_ref);

		if (match_extent_data_ref(leaf, ref, root_objectid,
					  owner, offset)) {
			if (recow) {
				btrfs_release_path(root, path);
				goto again;
			}
			err = 0;
			break;
		}
		path->slots[0]++;
	}
fail:
	return err;
}

static noinline int insert_extent_data_ref(struct btrfs_trans_handle *trans,
					   struct btrfs_root *root,
					   struct btrfs_path *path,
					   u64 bytenr, u64 parent,
					   u64 root_objectid, u64 owner,
					   u64 offset, int refs_to_add)
{
	struct btrfs_key key;
	struct extent_buffer *leaf;
	u32 size;
	u32 num_refs;
	int ret;

	key.objectid = bytenr;
	if (parent) {
		key.type = BTRFS_SHARED_DATA_REF_KEY;
		key.offset = parent;
		size = sizeof(struct btrfs_shared_data_ref);
	} else {
		key.type = BTRFS_EXTENT_DATA_REF_KEY;
		key.offset = hash_extent_data_ref(root_objectid,
						  owner, offset);
		size = sizeof(struct btrfs_extent_data_ref);
	}

	ret = btrfs_insert_empty_item(trans, root, path, &key, size);
	if (ret && ret != -EEXIST)
		goto fail;

	leaf = path->nodes[0];
	if (parent) {
		struct btrfs_shared_data_ref *ref;
		ref = btrfs_item_ptr(leaf, path->slots[0],
				     struct btrfs_shared_data_ref);
		if (ret == 0) {
			btrfs_set_shared_data_ref_count(leaf, ref, refs_to_add);
		} else {
			num_refs = btrfs_shared_data_ref_count(leaf, ref);
			num_refs += refs_to_add;
			btrfs_set_shared_data_ref_count(leaf, ref, num_refs);
		}
	} else {
		struct btrfs_extent_data_ref *ref;
		while (ret == -EEXIST) {
			ref = btrfs_item_ptr(leaf, path->slots[0],
					     struct btrfs_extent_data_ref);
			if (match_extent_data_ref(leaf, ref, root_objectid,
						  owner, offset))
				break;
			btrfs_release_path(root, path);
			key.offset++;
			ret = btrfs_insert_empty_item(trans, root, path, &key,
						      size);
			if (ret && ret != -EEXIST)
				goto fail;

			leaf = path->nodes[0];
		}
		ref = btrfs_item_ptr(leaf, path->slots[0],
				     struct btrfs_extent_data_ref);
		if (ret == 0) {
			btrfs_set_extent_data_ref_root(leaf, ref,
						       root_objectid);
			btrfs_set_extent_data_ref_objectid(leaf, ref, owner);
			btrfs_set_extent_data_ref_offset(leaf, ref, offset);
			btrfs_set_extent_data_ref_count(leaf, ref, refs_to_add);
		} else {
			num_refs = btrfs_extent_data_ref_count(leaf, ref);
			num_refs += refs_to_add;
			btrfs_set_extent_data_ref_count(leaf, ref, num_refs);
		}
	}
	btrfs_mark_buffer_dirty(leaf);
	ret = 0;
fail:
	btrfs_release_path(root, path);
	return ret;
}

static noinline int remove_extent_data_ref(struct btrfs_trans_handle *trans,
					   struct btrfs_root *root,
					   struct btrfs_path *path,
					   int refs_to_drop)
{
	struct btrfs_key key;
	struct btrfs_extent_data_ref *ref1 = NULL;
	struct btrfs_shared_data_ref *ref2 = NULL;
	struct extent_buffer *leaf;
	u32 num_refs = 0;
	int ret = 0;

	leaf = path->nodes[0];
	btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);

	if (key.type == BTRFS_EXTENT_DATA_REF_KEY) {
		ref1 = btrfs_item_ptr(leaf, path->slots[0],
				      struct btrfs_extent_data_ref);
		num_refs = btrfs_extent_data_ref_count(leaf, ref1);
	} else if (key.type == BTRFS_SHARED_DATA_REF_KEY) {
		ref2 = btrfs_item_ptr(leaf, path->slots[0],
				      struct btrfs_shared_data_ref);
		num_refs = btrfs_shared_data_ref_count(leaf, ref2);
#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
	} else if (key.type == BTRFS_EXTENT_REF_V0_KEY) {
		struct btrfs_extent_ref_v0 *ref0;
		ref0 = btrfs_item_ptr(leaf, path->slots[0],
				      struct btrfs_extent_ref_v0);
		num_refs = btrfs_ref_count_v0(leaf, ref0);
#endif
	} else {
		BUG();
	}

	BUG_ON(num_refs < refs_to_drop);
	num_refs -= refs_to_drop;

	if (num_refs == 0) {
		ret = btrfs_del_item(trans, root, path);
	} else {
		if (key.type == BTRFS_EXTENT_DATA_REF_KEY)
			btrfs_set_extent_data_ref_count(leaf, ref1, num_refs);
		else if (key.type == BTRFS_SHARED_DATA_REF_KEY)
			btrfs_set_shared_data_ref_count(leaf, ref2, num_refs);
#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
		else {
			struct btrfs_extent_ref_v0 *ref0;
			ref0 = btrfs_item_ptr(leaf, path->slots[0],
					struct btrfs_extent_ref_v0);
			btrfs_set_ref_count_v0(leaf, ref0, num_refs);
		}
#endif
		btrfs_mark_buffer_dirty(leaf);
	}
	return ret;
}

static noinline u32 extent_data_ref_count(struct btrfs_root *root,
					  struct btrfs_path *path,
					  struct btrfs_extent_inline_ref *iref)
{
	struct btrfs_key key;
	struct extent_buffer *leaf;
	struct btrfs_extent_data_ref *ref1;
	struct btrfs_shared_data_ref *ref2;
	u32 num_refs = 0;

	leaf = path->nodes[0];
	btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
	if (iref) {
		if (btrfs_extent_inline_ref_type(leaf, iref) ==
		    BTRFS_EXTENT_DATA_REF_KEY) {
			ref1 = (struct btrfs_extent_data_ref *)(&iref->offset);
			num_refs = btrfs_extent_data_ref_count(leaf, ref1);
		} else {
			ref2 = (struct btrfs_shared_data_ref *)(iref + 1);
			num_refs = btrfs_shared_data_ref_count(leaf, ref2);
		}
	} else if (key.type == BTRFS_EXTENT_DATA_REF_KEY) {
		ref1 = btrfs_item_ptr(leaf, path->slots[0],
				      struct btrfs_extent_data_ref);
		num_refs = btrfs_extent_data_ref_count(leaf, ref1);
	} else if (key.type == BTRFS_SHARED_DATA_REF_KEY) {
		ref2 = btrfs_item_ptr(leaf, path->slots[0],
				      struct btrfs_shared_data_ref);
		num_refs = btrfs_shared_data_ref_count(leaf, ref2);
#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
	} else if (key.type == BTRFS_EXTENT_REF_V0_KEY) {
		struct btrfs_extent_ref_v0 *ref0;
		ref0 = btrfs_item_ptr(leaf, path->slots[0],
				      struct btrfs_extent_ref_v0);
		num_refs = btrfs_ref_count_v0(leaf, ref0);
#endif
	} else {
		WARN_ON(1);
	}
	return num_refs;
}

static noinline int lookup_tree_block_ref(struct btrfs_trans_handle *trans,
					  struct btrfs_root *root,
					  struct btrfs_path *path,
					  u64 bytenr, u64 parent,
					  u64 root_objectid)
{
	struct btrfs_key key;
	int ret;

	key.objectid = bytenr;
	if (parent) {
		key.type = BTRFS_SHARED_BLOCK_REF_KEY;
		key.offset = parent;
	} else {
		key.type = BTRFS_TREE_BLOCK_REF_KEY;
		key.offset = root_objectid;
	}

	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
	if (ret > 0)
		ret = -ENOENT;
#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
	if (ret == -ENOENT && parent) {
		btrfs_release_path(root, path);
		key.type = BTRFS_EXTENT_REF_V0_KEY;
		ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
		if (ret > 0)
			ret = -ENOENT;
	}
#endif
	return ret;
}

static noinline int insert_tree_block_ref(struct btrfs_trans_handle *trans,
					  struct btrfs_root *root,
					  struct btrfs_path *path,
					  u64 bytenr, u64 parent,
					  u64 root_objectid)
{
	struct btrfs_key key;
	int ret;

	key.objectid = bytenr;
	if (parent) {
		key.type = BTRFS_SHARED_BLOCK_REF_KEY;
		key.offset = parent;
	} else {
		key.type = BTRFS_TREE_BLOCK_REF_KEY;
		key.offset = root_objectid;
	}

	ret = btrfs_insert_empty_item(trans, root, path, &key, 0);
	btrfs_release_path(root, path);
	return ret;
}

static inline int extent_ref_type(u64 parent, u64 owner)
{
	int type;
	if (owner < BTRFS_FIRST_FREE_OBJECTID) {
		if (parent > 0)
			type = BTRFS_SHARED_BLOCK_REF_KEY;
		else
			type = BTRFS_TREE_BLOCK_REF_KEY;
	} else {
		if (parent > 0)
			type = BTRFS_SHARED_DATA_REF_KEY;
		else
			type = BTRFS_EXTENT_DATA_REF_KEY;
	}
	return type;
}

static int find_next_key(struct btrfs_path *path, int level,
			 struct btrfs_key *key)
{
	for (; level < BTRFS_MAX_LEVEL; level++) {
		if (!path->nodes[level])
			break;
		if (path->slots[level] + 1 >=
		    btrfs_header_nritems(path->nodes[level]))
			continue;
		if (level == 0)
			btrfs_item_key_to_cpu(path->nodes[level], key,
					      path->slots[level] + 1);
		else
			btrfs_node_key_to_cpu(path->nodes[level], key,
					      path->slots[level] + 1);
		return 0;
	}
	return 1;
}

/*
 * look for an inline back ref. if the back ref is found, *ref_ret is set
 * to the address of the inline back ref, and 0 is returned.
 *
 * if the back ref isn't found, *ref_ret is set to the address where it
 * should be inserted, and -ENOENT is returned.
 *
 * if insert is true and there are too many inline back refs, the path
 * points to the extent item, and -EAGAIN is returned.
 *
 * NOTE: inline back refs are ordered in the same way that back ref
 * items in the tree are ordered.
 */
static noinline_for_stack
int lookup_inline_extent_backref(struct btrfs_trans_handle *trans,
				 struct btrfs_root *root,
				 struct btrfs_path *path,
				 struct btrfs_extent_inline_ref **ref_ret,
				 u64 bytenr, u64 num_bytes,
				 u64 parent, u64 root_objectid,
				 u64 owner, u64 offset, int insert)
{
	struct btrfs_key key;
	struct extent_buffer *leaf;
	struct btrfs_extent_item *ei;
	struct btrfs_extent_inline_ref *iref;
	u64 flags;
	u64 item_size;
	unsigned long ptr;
	unsigned long end;
	int extra_size;
	int type;
	int want;
	int ret;
	int err = 0;

	key.objectid = bytenr;
	key.type = BTRFS_EXTENT_ITEM_KEY;
	key.offset = num_bytes;

	want = extent_ref_type(parent, owner);
	if (insert) {
		extra_size = btrfs_extent_inline_ref_size(want);
		path->keep_locks = 1;
	} else
		extra_size = -1;
	ret = btrfs_search_slot(trans, root, &key, path, extra_size, 1);
	if (ret < 0) {
		err = ret;
		goto out;
	}
	BUG_ON(ret);

	leaf = path->nodes[0];
	item_size = btrfs_item_size_nr(leaf, path->slots[0]);
#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
	if (item_size < sizeof(*ei)) {
		if (!insert) {
			err = -ENOENT;
			goto out;
		}
		ret = convert_extent_item_v0(trans, root, path, owner,
					     extra_size);
		if (ret < 0) {
			err = ret;
			goto out;
		}
		leaf = path->nodes[0];
		item_size = btrfs_item_size_nr(leaf, path->slots[0]);
	}
#endif
	BUG_ON(item_size < sizeof(*ei));

	ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
	flags = btrfs_extent_flags(leaf, ei);

	ptr = (unsigned long)(ei + 1);
	end = (unsigned long)ei + item_size;

	if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
		ptr += sizeof(struct btrfs_tree_block_info);
		BUG_ON(ptr > end);
	} else {
		BUG_ON(!(flags & BTRFS_EXTENT_FLAG_DATA));
	}

	err = -ENOENT;
	while (1) {
		if (ptr >= end) {
			WARN_ON(ptr > end);
			break;
		}
		iref = (struct btrfs_extent_inline_ref *)ptr;
		type = btrfs_extent_inline_ref_type(leaf, iref);
		if (want < type)
			break;
		if (want > type) {
			ptr += btrfs_extent_inline_ref_size(type);
			continue;
		}

		if (type == BTRFS_EXTENT_DATA_REF_KEY) {
			struct btrfs_extent_data_ref *dref;
			dref = (struct btrfs_extent_data_ref *)(&iref->offset);
			if (match_extent_data_ref(leaf, dref, root_objectid,
						  owner, offset)) {
				err = 0;
				break;
			}
			if (hash_extent_data_ref_item(leaf, dref) <
			    hash_extent_data_ref(root_objectid, owner, offset))
				break;
		} else {
			u64 ref_offset;
			ref_offset = btrfs_extent_inline_ref_offset(leaf, iref);
			if (parent > 0) {
				if (parent == ref_offset) {
					err = 0;
					break;
				}
				if (ref_offset < parent)
					break;
			} else {
				if (root_objectid == ref_offset) {
					err = 0;
					break;
				}
				if (ref_offset < root_objectid)
					break;
			}
		}
		ptr += btrfs_extent_inline_ref_size(type);
	}
	if (err == -ENOENT && insert) {
		if (item_size + extra_size >=
		    BTRFS_MAX_EXTENT_ITEM_SIZE(root)) {
			err = -EAGAIN;
			goto out;
		}
		/*
		 * To add new inline back ref, we have to make sure
		 * there is no corresponding back ref item.
		 * For simplicity, we just do not add new inline back
		 * ref if there is any kind of item for this block
		 */
		if (find_next_key(path, 0, &key) == 0 &&
		    key.objectid == bytenr &&
		    key.type < BTRFS_BLOCK_GROUP_ITEM_KEY) {
			err = -EAGAIN;
			goto out;
		}
	}
	*ref_ret = (struct btrfs_extent_inline_ref *)ptr;
out:
	if (insert) {
		path->keep_locks = 0;
		btrfs_unlock_up_safe(path, 1);
	}
	return err;
}

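/*
 * A condensed sketch of the calling convention documented above, as
 * exercised by insert_inline_extent_backref() and
 * __btrfs_inc_extent_ref() below:
 *
 *	ret = lookup_inline_extent_backref(trans, root, path, &iref,
 *					   bytenr, num_bytes, parent,
 *					   root_objectid, owner, offset, 1);
 *	if (ret == 0)
 *		update_inline_extent_backref(...);   (existing ref found)
 *	else if (ret == -ENOENT)
 *		setup_inline_extent_backref(...);    (insert at *ref_ret)
 *
 * On -EAGAIN the back ref no longer fits inline and is inserted as a
 * separate keyed item instead.
 */
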
/*
 * helper to add new inline back ref
 */
static noinline_for_stack
int setup_inline_extent_backref(struct btrfs_trans_handle *trans,
				struct btrfs_root *root,
				struct btrfs_path *path,
				struct btrfs_extent_inline_ref *iref,
				u64 parent, u64 root_objectid,
				u64 owner, u64 offset, int refs_to_add,
				struct btrfs_delayed_extent_op *extent_op)
{
	struct extent_buffer *leaf;
	struct btrfs_extent_item *ei;
	unsigned long ptr;
	unsigned long end;
	unsigned long item_offset;
	u64 refs;
	int size;
	int type;
	int ret;

	leaf = path->nodes[0];
	ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
	item_offset = (unsigned long)iref - (unsigned long)ei;

	type = extent_ref_type(parent, owner);
	size = btrfs_extent_inline_ref_size(type);

	ret = btrfs_extend_item(trans, root, path, size);
	BUG_ON(ret);

	ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
	refs = btrfs_extent_refs(leaf, ei);
	refs += refs_to_add;
	btrfs_set_extent_refs(leaf, ei, refs);
	if (extent_op)
		__run_delayed_extent_op(extent_op, leaf, ei);

	ptr = (unsigned long)ei + item_offset;
	end = (unsigned long)ei + btrfs_item_size_nr(leaf, path->slots[0]);
	if (ptr < end - size)
		memmove_extent_buffer(leaf, ptr + size, ptr,
				      end - size - ptr);

	iref = (struct btrfs_extent_inline_ref *)ptr;
	btrfs_set_extent_inline_ref_type(leaf, iref, type);
	if (type == BTRFS_EXTENT_DATA_REF_KEY) {
		struct btrfs_extent_data_ref *dref;
		dref = (struct btrfs_extent_data_ref *)(&iref->offset);
		btrfs_set_extent_data_ref_root(leaf, dref, root_objectid);
		btrfs_set_extent_data_ref_objectid(leaf, dref, owner);
		btrfs_set_extent_data_ref_offset(leaf, dref, offset);
		btrfs_set_extent_data_ref_count(leaf, dref, refs_to_add);
	} else if (type == BTRFS_SHARED_DATA_REF_KEY) {
		struct btrfs_shared_data_ref *sref;
		sref = (struct btrfs_shared_data_ref *)(iref + 1);
		btrfs_set_shared_data_ref_count(leaf, sref, refs_to_add);
		btrfs_set_extent_inline_ref_offset(leaf, iref, parent);
	} else if (type == BTRFS_SHARED_BLOCK_REF_KEY) {
		btrfs_set_extent_inline_ref_offset(leaf, iref, parent);
	} else {
		btrfs_set_extent_inline_ref_offset(leaf, iref, root_objectid);
	}
	btrfs_mark_buffer_dirty(leaf);
	return 0;
}

static int lookup_extent_backref(struct btrfs_trans_handle *trans,
				 struct btrfs_root *root,
				 struct btrfs_path *path,
				 struct btrfs_extent_inline_ref **ref_ret,
				 u64 bytenr, u64 num_bytes, u64 parent,
				 u64 root_objectid, u64 owner, u64 offset)
{
	int ret;

	ret = lookup_inline_extent_backref(trans, root, path, ref_ret,
					   bytenr, num_bytes, parent,
					   root_objectid, owner, offset, 0);
	if (ret != -ENOENT)
		return ret;

	btrfs_release_path(root, path);
	*ref_ret = NULL;

	if (owner < BTRFS_FIRST_FREE_OBJECTID) {
		ret = lookup_tree_block_ref(trans, root, path, bytenr, parent,
					    root_objectid);
	} else {
		ret = lookup_extent_data_ref(trans, root, path, bytenr, parent,
					     root_objectid, owner, offset);
	}
	return ret;
}

/*
 * helper to update/remove inline back ref
 */
static noinline_for_stack
int update_inline_extent_backref(struct btrfs_trans_handle *trans,
				 struct btrfs_root *root,
				 struct btrfs_path *path,
				 struct btrfs_extent_inline_ref *iref,
				 int refs_to_mod,
				 struct btrfs_delayed_extent_op *extent_op)
{
	struct extent_buffer *leaf;
	struct btrfs_extent_item *ei;
	struct btrfs_extent_data_ref *dref = NULL;
	struct btrfs_shared_data_ref *sref = NULL;
	unsigned long ptr;
	unsigned long end;
	u32 item_size;
	int size;
	int type;
	int ret;
	u64 refs;

	leaf = path->nodes[0];
	ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
	refs = btrfs_extent_refs(leaf, ei);
	WARN_ON(refs_to_mod < 0 && refs + refs_to_mod <= 0);
	refs += refs_to_mod;
	btrfs_set_extent_refs(leaf, ei, refs);
	if (extent_op)
		__run_delayed_extent_op(extent_op, leaf, ei);

	type = btrfs_extent_inline_ref_type(leaf, iref);

	if (type == BTRFS_EXTENT_DATA_REF_KEY) {
		dref = (struct btrfs_extent_data_ref *)(&iref->offset);
		refs = btrfs_extent_data_ref_count(leaf, dref);
	} else if (type == BTRFS_SHARED_DATA_REF_KEY) {
		sref = (struct btrfs_shared_data_ref *)(iref + 1);
		refs = btrfs_shared_data_ref_count(leaf, sref);
	} else {
		refs = 1;
		BUG_ON(refs_to_mod != -1);
	}

	BUG_ON(refs_to_mod < 0 && refs < -refs_to_mod);
	refs += refs_to_mod;

	if (refs > 0) {
		if (type == BTRFS_EXTENT_DATA_REF_KEY)
			btrfs_set_extent_data_ref_count(leaf, dref, refs);
		else
			btrfs_set_shared_data_ref_count(leaf, sref, refs);
	} else {
		size = btrfs_extent_inline_ref_size(type);
		item_size = btrfs_item_size_nr(leaf, path->slots[0]);
		ptr = (unsigned long)iref;
		end = (unsigned long)ei + item_size;
		if (ptr + size < end)
			memmove_extent_buffer(leaf, ptr, ptr + size,
					      end - ptr - size);
		item_size -= size;
		ret = btrfs_truncate_item(trans, root, path, item_size, 1);
		BUG_ON(ret);
	}
	btrfs_mark_buffer_dirty(leaf);
	return 0;
}

static noinline_for_stack
int insert_inline_extent_backref(struct btrfs_trans_handle *trans,
				 struct btrfs_root *root,
				 struct btrfs_path *path,
				 u64 bytenr, u64 num_bytes, u64 parent,
				 u64 root_objectid, u64 owner,
				 u64 offset, int refs_to_add,
				 struct btrfs_delayed_extent_op *extent_op)
{
	struct btrfs_extent_inline_ref *iref;
	int ret;

	ret = lookup_inline_extent_backref(trans, root, path, &iref,
					   bytenr, num_bytes, parent,
					   root_objectid, owner, offset, 1);
	if (ret == 0) {
		BUG_ON(owner < BTRFS_FIRST_FREE_OBJECTID);
		ret = update_inline_extent_backref(trans, root, path, iref,
						   refs_to_add, extent_op);
	} else if (ret == -ENOENT) {
		ret = setup_inline_extent_backref(trans, root, path, iref,
						  parent, root_objectid,
						  owner, offset, refs_to_add,
						  extent_op);
	}
	return ret;
}

static int insert_extent_backref(struct btrfs_trans_handle *trans,
				 struct btrfs_root *root,
				 struct btrfs_path *path,
				 u64 bytenr, u64 parent, u64 root_objectid,
				 u64 owner, u64 offset, int refs_to_add)
{
	int ret;
	if (owner < BTRFS_FIRST_FREE_OBJECTID) {
		BUG_ON(refs_to_add != 1);
		ret = insert_tree_block_ref(trans, root, path, bytenr,
					    parent, root_objectid);
	} else {
		ret = insert_extent_data_ref(trans, root, path, bytenr,
					     parent, root_objectid,
					     owner, offset, refs_to_add);
	}
	return ret;
}

static int remove_extent_backref(struct btrfs_trans_handle *trans,
				 struct btrfs_root *root,
				 struct btrfs_path *path,
				 struct btrfs_extent_inline_ref *iref,
				 int refs_to_drop, int is_data)
{
	int ret;

	BUG_ON(!is_data && refs_to_drop != 1);
	if (iref) {
		ret = update_inline_extent_backref(trans, root, path, iref,
						   -refs_to_drop, NULL);
	} else if (is_data) {
		ret = remove_extent_data_ref(trans, root, path, refs_to_drop);
	} else {
		ret = btrfs_del_item(trans, root, path);
	}
	return ret;
}

static void btrfs_issue_discard(struct block_device *bdev,
				u64 start, u64 len)
{
	blkdev_issue_discard(bdev, start >> 9, len >> 9, GFP_KERNEL,
			     DISCARD_FL_BARRIER);
}

static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr,
				u64 num_bytes)
{
	int ret;
	u64 map_length = num_bytes;
	struct btrfs_multi_bio *multi = NULL;

	if (!btrfs_test_opt(root, DISCARD))
		return 0;

	/* Tell the block device(s) that the sectors can be discarded */
	ret = btrfs_map_block(&root->fs_info->mapping_tree, READ,
			      bytenr, &map_length, &multi, 0);
	if (!ret) {
		struct btrfs_bio_stripe *stripe = multi->stripes;
		int i;

		if (map_length > num_bytes)
			map_length = num_bytes;

		for (i = 0; i < multi->num_stripes; i++, stripe++) {
			btrfs_issue_discard(stripe->dev->bdev,
					    stripe->physical,
					    map_length);
		}
		kfree(multi);
	}

	return ret;
}

int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
			 struct btrfs_root *root,
			 u64 bytenr, u64 num_bytes, u64 parent,
			 u64 root_objectid, u64 owner, u64 offset)
{
	int ret;
	BUG_ON(owner < BTRFS_FIRST_FREE_OBJECTID &&
	       root_objectid == BTRFS_TREE_LOG_OBJECTID);

	if (owner < BTRFS_FIRST_FREE_OBJECTID) {
		ret = btrfs_add_delayed_tree_ref(trans, bytenr, num_bytes,
					parent, root_objectid, (int)owner,
					BTRFS_ADD_DELAYED_REF, NULL);
	} else {
		ret = btrfs_add_delayed_data_ref(trans, bytenr, num_bytes,
					parent, root_objectid, owner, offset,
					BTRFS_ADD_DELAYED_REF, NULL);
	}
	return ret;
}

static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
				  struct btrfs_root *root,
				  u64 bytenr, u64 num_bytes,
				  u64 parent, u64 root_objectid,
				  u64 owner, u64 offset, int refs_to_add,
				  struct btrfs_delayed_extent_op *extent_op)
{
	struct btrfs_path *path;
	struct extent_buffer *leaf;
	struct btrfs_extent_item *item;
	u64 refs;
	int ret;
	int err = 0;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	path->reada = 1;
	path->leave_spinning = 1;
	/* this will setup the path even if it fails to insert the back ref */
	ret = insert_inline_extent_backref(trans, root->fs_info->extent_root,
					   path, bytenr, num_bytes, parent,
					   root_objectid, owner, offset,
					   refs_to_add, extent_op);
	if (ret == 0)
		goto out;

	if (ret != -EAGAIN) {
		err = ret;
		goto out;
	}

	leaf = path->nodes[0];
	item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
	refs = btrfs_extent_refs(leaf, item);
	btrfs_set_extent_refs(leaf, item, refs + refs_to_add);
	if (extent_op)
		__run_delayed_extent_op(extent_op, leaf, item);

	btrfs_mark_buffer_dirty(leaf);
	btrfs_release_path(root->fs_info->extent_root, path);

	path->reada = 1;
	path->leave_spinning = 1;

	/* now insert the actual backref */
	ret = insert_extent_backref(trans, root->fs_info->extent_root,
				    path, bytenr, parent, root_objectid,
				    owner, offset, refs_to_add);
	BUG_ON(ret);
out:
	btrfs_free_path(path);
	return err;
}

static int run_delayed_data_ref(struct btrfs_trans_handle *trans,
				struct btrfs_root *root,
				struct btrfs_delayed_ref_node *node,
				struct btrfs_delayed_extent_op *extent_op,
				int insert_reserved)
{
	int ret = 0;
	struct btrfs_delayed_data_ref *ref;
	struct btrfs_key ins;
	u64 parent = 0;
	u64 ref_root = 0;
	u64 flags = 0;

	ins.objectid = node->bytenr;
	ins.offset = node->num_bytes;
	ins.type = BTRFS_EXTENT_ITEM_KEY;

	ref = btrfs_delayed_node_to_data_ref(node);
	if (node->type == BTRFS_SHARED_DATA_REF_KEY)
		parent = ref->parent;
	else
		ref_root = ref->root;

	if (node->action == BTRFS_ADD_DELAYED_REF && insert_reserved) {
		if (extent_op) {
			BUG_ON(extent_op->update_key);
			flags |= extent_op->flags_to_set;
		}
		ret = alloc_reserved_file_extent(trans, root,
						 parent, ref_root, flags,
						 ref->objectid, ref->offset,
						 &ins, node->ref_mod);
	} else if (node->action == BTRFS_ADD_DELAYED_REF) {
		ret = __btrfs_inc_extent_ref(trans, root, node->bytenr,
					     node->num_bytes, parent,
					     ref_root, ref->objectid,
					     ref->offset, node->ref_mod,
					     extent_op);
	} else if (node->action == BTRFS_DROP_DELAYED_REF) {
		ret = __btrfs_free_extent(trans, root, node->bytenr,
					  node->num_bytes, parent,
					  ref_root, ref->objectid,
					  ref->offset, node->ref_mod,
					  extent_op);
	} else {
		BUG();
	}
	return ret;
}

1756static void __run_delayed_extent_op(struct btrfs_delayed_extent_op *extent_op,
1757 struct extent_buffer *leaf,
1758 struct btrfs_extent_item *ei)
1759{
1760 u64 flags = btrfs_extent_flags(leaf, ei);
1761 if (extent_op->update_flags) {
1762 flags |= extent_op->flags_to_set;
1763 btrfs_set_extent_flags(leaf, ei, flags);
1764 }
1765
1766 if (extent_op->update_key) {
1767 struct btrfs_tree_block_info *bi;
1768 BUG_ON(!(flags & BTRFS_EXTENT_FLAG_TREE_BLOCK));
1769 bi = (struct btrfs_tree_block_info *)(ei + 1);
1770 btrfs_set_tree_block_key(leaf, bi, &extent_op->key);
1771 }
1772}
1773
1774static int run_delayed_extent_op(struct btrfs_trans_handle *trans,
1775 struct btrfs_root *root,
1776 struct btrfs_delayed_ref_node *node,
1777 struct btrfs_delayed_extent_op *extent_op)
1778{
1779 struct btrfs_key key;
1780 struct btrfs_path *path;
1781 struct btrfs_extent_item *ei;
1782 struct extent_buffer *leaf;
1783 u32 item_size;
1784	int ret;
1785 int err = 0;
1786
1787 path = btrfs_alloc_path();
1788 if (!path)
1789 return -ENOMEM;
1790
1791 key.objectid = node->bytenr;
1792 key.type = BTRFS_EXTENT_ITEM_KEY;
1793 key.offset = node->num_bytes;
1794
1795 path->reada = 1;
1796 path->leave_spinning = 1;
1797 ret = btrfs_search_slot(trans, root->fs_info->extent_root, &key,
1798 path, 0, 1);
1799 if (ret < 0) {
1800 err = ret;
1801 goto out;
1802 }
1803 if (ret > 0) {
1804 err = -EIO;
1805 goto out;
1806 }
1807
1808 leaf = path->nodes[0];
1809 item_size = btrfs_item_size_nr(leaf, path->slots[0]);
1810#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
1811 if (item_size < sizeof(*ei)) {
1812 ret = convert_extent_item_v0(trans, root->fs_info->extent_root,
1813 path, (u64)-1, 0);
1814 if (ret < 0) {
1815 err = ret;
1816 goto out;
1817 }
1818 leaf = path->nodes[0];
1819 item_size = btrfs_item_size_nr(leaf, path->slots[0]);
1820 }
1821#endif
1822 BUG_ON(item_size < sizeof(*ei));
1823 ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
1824 __run_delayed_extent_op(extent_op, leaf, ei);
1825
1826 btrfs_mark_buffer_dirty(leaf);
1827out:
1828 btrfs_free_path(path);
1829 return err;
1830}
1831
1832static int run_delayed_tree_ref(struct btrfs_trans_handle *trans,
1833 struct btrfs_root *root,
1834 struct btrfs_delayed_ref_node *node,
1835 struct btrfs_delayed_extent_op *extent_op,
1836 int insert_reserved)
1837{
1838 int ret = 0;
1839 struct btrfs_delayed_tree_ref *ref;
1840 struct btrfs_key ins;
1841 u64 parent = 0;
1842 u64 ref_root = 0;
1843
1844 ins.objectid = node->bytenr;
1845 ins.offset = node->num_bytes;
1846 ins.type = BTRFS_EXTENT_ITEM_KEY;
1847
1848 ref = btrfs_delayed_node_to_tree_ref(node);
1849 if (node->type == BTRFS_SHARED_BLOCK_REF_KEY)
1850 parent = ref->parent;
1851 else
1852 ref_root = ref->root;
1853
1854 BUG_ON(node->ref_mod != 1);
1855 if (node->action == BTRFS_ADD_DELAYED_REF && insert_reserved) {
1856 BUG_ON(!extent_op || !extent_op->update_flags ||
1857 !extent_op->update_key);
1858 ret = alloc_reserved_tree_block(trans, root,
1859 parent, ref_root,
1860 extent_op->flags_to_set,
1861 &extent_op->key,
1862 ref->level, &ins);
1863 } else if (node->action == BTRFS_ADD_DELAYED_REF) {
1864 ret = __btrfs_inc_extent_ref(trans, root, node->bytenr,
1865 node->num_bytes, parent, ref_root,
1866 ref->level, 0, 1, extent_op);
1867 } else if (node->action == BTRFS_DROP_DELAYED_REF) {
1868 ret = __btrfs_free_extent(trans, root, node->bytenr,
1869 node->num_bytes, parent, ref_root,
1870 ref->level, 0, 1, extent_op);
1871 } else {
1872 BUG();
1873 }
1874 return ret;
1875}
1876
1877
1878/* helper function to actually process a single delayed ref entry */
1879static int run_one_delayed_ref(struct btrfs_trans_handle *trans,
1880 struct btrfs_root *root,
1881 struct btrfs_delayed_ref_node *node,
1882 struct btrfs_delayed_extent_op *extent_op,
1883 int insert_reserved)
1884{
1885 int ret;
1886	if (btrfs_delayed_ref_is_head(node)) {
1887 struct btrfs_delayed_ref_head *head;
1888 /*
1889 * we've hit the end of the chain and we were supposed
1890	 * to insert this extent into the tree. But it got
1891 * deleted before we ever needed to insert it, so all
1892 * we have to do is clean up the accounting
1893 */
1894 BUG_ON(extent_op);
1895 head = btrfs_delayed_node_to_head(node);
1896	if (insert_reserved) {
1897 int mark_free = 0;
1898 struct extent_buffer *must_clean = NULL;
1899
1900 ret = pin_down_bytes(trans, root, NULL,
1901 node->bytenr, node->num_bytes,
1902 head->is_data, 1, &must_clean);
1903 if (ret > 0)
1904 mark_free = 1;
1905
1906 if (must_clean) {
1907 clean_tree_block(NULL, root, must_clean);
1908 btrfs_tree_unlock(must_clean);
1909 free_extent_buffer(must_clean);
1910 }
1911 if (head->is_data) {
1912 ret = btrfs_del_csums(trans, root,
1913 node->bytenr,
1914 node->num_bytes);
1915 BUG_ON(ret);
1916 }
1917 if (mark_free) {
1918 ret = btrfs_free_reserved_extent(root,
1919 node->bytenr,
1920 node->num_bytes);
1921 BUG_ON(ret);
1922 }
1923	}
1924 mutex_unlock(&head->mutex);
1925 return 0;
1926 }
1927
1928 if (node->type == BTRFS_TREE_BLOCK_REF_KEY ||
1929 node->type == BTRFS_SHARED_BLOCK_REF_KEY)
1930 ret = run_delayed_tree_ref(trans, root, node, extent_op,
1931 insert_reserved);
1932 else if (node->type == BTRFS_EXTENT_DATA_REF_KEY ||
1933 node->type == BTRFS_SHARED_DATA_REF_KEY)
1934 ret = run_delayed_data_ref(trans, root, node, extent_op,
1935 insert_reserved);
1936 else
1937 BUG();
1938 return ret;
1939}
1940
1941static noinline struct btrfs_delayed_ref_node *
1942select_delayed_ref(struct btrfs_delayed_ref_head *head)
1943{
1944 struct rb_node *node;
1945 struct btrfs_delayed_ref_node *ref;
1946 int action = BTRFS_ADD_DELAYED_REF;
1947again:
1948	/*
1949	 * select delayed refs of type BTRFS_ADD_DELAYED_REF first.
1950	 * this prevents the ref count from going down to zero while
1951	 * there are still pending delayed refs.
1952	 */
1953 node = rb_prev(&head->node.rb_node);
1954 while (1) {
1955 if (!node)
1956 break;
1957 ref = rb_entry(node, struct btrfs_delayed_ref_node,
1958 rb_node);
1959 if (ref->bytenr != head->node.bytenr)
1960 break;
1961	if (ref->action == action)
1962 return ref;
1963 node = rb_prev(node);
1964 }
1965 if (action == BTRFS_ADD_DELAYED_REF) {
1966 action = BTRFS_DROP_DELAYED_REF;
1967 goto again;
1968 }
1969 return NULL;
1970}
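/*
 * Editor's sketch (not part of the original file): the ADD-before-DROP
 * policy above, restated over a plain array instead of the rbtree so the
 * two-pass structure is easier to see.  Every name below is hypothetical.
 */
enum sketch_action { SKETCH_ADD_REF, SKETCH_DROP_REF };

struct sketch_ref {
	enum sketch_action action;
};

static struct sketch_ref *sketch_select(struct sketch_ref *refs,
					unsigned long nr)
{
	enum sketch_action want = SKETCH_ADD_REF;
	unsigned long i;
again:
	/* first pass looks only for ADD refs ... */
	for (i = 0; i < nr; i++) {
		if (refs[i].action == want)
			return &refs[i];
	}
	/* ... and only when none exist do we fall back to DROP refs */
	if (want == SKETCH_ADD_REF) {
		want = SKETCH_DROP_REF;
		goto again;
	}
	return NULL;
}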
1971
1972static noinline int run_clustered_refs(struct btrfs_trans_handle *trans,
1973 struct btrfs_root *root,
1974 struct list_head *cluster)
1975{
1976 struct btrfs_delayed_ref_root *delayed_refs;
1977 struct btrfs_delayed_ref_node *ref;
1978 struct btrfs_delayed_ref_head *locked_ref = NULL;
1979	struct btrfs_delayed_extent_op *extent_op;
1980	int ret;
1981	int count = 0;
1982	int must_insert_reserved = 0;
1983
1984 delayed_refs = &trans->transaction->delayed_refs;
1985 while (1) {
1986 if (!locked_ref) {
1987 /* pick a new head ref from the cluster list */
1988 if (list_empty(cluster))
1989	break;
1990
1991 locked_ref = list_entry(cluster->next,
1992 struct btrfs_delayed_ref_head, cluster);
1993
1994 /* grab the lock that says we are going to process
1995 * all the refs for this head */
1996 ret = btrfs_delayed_ref_lock(trans, locked_ref);
1997
1998 /*
1999 * we may have dropped the spin lock to get the head
2000 * mutex lock, and that might have given someone else
2001 * time to free the head. If that's true, it has been
2002 * removed from our list and we can move on.
2003 */
2004 if (ret == -EAGAIN) {
2005 locked_ref = NULL;
2006 count++;
2007 continue;
2008 }
2009 }
2010
2011 /*
2012 * record the must insert reserved flag before we
2013 * drop the spin lock.
2014 */
2015 must_insert_reserved = locked_ref->must_insert_reserved;
2016 locked_ref->must_insert_reserved = 0;
2017
2018 extent_op = locked_ref->extent_op;
2019 locked_ref->extent_op = NULL;
2020
2021 /*
2022 * locked_ref is the head node, so we have to go one
2023 * node back for any delayed ref updates
2024 */
2025 ref = select_delayed_ref(locked_ref);
2026 if (!ref) {
2027	/* All delayed refs have been processed. Go ahead
2028 * and send the head node to run_one_delayed_ref,
2029 * so that any accounting fixes can happen
2030 */
2031 ref = &locked_ref->node;
2032
2033 if (extent_op && must_insert_reserved) {
2034 kfree(extent_op);
2035 extent_op = NULL;
2036 }
2037
2038 if (extent_op) {
2039 spin_unlock(&delayed_refs->lock);
2040
2041 ret = run_delayed_extent_op(trans, root,
2042 ref, extent_op);
2043 BUG_ON(ret);
2044 kfree(extent_op);
2045
2046 cond_resched();
2047 spin_lock(&delayed_refs->lock);
2048 continue;
2049 }
2050
2051	list_del_init(&locked_ref->cluster);
2052 locked_ref = NULL;
2053 }
2054
2055 ref->in_tree = 0;
2056 rb_erase(&ref->rb_node, &delayed_refs->root);
2057 delayed_refs->num_entries--;
2058
2059	spin_unlock(&delayed_refs->lock);
2060
2061	ret = run_one_delayed_ref(trans, root, ref, extent_op,
2062 must_insert_reserved);
2063 BUG_ON(ret);
2064
2065 btrfs_put_delayed_ref(ref);
2066 kfree(extent_op);
2067	count++;
2068
2069 cond_resched();
2070 spin_lock(&delayed_refs->lock);
2071 }
2072 return count;
2073}
2074
2075/*
2076 * this starts processing the delayed reference count updates and
2077 * extent insertions we have queued up so far. count can be
2078 * 0, which means to process everything in the tree at the start
2079 * of the run (but not newly added entries), or it can be some target
2080 * number you'd like to process.
2081 */
2082int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
2083 struct btrfs_root *root, unsigned long count)
2084{
2085 struct rb_node *node;
2086 struct btrfs_delayed_ref_root *delayed_refs;
2087 struct btrfs_delayed_ref_node *ref;
2088 struct list_head cluster;
2089 int ret;
2090 int run_all = count == (unsigned long)-1;
2091 int run_most = 0;
2092
2093 if (root == root->fs_info->extent_root)
2094 root = root->fs_info->tree_root;
2095
2096 delayed_refs = &trans->transaction->delayed_refs;
2097 INIT_LIST_HEAD(&cluster);
2098again:
2099 spin_lock(&delayed_refs->lock);
2100 if (count == 0) {
2101 count = delayed_refs->num_entries * 2;
2102 run_most = 1;
2103 }
2104 while (1) {
2105 if (!(run_all || run_most) &&
2106 delayed_refs->num_heads_ready < 64)
2107 break;
2108
2109	/*
2110 * go find something we can process in the rbtree. We start at
2111 * the beginning of the tree, and then build a cluster
2112 * of refs to process starting at the first one we are able to
2113 * lock
2114	 */
2115 ret = btrfs_find_ref_cluster(trans, &cluster,
2116 delayed_refs->run_delayed_start);
2117 if (ret)
2118 break;
2119
2120 ret = run_clustered_refs(trans, root, &cluster);
2121 BUG_ON(ret < 0);
2122
2123 count -= min_t(unsigned long, ret, count);
2124
2125 if (count == 0)
2126 break;
2127	}
2128
2129	if (run_all) {
2130	node = rb_first(&delayed_refs->root);
2131	if (!node)
2132	goto out;
2133	count = (unsigned long)-1;
2134
2135 while (node) {
2136 ref = rb_entry(node, struct btrfs_delayed_ref_node,
2137 rb_node);
2138 if (btrfs_delayed_ref_is_head(ref)) {
2139 struct btrfs_delayed_ref_head *head;
2140
2141 head = btrfs_delayed_node_to_head(ref);
2142 atomic_inc(&ref->refs);
2143
2144 spin_unlock(&delayed_refs->lock);
2145 mutex_lock(&head->mutex);
2146 mutex_unlock(&head->mutex);
2147
2148 btrfs_put_delayed_ref(ref);
2149	cond_resched();
2150 goto again;
2151 }
2152 node = rb_next(node);
2153 }
2154 spin_unlock(&delayed_refs->lock);
2155 schedule_timeout(1);
2156 goto again;
2157	}
2158out:
2159	spin_unlock(&delayed_refs->lock);
2160 return 0;
2161}
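/*
 * Editor's note: a hedged usage sketch of the count semantics documented
 * above (assuming a valid trans/root pair):
 *
 *	btrfs_run_delayed_refs(trans, root, 0);     process what was queued
 *						    at the start of the run
 *	btrfs_run_delayed_refs(trans, root, 64);    process roughly 64 entries
 *	btrfs_run_delayed_refs(trans, root, (unsigned long)-1);
 *						    drain everything, as done
 *						    at commit time
 */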
2162
2163int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans,
2164 struct btrfs_root *root,
2165 u64 bytenr, u64 num_bytes, u64 flags,
2166 int is_data)
2167{
2168 struct btrfs_delayed_extent_op *extent_op;
2169 int ret;
2170
2171 extent_op = kmalloc(sizeof(*extent_op), GFP_NOFS);
2172 if (!extent_op)
2173 return -ENOMEM;
2174
2175 extent_op->flags_to_set = flags;
2176 extent_op->update_flags = 1;
2177 extent_op->update_key = 0;
2178 extent_op->is_data = is_data ? 1 : 0;
2179
2180 ret = btrfs_add_delayed_extent_op(trans, bytenr, num_bytes, extent_op);
2181 if (ret)
2182 kfree(extent_op);
2183 return ret;
2184}
2185
2186static noinline int check_delayed_ref(struct btrfs_trans_handle *trans,
2187 struct btrfs_root *root,
2188 struct btrfs_path *path,
2189 u64 objectid, u64 offset, u64 bytenr)
2190{
2191 struct btrfs_delayed_ref_head *head;
2192 struct btrfs_delayed_ref_node *ref;
2193 struct btrfs_delayed_data_ref *data_ref;
2194 struct btrfs_delayed_ref_root *delayed_refs;
2195 struct rb_node *node;
2196 int ret = 0;
2197
2198 ret = -ENOENT;
2199 delayed_refs = &trans->transaction->delayed_refs;
2200 spin_lock(&delayed_refs->lock);
2201 head = btrfs_find_delayed_ref_head(trans, bytenr);
2202 if (!head)
2203 goto out;
2204
2205 if (!mutex_trylock(&head->mutex)) {
2206 atomic_inc(&head->node.refs);
2207 spin_unlock(&delayed_refs->lock);
2208
2209 btrfs_release_path(root->fs_info->extent_root, path);
2210
2211 mutex_lock(&head->mutex);
2212 mutex_unlock(&head->mutex);
2213 btrfs_put_delayed_ref(&head->node);
2214 return -EAGAIN;
2215 }
2216
2217 node = rb_prev(&head->node.rb_node);
2218 if (!node)
2219 goto out_unlock;
2220
2221 ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node);
2222
2223 if (ref->bytenr != bytenr)
2224 goto out_unlock;
2225
2226 ret = 1;
2227 if (ref->type != BTRFS_EXTENT_DATA_REF_KEY)
2228 goto out_unlock;
2229
2230 data_ref = btrfs_delayed_node_to_data_ref(ref);
2231
2232 node = rb_prev(node);
2233 if (node) {
2234 ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node);
2235 if (ref->bytenr == bytenr)
2236 goto out_unlock;
2237 }
2238
2239 if (data_ref->root != root->root_key.objectid ||
2240 data_ref->objectid != objectid || data_ref->offset != offset)
2241 goto out_unlock;
2242
2243 ret = 0;
2244out_unlock:
2245 mutex_unlock(&head->mutex);
2246out:
2247 spin_unlock(&delayed_refs->lock);
2248 return ret;
2249}
2250
2251static noinline int check_committed_ref(struct btrfs_trans_handle *trans,
2252 struct btrfs_root *root,
2253 struct btrfs_path *path,
2254 u64 objectid, u64 offset, u64 bytenr)
2255{
2256 struct btrfs_root *extent_root = root->fs_info->extent_root;
2257	struct extent_buffer *leaf;
2258 struct btrfs_extent_data_ref *ref;
2259 struct btrfs_extent_inline_ref *iref;
2260 struct btrfs_extent_item *ei;
2261	struct btrfs_key key;
2262	u32 item_size;
2263	int ret;
2264
2265	key.objectid = bytenr;
2266	key.offset = (u64)-1;
2267	key.type = BTRFS_EXTENT_ITEM_KEY;
2268
2269 ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
2270 if (ret < 0)
2271 goto out;
2272 BUG_ON(ret == 0);
2273
2274 ret = -ENOENT;
2275 if (path->slots[0] == 0)
2276	goto out;
2277
2278	path->slots[0]--;
2279	leaf = path->nodes[0];
2280	btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
2281
2282	if (key.objectid != bytenr || key.type != BTRFS_EXTENT_ITEM_KEY)
2283	goto out;
2284
2285 ret = 1;
2286 item_size = btrfs_item_size_nr(leaf, path->slots[0]);
2287#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
2288 if (item_size < sizeof(*ei)) {
2289 WARN_ON(item_size != sizeof(struct btrfs_extent_item_v0));
2290 goto out;
2291 }
2292#endif
2293 ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
2294
2295 if (item_size != sizeof(*ei) +
2296 btrfs_extent_inline_ref_size(BTRFS_EXTENT_DATA_REF_KEY))
2297 goto out;
2298
2299 if (btrfs_extent_generation(leaf, ei) <=
2300 btrfs_root_last_snapshot(&root->root_item))
2301 goto out;
2302
2303 iref = (struct btrfs_extent_inline_ref *)(ei + 1);
2304 if (btrfs_extent_inline_ref_type(leaf, iref) !=
2305 BTRFS_EXTENT_DATA_REF_KEY)
2306 goto out;
2307
2308 ref = (struct btrfs_extent_data_ref *)(&iref->offset);
2309 if (btrfs_extent_refs(leaf, ei) !=
2310 btrfs_extent_data_ref_count(leaf, ref) ||
2311 btrfs_extent_data_ref_root(leaf, ref) !=
2312 root->root_key.objectid ||
2313 btrfs_extent_data_ref_objectid(leaf, ref) != objectid ||
2314 btrfs_extent_data_ref_offset(leaf, ref) != offset)
2315 goto out;
2316
2317 ret = 0;
2318out:
2319 return ret;
2320}
2321
2322int btrfs_cross_ref_exist(struct btrfs_trans_handle *trans,
2323 struct btrfs_root *root,
2324 u64 objectid, u64 offset, u64 bytenr)
2325{
2326 struct btrfs_path *path;
2327 int ret;
2328 int ret2;
2329
2330 path = btrfs_alloc_path();
2331 if (!path)
2332 return -ENOENT;
2333
2334 do {
2335 ret = check_committed_ref(trans, root, path, objectid,
2336 offset, bytenr);
2337 if (ret && ret != -ENOENT)
2338	goto out;
2339
2340 ret2 = check_delayed_ref(trans, root, path, objectid,
2341 offset, bytenr);
2342 } while (ret2 == -EAGAIN);
2343
2344 if (ret2 && ret2 != -ENOENT) {
2345 ret = ret2;
2346 goto out;
2347	}
2348
2349 if (ret != -ENOENT || ret2 != -ENOENT)
2350 ret = 0;
2351out:
2352	btrfs_free_path(path);
2353	return ret;
2354}
2355
2356#if 0
2357int btrfs_cache_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
2358 struct extent_buffer *buf, u32 nr_extents)
2359{
2360	struct btrfs_key key;
2361	struct btrfs_file_extent_item *fi;
2362 u64 root_gen;
2363 u32 nritems;
2364	int i;
2365	int level;
2366	int ret = 0;
2367	int shared = 0;
2368
2369	if (!root->ref_cows)
2370	return 0;
2371
2372 if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) {
2373 shared = 0;
2374 root_gen = root->root_key.offset;
2375 } else {
2376 shared = 1;
2377 root_gen = trans->transid - 1;
2378 }
2379
2380	level = btrfs_header_level(buf);
2381	nritems = btrfs_header_nritems(buf);
2382
2383	if (level == 0) {
2384 struct btrfs_leaf_ref *ref;
2385 struct btrfs_extent_info *info;
2386
2387	ref = btrfs_alloc_leaf_ref(root, nr_extents);
2388	if (!ref) {
2389	ret = -ENOMEM;
2390 goto out;
2391 }
2392
2393	ref->root_gen = root_gen;
2394 ref->bytenr = buf->start;
2395 ref->owner = btrfs_header_owner(buf);
2396 ref->generation = btrfs_header_generation(buf);
2397	ref->nritems = nr_extents;
2398	info = ref->extents;
2399
2400	for (i = 0; nr_extents > 0 && i < nritems; i++) {
2401 u64 disk_bytenr;
2402 btrfs_item_key_to_cpu(buf, &key, i);
2403 if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY)
2404 continue;
2405 fi = btrfs_item_ptr(buf, i,
2406 struct btrfs_file_extent_item);
2407 if (btrfs_file_extent_type(buf, fi) ==
2408 BTRFS_FILE_EXTENT_INLINE)
2409 continue;
2410 disk_bytenr = btrfs_file_extent_disk_bytenr(buf, fi);
2411 if (disk_bytenr == 0)
2412 continue;
2413
2414 info->bytenr = disk_bytenr;
2415 info->num_bytes =
2416 btrfs_file_extent_disk_num_bytes(buf, fi);
2417 info->objectid = key.objectid;
2418 info->offset = key.offset;
2419 info++;
2420 }
2421
2422	ret = btrfs_add_leaf_ref(root, ref, shared);
2423 if (ret == -EEXIST && shared) {
2424 struct btrfs_leaf_ref *old;
2425 old = btrfs_lookup_leaf_ref(root, ref->bytenr);
2426 BUG_ON(!old);
2427 btrfs_remove_leaf_ref(root, old);
2428 btrfs_free_leaf_ref(root, old);
2429 ret = btrfs_add_leaf_ref(root, ref, shared);
2430 }
2431	WARN_ON(ret);
2432	btrfs_free_leaf_ref(root, ref);
2433 }
2434out:
2435 return ret;
2436}
2437
2438/* when a block goes through cow, we update the reference counts of
2439 * everything that block points to. The internal pointers of the block
2440 * can be in just about any order, and it is likely to have clusters of
2441 * things that are close together and clusters of things that are not.
2442 *
2443 * To help reduce the seeks that come with updating all of these reference
2444 * counts, sort them by byte number before actual updates are done.
2445 *
2446 * struct refsort is used to match byte number to slot in the btree block.
2447 * we sort based on the byte number and then use the slot to actually
2448 * find the item.
2449 *
2450 * struct refsort is smaller than struct btrfs_item and smaller than
2451 * struct btrfs_key_ptr. Since we're currently limited to the page size
2452 * for a btree block, there's no way for a kmalloc of refsorts for a
2453 * single node to be bigger than a page.
2454 */
2455struct refsort {
2456 u64 bytenr;
2457 u32 slot;
2458};
2459
2460/*
2461 * for passing into sort()
2462 */
2463static int refsort_cmp(const void *a_void, const void *b_void)
2464{
2465 const struct refsort *a = a_void;
2466 const struct refsort *b = b_void;
2467
2468 if (a->bytenr < b->bytenr)
2469 return -1;
2470 if (a->bytenr > b->bytenr)
2471 return 1;
2472 return 0;
2473}
2474#endif
2475
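/*
 * Editor's sketch: if the refsort block above were re-enabled, the array
 * would be ordered with the kernel's sort() from <linux/sort.h> before the
 * refs are walked in bytenr order.  sketch_sort_refs() is hypothetical and
 * only compiles together with the #if 0 region above.
 */
static void sketch_sort_refs(struct refsort *sorted, unsigned long nr)
{
	/* a NULL swap_func makes sort() fall back to its generic swap */
	sort(sorted, nr, sizeof(struct refsort), refsort_cmp, NULL);
}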
2476static int __btrfs_mod_ref(struct btrfs_trans_handle *trans,
2477	struct btrfs_root *root,
2478 struct extent_buffer *buf,
2479 int full_backref, int inc)
2480{
2481 u64 bytenr;
2482 u64 num_bytes;
2483 u64 parent;
2484	u64 ref_root;
2485	u32 nritems;
2486 struct btrfs_key key;
2487 struct btrfs_file_extent_item *fi;
2488 int i;
2489 int level;
2490 int ret = 0;
2491	int (*process_func)(struct btrfs_trans_handle *, struct btrfs_root *,
2492	u64, u64, u64, u64, u64, u64);
2493
2494 ref_root = btrfs_header_owner(buf);
2495 nritems = btrfs_header_nritems(buf);
2496 level = btrfs_header_level(buf);
2497
2498 if (!root->ref_cows && level == 0)
2499 return 0;
2500
2501 if (inc)
2502 process_func = btrfs_inc_extent_ref;
2503 else
2504 process_func = btrfs_free_extent;
2505
2506 if (full_backref)
2507 parent = buf->start;
2508 else
2509 parent = 0;
2510
2511 for (i = 0; i < nritems; i++) {
2512	if (level == 0) {
2513	btrfs_item_key_to_cpu(buf, &key, i);
2514 if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY)
2515 continue;
2516	fi = btrfs_item_ptr(buf, i,
2517 struct btrfs_file_extent_item);
2518 if (btrfs_file_extent_type(buf, fi) ==
2519 BTRFS_FILE_EXTENT_INLINE)
2520 continue;
2521 bytenr = btrfs_file_extent_disk_bytenr(buf, fi);
2522 if (bytenr == 0)
2523 continue;
2524
2525 num_bytes = btrfs_file_extent_disk_num_bytes(buf, fi);
2526 key.offset -= btrfs_file_extent_offset(buf, fi);
2527 ret = process_func(trans, root, bytenr, num_bytes,
2528 parent, ref_root, key.objectid,
2529 key.offset);
2530 if (ret)
2531 goto fail;
2532 } else {
5d4f98a2
YZ
2533 bytenr = btrfs_node_blockptr(buf, i);
2534 num_bytes = btrfs_level_size(root, level - 1);
2535 ret = process_func(trans, root, bytenr, num_bytes,
2536 parent, ref_root, level - 1, 0);
2537 if (ret)
2538 goto fail;
2539 }
2540 }
2541 return 0;
2542fail:
2543 BUG();
2544 return ret;
2545}
2546
2547int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
2548 struct extent_buffer *buf, int full_backref)
2549{
2550 return __btrfs_mod_ref(trans, root, buf, full_backref, 1);
2551}
2552
2553int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
2554 struct extent_buffer *buf, int full_backref)
2555{
2556 return __btrfs_mod_ref(trans, root, buf, full_backref, 0);
2557}
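/*
 * Editor's note: these two wrappers are the COW bookkeeping entry points.
 * A hedged usage sketch: after copying a shared buffer "buf" into "cow",
 * every extent the new copy points to gets an extra reference, e.g.
 *
 *	ret = btrfs_inc_ref(trans, root, cow, 1);
 *
 * where full_backref selects parent = buf->start above, i.e. the new refs
 * point back at the block itself instead of at the owning root.
 */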
2558
2559static int write_one_cache_group(struct btrfs_trans_handle *trans,
2560 struct btrfs_root *root,
2561 struct btrfs_path *path,
2562 struct btrfs_block_group_cache *cache)
2563{
2564 int ret;
2565	struct btrfs_root *extent_root = root->fs_info->extent_root;
2566 unsigned long bi;
2567 struct extent_buffer *leaf;
2568
2569	ret = btrfs_search_slot(trans, extent_root, &cache->key, path, 0, 1);
2570 if (ret < 0)
2571 goto fail;
2572	BUG_ON(ret);
2573
2574 leaf = path->nodes[0];
2575 bi = btrfs_item_ptr_offset(leaf, path->slots[0]);
2576 write_extent_buffer(leaf, &cache->item, bi, sizeof(cache->item));
2577 btrfs_mark_buffer_dirty(leaf);
2578	btrfs_release_path(extent_root, path);
2579fail:
2580 if (ret)
2581 return ret;
2582 return 0;
2583
2584}
2585
2586static struct btrfs_block_group_cache *
2587next_block_group(struct btrfs_root *root,
2588 struct btrfs_block_group_cache *cache)
2589{
2590 struct rb_node *node;
2591 spin_lock(&root->fs_info->block_group_cache_lock);
2592 node = rb_next(&cache->cache_node);
2593 btrfs_put_block_group(cache);
2594 if (node) {
2595 cache = rb_entry(node, struct btrfs_block_group_cache,
2596 cache_node);
2597	btrfs_get_block_group(cache);
2598 } else
2599 cache = NULL;
2600 spin_unlock(&root->fs_info->block_group_cache_lock);
2601 return cache;
2602}
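/*
 * Editor's note: next_block_group() drops the reference it was handed and
 * returns the next group already referenced, so a full scan (the pattern
 * used by btrfs_write_dirty_block_groups() below) is simply:
 *
 *	cache = btrfs_lookup_first_block_group(root->fs_info, 0);
 *	while (cache) {
 *		... inspect cache ...
 *		cache = next_block_group(root, cache);
 *	}
 */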
2603
2604int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
2605 struct btrfs_root *root)
2606{
2607	struct btrfs_block_group_cache *cache;
2608	int err = 0;
2609	struct btrfs_path *path;
2610	u64 last = 0;
2611
2612 path = btrfs_alloc_path();
2613 if (!path)
2614 return -ENOMEM;
2615
2616	while (1) {
2617 if (last == 0) {
2618 err = btrfs_run_delayed_refs(trans, root,
2619 (unsigned long)-1);
2620 BUG_ON(err);
2621	}
2622
2623 cache = btrfs_lookup_first_block_group(root->fs_info, last);
2624 while (cache) {
2625 if (cache->dirty)
2626 break;
2627 cache = next_block_group(root, cache);
2628 }
2629 if (!cache) {
2630 if (last == 0)
2631 break;
2632 last = 0;
2633 continue;
2634 }
2635
2636	cache->dirty = 0;
2637	last = cache->key.objectid + cache->key.offset;
2638
2639 err = write_one_cache_group(trans, root, path, cache);
2640 BUG_ON(err);
2641 btrfs_put_block_group(cache);
2642	}
2643
2644	btrfs_free_path(path);
2645	return 0;
2646}
2647
2648int btrfs_extent_readonly(struct btrfs_root *root, u64 bytenr)
2649{
2650 struct btrfs_block_group_cache *block_group;
2651 int readonly = 0;
2652
2653 block_group = btrfs_lookup_block_group(root->fs_info, bytenr);
2654 if (!block_group || block_group->ro)
2655 readonly = 1;
2656 if (block_group)
2657	btrfs_put_block_group(block_group);
2658 return readonly;
2659}
2660
2661static int update_space_info(struct btrfs_fs_info *info, u64 flags,
2662 u64 total_bytes, u64 bytes_used,
2663 struct btrfs_space_info **space_info)
2664{
2665 struct btrfs_space_info *found;
2666 int i;
2667 int factor;
2668
2669 if (flags & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1 |
2670 BTRFS_BLOCK_GROUP_RAID10))
2671 factor = 2;
2672 else
2673 factor = 1;
2674
2675 found = __find_space_info(info, flags);
2676 if (found) {
2677	spin_lock(&found->lock);
2678 found->total_bytes += total_bytes;
2679 found->bytes_used += bytes_used;
2680	found->disk_used += bytes_used * factor;
2681	found->full = 0;
2682	spin_unlock(&found->lock);
2683 *space_info = found;
2684 return 0;
2685 }
2686	found = kzalloc(sizeof(*found), GFP_NOFS);
2687 if (!found)
2688 return -ENOMEM;
2689
2690 for (i = 0; i < BTRFS_NR_RAID_TYPES; i++)
2691 INIT_LIST_HEAD(&found->block_groups[i]);
2692	init_rwsem(&found->groups_sem);
2693 init_waitqueue_head(&found->flush_wait);
2694 init_waitqueue_head(&found->allocate_wait);
2695	spin_lock_init(&found->lock);
2696 found->flags = flags & (BTRFS_BLOCK_GROUP_DATA |
2697 BTRFS_BLOCK_GROUP_SYSTEM |
2698 BTRFS_BLOCK_GROUP_METADATA);
2699 found->total_bytes = total_bytes;
2700 found->bytes_used = bytes_used;
2701	found->disk_used = bytes_used * factor;
2702	found->bytes_pinned = 0;
2703	found->bytes_reserved = 0;
2704	found->bytes_readonly = 0;
2705	found->bytes_delalloc = 0;
2706	found->full = 0;
2707	found->force_alloc = 0;
2708	*space_info = found;
2709	list_add_rcu(&found->list, &info->space_info);
2710	atomic_set(&found->caching_threads, 0);
2711 return 0;
2712}
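/*
 * Editor's worked example: "factor" models on-disk amplification.  With a
 * RAID1 (or DUP/RAID10) space_info, accounting 1GB of used extents bumps
 * bytes_used by 1GB but disk_used by 2GB; single and RAID0 profiles use
 * factor 1, so the two counters move together.
 */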
2713
2714static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
2715{
2716 u64 extra_flags = flags & (BTRFS_BLOCK_GROUP_RAID0 |
2717	BTRFS_BLOCK_GROUP_RAID1 |
2718	BTRFS_BLOCK_GROUP_RAID10 |
2719	BTRFS_BLOCK_GROUP_DUP);
2720 if (extra_flags) {
2721 if (flags & BTRFS_BLOCK_GROUP_DATA)
2722 fs_info->avail_data_alloc_bits |= extra_flags;
2723 if (flags & BTRFS_BLOCK_GROUP_METADATA)
2724 fs_info->avail_metadata_alloc_bits |= extra_flags;
2725 if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
2726 fs_info->avail_system_alloc_bits |= extra_flags;
2727 }
2728}
2729
2730static void set_block_group_readonly(struct btrfs_block_group_cache *cache)
2731{
2732 spin_lock(&cache->space_info->lock);
2733 spin_lock(&cache->lock);
2734 if (!cache->ro) {
2735 cache->space_info->bytes_readonly += cache->key.offset -
2736 btrfs_block_group_used(&cache->item);
2737 cache->ro = 1;
2738 }
2739 spin_unlock(&cache->lock);
2740 spin_unlock(&cache->space_info->lock);
2741}
2742
2743u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags)
2744{
2745	u64 num_devices = root->fs_info->fs_devices->rw_devices;
2746
2747 if (num_devices == 1)
2748 flags &= ~(BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID0);
2749 if (num_devices < 4)
2750 flags &= ~BTRFS_BLOCK_GROUP_RAID10;
2751
2752 if ((flags & BTRFS_BLOCK_GROUP_DUP) &&
2753 (flags & (BTRFS_BLOCK_GROUP_RAID1 |
2754	BTRFS_BLOCK_GROUP_RAID10))) {
2755	flags &= ~BTRFS_BLOCK_GROUP_DUP;
2756	}
2757
2758 if ((flags & BTRFS_BLOCK_GROUP_RAID1) &&
2759	(flags & BTRFS_BLOCK_GROUP_RAID10)) {
2760	flags &= ~BTRFS_BLOCK_GROUP_RAID1;
2761	}
2762
2763 if ((flags & BTRFS_BLOCK_GROUP_RAID0) &&
2764 ((flags & BTRFS_BLOCK_GROUP_RAID1) |
2765 (flags & BTRFS_BLOCK_GROUP_RAID10) |
2766 (flags & BTRFS_BLOCK_GROUP_DUP)))
2767 flags &= ~BTRFS_BLOCK_GROUP_RAID0;
2768 return flags;
2769}
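/*
 * Editor's worked example for the reduction above: with one rw device, a
 * request for (RAID1 | RAID0 | DUP) loses RAID1 and RAID0 first (they need
 * more than one device), leaving DUP; with two or three devices RAID10 is
 * dropped; and when both RAID1 and RAID10 survive, RAID1 is cleared so
 * RAID10 wins.
 */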
2770
2771static u64 get_alloc_profile(struct btrfs_root *root, u64 flags)
2772{
2773 if (flags & BTRFS_BLOCK_GROUP_DATA)
2774 flags |= root->fs_info->avail_data_alloc_bits &
2775 root->fs_info->data_alloc_profile;
2776 else if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
2777 flags |= root->fs_info->avail_system_alloc_bits &
2778 root->fs_info->system_alloc_profile;
2779 else if (flags & BTRFS_BLOCK_GROUP_METADATA)
2780 flags |= root->fs_info->avail_metadata_alloc_bits &
2781 root->fs_info->metadata_alloc_profile;
2782 return btrfs_reduce_alloc_profile(root, flags);
2783}
2784
2785static u64 btrfs_get_alloc_profile(struct btrfs_root *root, int data)
2786{
2787 u64 flags;
2788
2789 if (data)
2790 flags = BTRFS_BLOCK_GROUP_DATA;
2791 else if (root == root->fs_info->chunk_root)
2792 flags = BTRFS_BLOCK_GROUP_SYSTEM;
2793 else
2794 flags = BTRFS_BLOCK_GROUP_METADATA;
2795
2796 return get_alloc_profile(root, flags);
2797}
2798
2799void btrfs_set_inode_space_info(struct btrfs_root *root, struct inode *inode)
2800{
2801 u64 alloc_target;
2802
2803 alloc_target = btrfs_get_alloc_profile(root, 1);
2804 BTRFS_I(inode)->space_info = __find_space_info(root->fs_info,
2805 alloc_target);
2806}
2807
2808static u64 calculate_bytes_needed(struct btrfs_root *root, int num_items)
2809{
2810 u64 num_bytes;
2811 int level;
2812
2813 level = BTRFS_MAX_LEVEL - 2;
2814 /*
2815 * NOTE: these calculations are absolutely the worst possible case.
2816 * This assumes that _every_ item we insert will require a new leaf, and
2817 * that the tree has grown to its maximum level size.
2818 */
2819
2820 /*
2821	 * for every item we insert we could insert both an extent item and an
2822	 * extent ref item. Then for every item we insert, we will need to cow
2823 * both the original leaf, plus the leaf to the left and right of it.
2824 *
2825	 * If we are talking about the extent root, however, we just want the
2826	 * number of items * 2, since we just need the extent item plus its ref.
2827 */
2828 if (root == root->fs_info->extent_root)
2829 num_bytes = num_items * 2;
2830 else
2831 num_bytes = (num_items + (2 * num_items)) * 3;
2832
2833 /*
2834 * num_bytes is total number of leaves we could need times the leaf
2835 * size, and then for every leaf we could end up cow'ing 2 nodes per
2836 * level, down to the leaf level.
2837 */
2838 num_bytes = (num_bytes * root->leafsize) +
2839 (num_bytes * (level * 2)) * root->nodesize;
2840
2841 return num_bytes;
2842}
2843
2844/*
2845 * Unreserve metadata space for delalloc. If we have fewer reserved
2846 * credits than we have extents, this function does nothing.
2847 */
2848int btrfs_unreserve_metadata_for_delalloc(struct btrfs_root *root,
2849 struct inode *inode, int num_items)
6a63209f
JB
2850{
2851 struct btrfs_fs_info *info = root->fs_info;
2852 struct btrfs_space_info *meta_sinfo;
2853 u64 num_bytes;
2854 u64 alloc_target;
2855 bool bug = false;
2856
2857 /* get the space info for where the metadata will live */
2858 alloc_target = btrfs_get_alloc_profile(root, 0);
2859 meta_sinfo = __find_space_info(info, alloc_target);
2860
2861 num_bytes = calculate_bytes_needed(root->fs_info->extent_root,
2862 num_items);
2863
2864	spin_lock(&meta_sinfo->lock);
2865 spin_lock(&BTRFS_I(inode)->accounting_lock);
2866 if (BTRFS_I(inode)->reserved_extents <=
2867 BTRFS_I(inode)->outstanding_extents) {
2868 spin_unlock(&BTRFS_I(inode)->accounting_lock);
2869 spin_unlock(&meta_sinfo->lock);
2870 return 0;
2871 }
2872	spin_unlock(&BTRFS_I(inode)->accounting_lock);
2873
2874	BTRFS_I(inode)->reserved_extents -= num_items;
2875	BUG_ON(BTRFS_I(inode)->reserved_extents < 0);
2876
2877 if (meta_sinfo->bytes_delalloc < num_bytes) {
2878 bug = true;
2879 meta_sinfo->bytes_delalloc = 0;
2880 } else {
2881 meta_sinfo->bytes_delalloc -= num_bytes;
2882 }
2883 spin_unlock(&meta_sinfo->lock);
2884
2885 BUG_ON(bug);
2886
2887 return 0;
2888}
2889
2890static void check_force_delalloc(struct btrfs_space_info *meta_sinfo)
2891{
2892 u64 thresh;
2893
2894 thresh = meta_sinfo->bytes_used + meta_sinfo->bytes_reserved +
2895 meta_sinfo->bytes_pinned + meta_sinfo->bytes_readonly +
2896 meta_sinfo->bytes_super + meta_sinfo->bytes_root +
2897 meta_sinfo->bytes_may_use;
2898
2899 thresh = meta_sinfo->total_bytes - thresh;
2900 thresh *= 80;
2901	do_div(thresh, 100);
2902 if (thresh <= meta_sinfo->bytes_delalloc)
2903 meta_sinfo->force_delalloc = 1;
2904 else
2905 meta_sinfo->force_delalloc = 0;
2906}
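/*
 * Editor's worked example: with 1000MB of metadata space and 200MB already
 * accounted to used/pinned/reserved/etc., thresh = (1000MB - 200MB) * 80 /
 * 100 = 640MB, so once delalloc reservations reach 640MB further delalloc
 * is forced to flush before it can reserve more.
 */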
2907
2908struct async_flush {
2909 struct btrfs_root *root;
2910 struct btrfs_space_info *info;
2911 struct btrfs_work work;
2912};
2913
2914static noinline void flush_delalloc_async(struct btrfs_work *work)
2915{
2916 struct async_flush *async;
2917 struct btrfs_root *root;
2918 struct btrfs_space_info *info;
2919
2920 async = container_of(work, struct async_flush, work);
2921 root = async->root;
2922 info = async->info;
2923
2924	btrfs_start_delalloc_inodes(root, 0);
2925	wake_up(&info->flush_wait);
2926	btrfs_wait_ordered_extents(root, 0, 0);
2927
2928 spin_lock(&info->lock);
2929 info->flushing = 0;
2930 spin_unlock(&info->lock);
2931 wake_up(&info->flush_wait);
2932
2933 kfree(async);
2934}
2935
2936static void wait_on_flush(struct btrfs_space_info *info)
2937{
2938 DEFINE_WAIT(wait);
2939 u64 used;
2940
2941 while (1) {
2942 prepare_to_wait(&info->flush_wait, &wait,
2943 TASK_UNINTERRUPTIBLE);
2944 spin_lock(&info->lock);
2945 if (!info->flushing) {
2946 spin_unlock(&info->lock);
2947 break;
2948 }
2949
2950 used = info->bytes_used + info->bytes_reserved +
2951 info->bytes_pinned + info->bytes_readonly +
2952 info->bytes_super + info->bytes_root +
2953 info->bytes_may_use + info->bytes_delalloc;
2954 if (used < info->total_bytes) {
2955 spin_unlock(&info->lock);
2956 break;
2957 }
2958 spin_unlock(&info->lock);
2959 schedule();
2960 }
2961 finish_wait(&info->flush_wait, &wait);
2962}
2963
2964static void flush_delalloc(struct btrfs_root *root,
2965 struct btrfs_space_info *info)
2966{
2967	struct async_flush *async;
2968 bool wait = false;
2969
2970 spin_lock(&info->lock);
2971
2972	if (!info->flushing)
2973	info->flushing = 1;
2974	else
2975	wait = true;
2976
2977 spin_unlock(&info->lock);
2978
2979 if (wait) {
2980	wait_on_flush(info);
2981 return;
2982 }
2983
2984 async = kzalloc(sizeof(*async), GFP_NOFS);
2985 if (!async)
2986 goto flush;
2987
2988 async->root = root;
2989 async->info = info;
2990 async->work.func = flush_delalloc_async;
2991
2992 btrfs_queue_worker(&root->fs_info->enospc_workers,
2993 &async->work);
2994 wait_on_flush(info);
2995 return;
2996
2997flush:
2998 btrfs_start_delalloc_inodes(root, 0);
2999 btrfs_wait_ordered_extents(root, 0, 0);
3000
3001 spin_lock(&info->lock);
3002 info->flushing = 0;
3003 spin_unlock(&info->lock);
3004 wake_up(&info->flush_wait);
3005}
3006
3007static int maybe_allocate_chunk(struct btrfs_root *root,
3008 struct btrfs_space_info *info)
3009{
3010 struct btrfs_super_block *disk_super = &root->fs_info->super_copy;
3011 struct btrfs_trans_handle *trans;
3012 bool wait = false;
3013 int ret = 0;
3014 u64 min_metadata;
3015 u64 free_space;
3016
3017 free_space = btrfs_super_total_bytes(disk_super);
3018 /*
3019	 * we allow the metadata to grow to a max of either 10GB or 5% of the
3020 * space in the volume.
3021 */
3022	min_metadata = min((u64)10 * 1024 * 1024 * 1024,
3023 div64_u64(free_space * 5, 100));
3024 if (info->total_bytes >= min_metadata) {
3025 spin_unlock(&info->lock);
3026 return 0;
3027 }
3028
3029 if (info->full) {
3030 spin_unlock(&info->lock);
3031 return 0;
3032 }
3033
3034 if (!info->allocating_chunk) {
3035 info->force_alloc = 1;
3036 info->allocating_chunk = 1;
3037 } else {
3038 wait = true;
3039 }
3040
3041 spin_unlock(&info->lock);
3042
3043 if (wait) {
3044	wait_event(info->allocate_wait,
3045 !info->allocating_chunk);
3046 return 1;
3047 }
3048
3049 trans = btrfs_start_transaction(root, 1);
3050 if (!trans) {
3051 ret = -ENOMEM;
3052 goto out;
3053 }
3054
3055 ret = do_chunk_alloc(trans, root->fs_info->extent_root,
3056 4096 + 2 * 1024 * 1024,
3057 info->flags, 0);
3058 btrfs_end_transaction(trans, root);
3059 if (ret)
3060 goto out;
3061out:
3062 spin_lock(&info->lock);
3063 info->allocating_chunk = 0;
3064 spin_unlock(&info->lock);
3065	wake_up(&info->allocate_wait);
3066
3067 if (ret)
3068 return 0;
3069 return 1;
3070}
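/*
 * Editor's worked example: on a 1TB volume, 5% is ~51GB, so min_metadata
 * clamps to the 10GB cap; on a 100GB volume the 5% term (5GB) is smaller
 * and wins.  Once total_bytes reaches that limit, no new metadata chunk is
 * allocated here.
 */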
3071
3072/*
3073 * Reserve metadata space for delalloc.
3074 */
3075int btrfs_reserve_metadata_for_delalloc(struct btrfs_root *root,
3076 struct inode *inode, int num_items)
3077{
3078 struct btrfs_fs_info *info = root->fs_info;
3079 struct btrfs_space_info *meta_sinfo;
3080 u64 num_bytes;
3081 u64 used;
3082 u64 alloc_target;
3083 int flushed = 0;
3084 int force_delalloc;
3085
3086 /* get the space info for where the metadata will live */
3087 alloc_target = btrfs_get_alloc_profile(root, 0);
3088 meta_sinfo = __find_space_info(info, alloc_target);
3089
3090 num_bytes = calculate_bytes_needed(root->fs_info->extent_root,
3091 num_items);
3092again:
3093 spin_lock(&meta_sinfo->lock);
3094
3095 force_delalloc = meta_sinfo->force_delalloc;
3096
3097 if (unlikely(!meta_sinfo->bytes_root))
3098 meta_sinfo->bytes_root = calculate_bytes_needed(root, 6);
3099
3100 if (!flushed)
3101 meta_sinfo->bytes_delalloc += num_bytes;
3102
3103 used = meta_sinfo->bytes_used + meta_sinfo->bytes_reserved +
3104 meta_sinfo->bytes_pinned + meta_sinfo->bytes_readonly +
3105 meta_sinfo->bytes_super + meta_sinfo->bytes_root +
3106 meta_sinfo->bytes_may_use + meta_sinfo->bytes_delalloc;
3107
3108 if (used > meta_sinfo->total_bytes) {
3109 flushed++;
3110
3111 if (flushed == 1) {
3112 if (maybe_allocate_chunk(root, meta_sinfo))
3113 goto again;
3114 flushed++;
3115 } else {
3116	spin_unlock(&meta_sinfo->lock);
3117	}
3118
3119 if (flushed == 2) {
3120 filemap_flush(inode->i_mapping);
3121 goto again;
3122 } else if (flushed == 3) {
3123	flush_delalloc(root, meta_sinfo);
3124 goto again;
3125 }
3126 spin_lock(&meta_sinfo->lock);
3127 meta_sinfo->bytes_delalloc -= num_bytes;
3128	spin_unlock(&meta_sinfo->lock);
3129	printk(KERN_ERR "enospc, has %d, reserved %d\n",
3130 BTRFS_I(inode)->outstanding_extents,
3131 BTRFS_I(inode)->reserved_extents);
3132 dump_space_info(meta_sinfo, 0, 0);
3133 return -ENOSPC;
3134 }
3135
3136	BTRFS_I(inode)->reserved_extents += num_items;
3137 check_force_delalloc(meta_sinfo);
3138 spin_unlock(&meta_sinfo->lock);
3139
3140 if (!flushed && force_delalloc)
3141 filemap_flush(inode->i_mapping);
3142
3143 return 0;
3144}
3145
3146/*
3147 * unreserve num_items number of items worth of metadata space. This needs to
3148 * be paired with btrfs_reserve_metadata_space.
3149 *
3150 * NOTE: if you have the option, run this _AFTER_ you do a
3151 * btrfs_end_transaction, since btrfs_end_transaction will run delayed ref
3152 * operations which will result in more used metadata, so we want to make sure we
3153 * can do that without issue.
3154 */
3155int btrfs_unreserve_metadata_space(struct btrfs_root *root, int num_items)
3156{
3157 struct btrfs_fs_info *info = root->fs_info;
3158 struct btrfs_space_info *meta_sinfo;
3159 u64 num_bytes;
3160 u64 alloc_target;
3161 bool bug = false;
3162
3163 /* get the space info for where the metadata will live */
3164 alloc_target = btrfs_get_alloc_profile(root, 0);
3165 meta_sinfo = __find_space_info(info, alloc_target);
3166
3167 num_bytes = calculate_bytes_needed(root, num_items);
3168
3169 spin_lock(&meta_sinfo->lock);
3170 if (meta_sinfo->bytes_may_use < num_bytes) {
3171 bug = true;
3172 meta_sinfo->bytes_may_use = 0;
3173 } else {
3174 meta_sinfo->bytes_may_use -= num_bytes;
3175 }
3176 spin_unlock(&meta_sinfo->lock);
3177
3178 BUG_ON(bug);
3179
3180 return 0;
3181}
3182
3183/*
3184 * Reserve some metadata space for use. We'll calculate the worst-case number
3185 * of bytes that would be needed to modify num_items number of items. If we
3186 * have space, fantastic, if not, you get -ENOSPC. Please call
3187 * btrfs_unreserve_metadata_space when you are done for the _SAME_ number of
3188 * items you reserved, since whatever metadata you needed should have already
3189 * been allocated.
3190 *
3191 * This will commit the transaction to make more space if we don't have enough
3192 * metadata space. The only time we don't do this is if we're reserving space
3193 * inside of a transaction, then we will just return -ENOSPC and it is the
3194 * caller's responsibility to handle it properly.
3195 */
3196int btrfs_reserve_metadata_space(struct btrfs_root *root, int num_items)
3197{
3198 struct btrfs_fs_info *info = root->fs_info;
3199 struct btrfs_space_info *meta_sinfo;
3200 u64 num_bytes;
3201 u64 used;
3202 u64 alloc_target;
3203 int retries = 0;
3204
3205 /* get the space info for where the metadata will live */
3206 alloc_target = btrfs_get_alloc_profile(root, 0);
3207 meta_sinfo = __find_space_info(info, alloc_target);
3208
3209 num_bytes = calculate_bytes_needed(root, num_items);
3210again:
3211 spin_lock(&meta_sinfo->lock);
3212
3213 if (unlikely(!meta_sinfo->bytes_root))
3214 meta_sinfo->bytes_root = calculate_bytes_needed(root, 6);
3215
3216 if (!retries)
3217 meta_sinfo->bytes_may_use += num_bytes;
3218
3219 used = meta_sinfo->bytes_used + meta_sinfo->bytes_reserved +
3220 meta_sinfo->bytes_pinned + meta_sinfo->bytes_readonly +
3221 meta_sinfo->bytes_super + meta_sinfo->bytes_root +
3222 meta_sinfo->bytes_may_use + meta_sinfo->bytes_delalloc;
3223
3224 if (used > meta_sinfo->total_bytes) {
3225 retries++;
3226 if (retries == 1) {
3227 if (maybe_allocate_chunk(root, meta_sinfo))
3228 goto again;
3229 retries++;
3230 } else {
3231 spin_unlock(&meta_sinfo->lock);
3232 }
3233
3234 if (retries == 2) {
32c00aff 3235 flush_delalloc(root, meta_sinfo);
4e06bdd6
JB
3236 goto again;
3237 }
9ed74f2d
JB
3238 spin_lock(&meta_sinfo->lock);
3239 meta_sinfo->bytes_may_use -= num_bytes;
3240 spin_unlock(&meta_sinfo->lock);
3241
3242 dump_space_info(meta_sinfo, 0, 0);
6a63209f
JB
3243 return -ENOSPC;
3244 }
9ed74f2d
JB
3245
3246 check_force_delalloc(meta_sinfo);
6a63209f
JB
3247 spin_unlock(&meta_sinfo->lock);
3248
3249 return 0;
3250}
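/*
 * Editor's usage sketch for the reserve/unreserve pairing described above
 * (error handling elided, item count hypothetical):
 *
 *	ret = btrfs_reserve_metadata_space(root, 3);
 *	if (ret)
 *		return ret;
 *	... modify up to 3 items ...
 *	btrfs_end_transaction(trans, root);
 *	btrfs_unreserve_metadata_space(root, 3);
 */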
3251
3252/*
3253 * This will check the space that the inode allocates from to make sure we have
3254 * enough space for bytes.
3255 */
3256int btrfs_check_data_free_space(struct btrfs_root *root, struct inode *inode,
3257 u64 bytes)
3258{
3259 struct btrfs_space_info *data_sinfo;
ab6e2410
JB
3260 u64 used;
3261 int ret = 0, committed = 0, flushed = 0;
3262
3263 /* make sure bytes are sectorsize aligned */
3264 bytes = (bytes + root->sectorsize - 1) & ~((u64)root->sectorsize - 1);
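	/*
	 * editor's example: with a 4K sectorsize a 5000 byte request
	 * becomes 8192 here (5000 + 4095 = 9095, masked down to 8192)
	 */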
3265
3266 data_sinfo = BTRFS_I(inode)->space_info;
33b4d47f
CM
3267 if (!data_sinfo)
3268 goto alloc;
3269
3270again:
3271 /* make sure we have enough space to handle the data first */
3272 spin_lock(&data_sinfo->lock);
3273 used = data_sinfo->bytes_used + data_sinfo->bytes_delalloc +
3274 data_sinfo->bytes_reserved + data_sinfo->bytes_pinned +
3275 data_sinfo->bytes_readonly + data_sinfo->bytes_may_use +
3276 data_sinfo->bytes_super;
3277
3278 if (used + bytes > data_sinfo->total_bytes) {
3279 struct btrfs_trans_handle *trans;
3280
3281 if (!flushed) {
3282 spin_unlock(&data_sinfo->lock);
3283 flush_delalloc(root, data_sinfo);
3284 flushed = 1;
3285 goto again;
3286 }
3287
3288 /*
3289 * if we don't have enough free bytes in this space then we need
3290 * to alloc a new chunk.
3291 */
3292 if (!data_sinfo->full) {
3293 u64 alloc_target;
3294
3295 data_sinfo->force_alloc = 1;
3296 spin_unlock(&data_sinfo->lock);
3297alloc:
3298 alloc_target = btrfs_get_alloc_profile(root, 1);
3299 trans = btrfs_start_transaction(root, 1);
3300 if (!trans)
3301 return -ENOMEM;
3302
3303 ret = do_chunk_alloc(trans, root->fs_info->extent_root,
3304 bytes + 2 * 1024 * 1024,
3305 alloc_target, 0);
3306 btrfs_end_transaction(trans, root);
3307 if (ret)
3308 return ret;
3309
3310 if (!data_sinfo) {
3311 btrfs_set_inode_space_info(root, inode);
3312 data_sinfo = BTRFS_I(inode)->space_info;
3313 }
3314 goto again;
3315 }
3316 spin_unlock(&data_sinfo->lock);
3317
3318 /* commit the current transaction and try again */
3319	if (!committed && !root->fs_info->open_ioctl_trans) {
3320 committed = 1;
3321 trans = btrfs_join_transaction(root, 1);
3322 if (!trans)
3323 return -ENOMEM;
3324 ret = btrfs_commit_transaction(trans, root);
3325 if (ret)
3326 return ret;
3327 goto again;
3328 }
3329
3330 printk(KERN_ERR "no space left, need %llu, %llu delalloc bytes"
3331 ", %llu bytes_used, %llu bytes_reserved, "
3332	"%llu bytes_pinned, %llu bytes_readonly, %llu may use "
3333 "%llu total\n", (unsigned long long)bytes,
3334 (unsigned long long)data_sinfo->bytes_delalloc,
3335 (unsigned long long)data_sinfo->bytes_used,
3336 (unsigned long long)data_sinfo->bytes_reserved,
3337 (unsigned long long)data_sinfo->bytes_pinned,
3338 (unsigned long long)data_sinfo->bytes_readonly,
3339 (unsigned long long)data_sinfo->bytes_may_use,
3340 (unsigned long long)data_sinfo->total_bytes);
3341 return -ENOSPC;
3342 }
3343 data_sinfo->bytes_may_use += bytes;
3344 BTRFS_I(inode)->reserved_bytes += bytes;
3345 spin_unlock(&data_sinfo->lock);
3346
3347	return 0;
3348}
3349
3350/*
3351 * if there was an error for whatever reason after calling
3352 * btrfs_check_data_free_space, call this so we can cleanup the counters.
3353 */
3354void btrfs_free_reserved_data_space(struct btrfs_root *root,
3355 struct inode *inode, u64 bytes)
3356{
3357 struct btrfs_space_info *data_sinfo;
3358
3359 /* make sure bytes are sectorsize aligned */
3360 bytes = (bytes + root->sectorsize - 1) & ~((u64)root->sectorsize - 1);
3361
3362 data_sinfo = BTRFS_I(inode)->space_info;
3363 spin_lock(&data_sinfo->lock);
3364 data_sinfo->bytes_may_use -= bytes;
3365 BTRFS_I(inode)->reserved_bytes -= bytes;
3366 spin_unlock(&data_sinfo->lock);
3367}
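/*
 * Editor's usage sketch for the check/free pairing described above
 * (do_the_write() is hypothetical):
 *
 *	ret = btrfs_check_data_free_space(root, inode, count);
 *	if (ret)
 *		return ret;
 *	ret = do_the_write(inode, count);
 *	if (ret)
 *		btrfs_free_reserved_data_space(root, inode, count);
 */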
3368
3369/* called when we are adding a delalloc extent to the inode's io_tree */
3370void btrfs_delalloc_reserve_space(struct btrfs_root *root, struct inode *inode,
3371 u64 bytes)
3372{
3373 struct btrfs_space_info *data_sinfo;
3374
3375 /* get the space info for where this inode will be storing its data */
3376 data_sinfo = BTRFS_I(inode)->space_info;
3377
3378 /* make sure we have enough space to handle the data first */
3379 spin_lock(&data_sinfo->lock);
3380 data_sinfo->bytes_delalloc += bytes;
3381
3382 /*
3383 * we are adding a delalloc extent without calling
3384 * btrfs_check_data_free_space first. This happens on a weird
3385 * writepage condition, but shouldn't hurt our accounting
3386 */
3387 if (unlikely(bytes > BTRFS_I(inode)->reserved_bytes)) {
3388 data_sinfo->bytes_may_use -= BTRFS_I(inode)->reserved_bytes;
3389 BTRFS_I(inode)->reserved_bytes = 0;
3390 } else {
3391 data_sinfo->bytes_may_use -= bytes;
3392 BTRFS_I(inode)->reserved_bytes -= bytes;
3393 }
3394
3395 spin_unlock(&data_sinfo->lock);
3396}
3397
3398/* called when we are clearing a delalloc extent from the inode's io_tree */
3399void btrfs_delalloc_free_space(struct btrfs_root *root, struct inode *inode,
3400 u64 bytes)
3401{
3402 struct btrfs_space_info *info;
3403
3404 info = BTRFS_I(inode)->space_info;
3405
3406 spin_lock(&info->lock);
3407 info->bytes_delalloc -= bytes;
3408 spin_unlock(&info->lock);
3409}
3410
3411static void force_metadata_allocation(struct btrfs_fs_info *info)
3412{
3413 struct list_head *head = &info->space_info;
3414 struct btrfs_space_info *found;
3415
3416 rcu_read_lock();
3417 list_for_each_entry_rcu(found, head, list) {
3418 if (found->flags & BTRFS_BLOCK_GROUP_METADATA)
3419 found->force_alloc = 1;
3420 }
3421 rcu_read_unlock();
3422}
3423
3424static int do_chunk_alloc(struct btrfs_trans_handle *trans,
3425 struct btrfs_root *extent_root, u64 alloc_bytes,
3426	u64 flags, int force)
3427{
3428 struct btrfs_space_info *space_info;
3429	struct btrfs_fs_info *fs_info = extent_root->fs_info;
3430	u64 thresh;
3431 int ret = 0;
3432
3433	mutex_lock(&fs_info->chunk_mutex);
3434
3435	flags = btrfs_reduce_alloc_profile(extent_root, flags);
3436
3437	space_info = __find_space_info(extent_root->fs_info, flags);
3438 if (!space_info) {
3439 ret = update_space_info(extent_root->fs_info, flags,
3440 0, 0, &space_info);
3441 BUG_ON(ret);
3442 }
3443 BUG_ON(!space_info);
3444
3445	spin_lock(&space_info->lock);
3446	if (space_info->force_alloc)
3447	force = 1;
3448 if (space_info->full) {
3449 spin_unlock(&space_info->lock);
3450	goto out;
3451	}
3452
3453	thresh = space_info->total_bytes - space_info->bytes_readonly;
3454	thresh = div_factor(thresh, 8);
3455	if (!force &&
3456	(space_info->bytes_used + space_info->bytes_pinned +
3457 space_info->bytes_reserved + alloc_bytes) < thresh) {
3458 spin_unlock(&space_info->lock);
3459	goto out;
3460	}
3461 spin_unlock(&space_info->lock);
3462
3463 /*
3464 * if we're doing a data chunk, go ahead and make sure that
3465 * we keep a reasonable number of metadata chunks allocated in the
3466 * FS as well.
3467 */
3468	if (flags & BTRFS_BLOCK_GROUP_DATA && fs_info->metadata_ratio) {
3469 fs_info->data_chunk_allocations++;
3470 if (!(fs_info->data_chunk_allocations %
3471 fs_info->metadata_ratio))
3472 force_metadata_allocation(fs_info);
3473 }
3474
3475	ret = btrfs_alloc_chunk(trans, extent_root, flags);
3476	spin_lock(&space_info->lock);
3477	if (ret)
3478	space_info->full = 1;
3479 space_info->force_alloc = 0;
3480 spin_unlock(&space_info->lock);
3481out:
3482	mutex_unlock(&extent_root->fs_info->chunk_mutex);
3483	return ret;
3484}
3485
3486static int update_block_group(struct btrfs_trans_handle *trans,
3487 struct btrfs_root *root,
3488	u64 bytenr, u64 num_bytes, int alloc,
3489	int mark_free)
3490{
3491 struct btrfs_block_group_cache *cache;
3492 struct btrfs_fs_info *info = root->fs_info;
3493	int factor;
3494	u64 total = num_bytes;
3495	u64 old_val;
3496	u64 byte_in_group;
3497
3498 /* block accounting for super block */
3499 spin_lock(&info->delalloc_lock);
3500 old_val = btrfs_super_bytes_used(&info->super_copy);
3501 if (alloc)
3502 old_val += num_bytes;
3503 else
3504 old_val -= num_bytes;
3505 btrfs_set_super_bytes_used(&info->super_copy, old_val);
3506 spin_unlock(&info->delalloc_lock);
3507
3508	while (total) {
3509	cache = btrfs_lookup_block_group(info, bytenr);
3510	if (!cache)
3511	return -1;
3512 if (cache->flags & (BTRFS_BLOCK_GROUP_DUP |
3513 BTRFS_BLOCK_GROUP_RAID1 |
3514 BTRFS_BLOCK_GROUP_RAID10))
3515 factor = 2;
3516 else
3517 factor = 1;
3518	byte_in_group = bytenr - cache->key.objectid;
3519	WARN_ON(byte_in_group > cache->key.offset);
3520
3521	spin_lock(&cache->space_info->lock);
3522	spin_lock(&cache->lock);
3523	cache->dirty = 1;
3524	old_val = btrfs_block_group_used(&cache->item);
3525	num_bytes = min(total, cache->key.offset - byte_in_group);
3526	if (alloc) {
3527	old_val += num_bytes;
3528	btrfs_set_block_group_used(&cache->item, old_val);
3529	cache->reserved -= num_bytes;
3530	cache->space_info->bytes_reserved -= num_bytes;
3531 cache->space_info->bytes_used += num_bytes;
3532 cache->space_info->disk_used += num_bytes * factor;
3533	if (cache->ro)
3534	cache->space_info->bytes_readonly -= num_bytes;
3535	spin_unlock(&cache->lock);
3536	spin_unlock(&cache->space_info->lock);
3537	} else {
3538	old_val -= num_bytes;
3539	btrfs_set_block_group_used(&cache->item, old_val);
3540	cache->space_info->bytes_used -= num_bytes;
3541	cache->space_info->disk_used -= num_bytes * factor;
3542 if (cache->ro)
3543 cache->space_info->bytes_readonly += num_bytes;
3544	spin_unlock(&cache->lock);
3545	spin_unlock(&cache->space_info->lock);
3546	if (mark_free) {
3547	int ret;
3548
3549 ret = btrfs_discard_extent(root, bytenr,
3550 num_bytes);
3551 WARN_ON(ret);
3552
3553	ret = btrfs_add_free_space(cache, bytenr,
3554	num_bytes);
3555	WARN_ON(ret);
3556	}
3557	}
3558	btrfs_put_block_group(cache);
3559 total -= num_bytes;
3560 bytenr += num_bytes;
9078a3e1
CM
3561 }
3562 return 0;
3563}

static u64 first_logical_byte(struct btrfs_root *root, u64 search_start)
{
	struct btrfs_block_group_cache *cache;
	u64 bytenr;

	cache = btrfs_lookup_first_block_group(root->fs_info, search_start);
	if (!cache)
		return 0;

	bytenr = cache->key.objectid;
	btrfs_put_block_group(cache);

	return bytenr;
}

/*
 * this function must be called within a transaction
 */
int btrfs_pin_extent(struct btrfs_root *root,
		     u64 bytenr, u64 num_bytes, int reserved)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_block_group_cache *cache;

	cache = btrfs_lookup_block_group(fs_info, bytenr);
	BUG_ON(!cache);

	spin_lock(&cache->space_info->lock);
	spin_lock(&cache->lock);
	cache->pinned += num_bytes;
	cache->space_info->bytes_pinned += num_bytes;
	if (reserved) {
		cache->reserved -= num_bytes;
		cache->space_info->bytes_reserved -= num_bytes;
	}
	spin_unlock(&cache->lock);
	spin_unlock(&cache->space_info->lock);

	btrfs_put_block_group(cache);

	set_extent_dirty(fs_info->pinned_extents,
			 bytenr, bytenr + num_bytes - 1, GFP_NOFS);
	return 0;
}

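/*
 * Move num_bytes into (reserve == 1) or out of (reserve == 0) the
 * reserved counters of a block group and its space_info.  Reserved
 * bytes back allocations that are in flight but not yet committed,
 * while pinned bytes (see btrfs_pin_extent above) are freed extents
 * waiting for the transaction commit before they can be reused.
 */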
static int update_reserved_extents(struct btrfs_block_group_cache *cache,
				   u64 num_bytes, int reserve)
{
	spin_lock(&cache->space_info->lock);
	spin_lock(&cache->lock);
	if (reserve) {
		cache->reserved += num_bytes;
		cache->space_info->bytes_reserved += num_bytes;
	} else {
		cache->reserved -= num_bytes;
		cache->space_info->bytes_reserved -= num_bytes;
	}
	spin_unlock(&cache->lock);
	spin_unlock(&cache->space_info->lock);
	return 0;
}

int btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans,
				struct btrfs_root *root)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_caching_control *next;
	struct btrfs_caching_control *caching_ctl;
	struct btrfs_block_group_cache *cache;

	down_write(&fs_info->extent_commit_sem);

	list_for_each_entry_safe(caching_ctl, next,
				 &fs_info->caching_block_groups, list) {
		cache = caching_ctl->block_group;
		if (block_group_cache_done(cache)) {
			cache->last_byte_to_unpin = (u64)-1;
			list_del_init(&caching_ctl->list);
			put_caching_control(caching_ctl);
		} else {
			cache->last_byte_to_unpin = caching_ctl->progress;
		}
	}

	if (fs_info->pinned_extents == &fs_info->freed_extents[0])
		fs_info->pinned_extents = &fs_info->freed_extents[1];
	else
		fs_info->pinned_extents = &fs_info->freed_extents[0];

	up_write(&fs_info->extent_commit_sem);
	return 0;
}

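/*
 * Pinned extents are tracked in two extent_io trees, freed_extents[0]
 * and freed_extents[1].  btrfs_prepare_extent_commit above flips which
 * tree collects new pins, so btrfs_finish_extent_commit below can
 * drain the other tree without racing against extents pinned by the
 * next transaction.
 */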
static int unpin_extent_range(struct btrfs_root *root, u64 start, u64 end)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_block_group_cache *cache = NULL;
	u64 len;

	while (start <= end) {
		if (!cache ||
		    start >= cache->key.objectid + cache->key.offset) {
			if (cache)
				btrfs_put_block_group(cache);
			cache = btrfs_lookup_block_group(fs_info, start);
			BUG_ON(!cache);
		}

		len = cache->key.objectid + cache->key.offset - start;
		len = min(len, end + 1 - start);

		if (start < cache->last_byte_to_unpin) {
			len = min(len, cache->last_byte_to_unpin - start);
			btrfs_add_free_space(cache, start, len);
		}

		spin_lock(&cache->space_info->lock);
		spin_lock(&cache->lock);
		cache->pinned -= len;
		cache->space_info->bytes_pinned -= len;
		spin_unlock(&cache->lock);
		spin_unlock(&cache->space_info->lock);

		start += len;
	}

	if (cache)
		btrfs_put_block_group(cache);
	return 0;
}

int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
			       struct btrfs_root *root)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct extent_io_tree *unpin;
	u64 start;
	u64 end;
	int ret;

	if (fs_info->pinned_extents == &fs_info->freed_extents[0])
		unpin = &fs_info->freed_extents[1];
	else
		unpin = &fs_info->freed_extents[0];

	while (1) {
		ret = find_first_extent_bit(unpin, 0, &start, &end,
					    EXTENT_DIRTY);
		if (ret)
			break;

		ret = btrfs_discard_extent(root, start, end + 1 - start);

		clear_extent_dirty(unpin, start, end, GFP_NOFS);
		unpin_extent_range(root, start, end);
		cond_resched();
	}

	return ret;
}

static int pin_down_bytes(struct btrfs_trans_handle *trans,
			  struct btrfs_root *root,
			  struct btrfs_path *path,
			  u64 bytenr, u64 num_bytes,
			  int is_data, int reserved,
			  struct extent_buffer **must_clean)
{
	int err = 0;
	struct extent_buffer *buf;

	if (is_data)
		goto pinit;

	/*
	 * discard is sloooow, and so triggering discards on
	 * individual btree blocks isn't a good plan.  Just
	 * pin everything in discard mode.
	 */
	if (btrfs_test_opt(root, DISCARD))
		goto pinit;

	buf = btrfs_find_tree_block(root, bytenr, num_bytes);
	if (!buf)
		goto pinit;

	/* we can reuse a block if it hasn't been written
	 * and it is from this transaction.  We can't
	 * reuse anything from the tree log root because
	 * it has tiny sub-transactions.
	 */
	if (btrfs_buffer_uptodate(buf, 0) &&
	    btrfs_try_tree_lock(buf)) {
		u64 header_owner = btrfs_header_owner(buf);
		u64 header_transid = btrfs_header_generation(buf);
		if (header_owner != BTRFS_TREE_LOG_OBJECTID &&
		    header_transid == trans->transid &&
		    !btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) {
			*must_clean = buf;
			return 1;
		}
		btrfs_tree_unlock(buf);
	}
	free_extent_buffer(buf);
pinit:
	if (path)
		btrfs_set_path_blocking(path);
	/* unlocks the pinned mutex */
	btrfs_pin_extent(root, bytenr, num_bytes, reserved);

	BUG_ON(err < 0);
	return 0;
}

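/*
 * Drop refs_to_drop references from the extent item at bytenr.  While
 * references remain, the extent item (and any inline back ref) is
 * updated in place; once the count reaches zero the item is deleted,
 * the bytes are pinned or handed back to the free space cache, and the
 * block group usage is rolled back via update_block_group.
 */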
static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
			       struct btrfs_root *root,
			       u64 bytenr, u64 num_bytes, u64 parent,
			       u64 root_objectid, u64 owner_objectid,
			       u64 owner_offset, int refs_to_drop,
			       struct btrfs_delayed_extent_op *extent_op)
{
	struct btrfs_key key;
	struct btrfs_path *path;
	struct btrfs_fs_info *info = root->fs_info;
	struct btrfs_root *extent_root = info->extent_root;
	struct extent_buffer *leaf;
	struct btrfs_extent_item *ei;
	struct btrfs_extent_inline_ref *iref;
	int ret;
	int is_data;
	int extent_slot = 0;
	int found_extent = 0;
	int num_to_del = 1;
	u32 item_size;
	u64 refs;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	path->reada = 1;
	path->leave_spinning = 1;

	is_data = owner_objectid >= BTRFS_FIRST_FREE_OBJECTID;
	BUG_ON(!is_data && refs_to_drop != 1);

	ret = lookup_extent_backref(trans, extent_root, path, &iref,
				    bytenr, num_bytes, parent,
				    root_objectid, owner_objectid,
				    owner_offset);
	if (ret == 0) {
		extent_slot = path->slots[0];
		while (extent_slot >= 0) {
			btrfs_item_key_to_cpu(path->nodes[0], &key,
					      extent_slot);
			if (key.objectid != bytenr)
				break;
			if (key.type == BTRFS_EXTENT_ITEM_KEY &&
			    key.offset == num_bytes) {
				found_extent = 1;
				break;
			}
			if (path->slots[0] - extent_slot > 5)
				break;
			extent_slot--;
		}
#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
		item_size = btrfs_item_size_nr(path->nodes[0], extent_slot);
		if (found_extent && item_size < sizeof(*ei))
			found_extent = 0;
#endif
		if (!found_extent) {
			BUG_ON(iref);
			ret = remove_extent_backref(trans, extent_root, path,
						    NULL, refs_to_drop,
						    is_data);
			BUG_ON(ret);
			btrfs_release_path(extent_root, path);
			path->leave_spinning = 1;

			key.objectid = bytenr;
			key.type = BTRFS_EXTENT_ITEM_KEY;
			key.offset = num_bytes;

			ret = btrfs_search_slot(trans, extent_root,
						&key, path, -1, 1);
			if (ret) {
				printk(KERN_ERR "umm, got %d back from search"
				       ", was looking for %llu\n", ret,
				       (unsigned long long)bytenr);
				btrfs_print_leaf(extent_root, path->nodes[0]);
			}
			BUG_ON(ret);
			extent_slot = path->slots[0];
		}
	} else {
		btrfs_print_leaf(extent_root, path->nodes[0]);
		WARN_ON(1);
		printk(KERN_ERR "btrfs unable to find ref byte nr %llu "
		       "parent %llu root %llu owner %llu offset %llu\n",
		       (unsigned long long)bytenr,
		       (unsigned long long)parent,
		       (unsigned long long)root_objectid,
		       (unsigned long long)owner_objectid,
		       (unsigned long long)owner_offset);
	}

	leaf = path->nodes[0];
	item_size = btrfs_item_size_nr(leaf, extent_slot);
#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
	if (item_size < sizeof(*ei)) {
		BUG_ON(found_extent || extent_slot != path->slots[0]);
		ret = convert_extent_item_v0(trans, extent_root, path,
					     owner_objectid, 0);
		BUG_ON(ret < 0);

		btrfs_release_path(extent_root, path);
		path->leave_spinning = 1;

		key.objectid = bytenr;
		key.type = BTRFS_EXTENT_ITEM_KEY;
		key.offset = num_bytes;

		ret = btrfs_search_slot(trans, extent_root, &key, path,
					-1, 1);
		if (ret) {
			printk(KERN_ERR "umm, got %d back from search"
			       ", was looking for %llu\n", ret,
			       (unsigned long long)bytenr);
			btrfs_print_leaf(extent_root, path->nodes[0]);
		}
		BUG_ON(ret);
		extent_slot = path->slots[0];
		leaf = path->nodes[0];
		item_size = btrfs_item_size_nr(leaf, extent_slot);
	}
#endif
	BUG_ON(item_size < sizeof(*ei));
	ei = btrfs_item_ptr(leaf, extent_slot,
			    struct btrfs_extent_item);
	if (owner_objectid < BTRFS_FIRST_FREE_OBJECTID) {
		struct btrfs_tree_block_info *bi;
		BUG_ON(item_size < sizeof(*ei) + sizeof(*bi));
		bi = (struct btrfs_tree_block_info *)(ei + 1);
		WARN_ON(owner_objectid != btrfs_tree_block_level(leaf, bi));
	}

	refs = btrfs_extent_refs(leaf, ei);
	BUG_ON(refs < refs_to_drop);
	refs -= refs_to_drop;

	if (refs > 0) {
		if (extent_op)
			__run_delayed_extent_op(extent_op, leaf, ei);
		/*
		 * In the case of inline back ref, reference count will
		 * be updated by remove_extent_backref
		 */
		if (iref) {
			BUG_ON(!found_extent);
		} else {
			btrfs_set_extent_refs(leaf, ei, refs);
			btrfs_mark_buffer_dirty(leaf);
		}
		if (found_extent) {
			ret = remove_extent_backref(trans, extent_root, path,
						    iref, refs_to_drop,
						    is_data);
			BUG_ON(ret);
		}
	} else {
		int mark_free = 0;
		struct extent_buffer *must_clean = NULL;

		if (found_extent) {
			BUG_ON(is_data && refs_to_drop !=
			       extent_data_ref_count(root, path, iref));
			if (iref) {
				BUG_ON(path->slots[0] != extent_slot);
			} else {
				BUG_ON(path->slots[0] != extent_slot + 1);
				path->slots[0] = extent_slot;
				num_to_del = 2;
			}
		}

		ret = pin_down_bytes(trans, root, path, bytenr,
				     num_bytes, is_data, 0, &must_clean);
		if (ret > 0)
			mark_free = 1;
		BUG_ON(ret < 0);
		/*
		 * it is going to be very rare for someone to be waiting
		 * on the block we're freeing.  del_items might need to
		 * schedule, so rather than get fancy, just force it
		 * to blocking here
		 */
		if (must_clean)
			btrfs_set_lock_blocking(must_clean);

		ret = btrfs_del_items(trans, extent_root, path, path->slots[0],
				      num_to_del);
		BUG_ON(ret);
		btrfs_release_path(extent_root, path);

		if (must_clean) {
			clean_tree_block(NULL, root, must_clean);
			btrfs_tree_unlock(must_clean);
			free_extent_buffer(must_clean);
		}

		if (is_data) {
			ret = btrfs_del_csums(trans, root, bytenr, num_bytes);
			BUG_ON(ret);
		} else {
			invalidate_mapping_pages(info->btree_inode->i_mapping,
			     bytenr >> PAGE_CACHE_SHIFT,
			     (bytenr + num_bytes - 1) >> PAGE_CACHE_SHIFT);
		}

		ret = update_block_group(trans, root, bytenr, num_bytes, 0,
					 mark_free);
		BUG_ON(ret);
	}
	btrfs_free_path(path);
	return ret;
}

/*
 * when we free an extent, it is possible (and likely) that we free the last
 * delayed ref for that extent as well.  This searches the delayed ref tree for
 * a given extent, and if there are no other delayed refs to be processed, it
 * removes it from the tree.
 */
static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans,
				      struct btrfs_root *root, u64 bytenr)
{
	struct btrfs_delayed_ref_head *head;
	struct btrfs_delayed_ref_root *delayed_refs;
	struct btrfs_delayed_ref_node *ref;
	struct rb_node *node;
	int ret;

	delayed_refs = &trans->transaction->delayed_refs;
	spin_lock(&delayed_refs->lock);
	head = btrfs_find_delayed_ref_head(trans, bytenr);
	if (!head)
		goto out;

	node = rb_prev(&head->node.rb_node);
	if (!node)
		goto out;

	ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node);

	/* there are still entries for this ref, we can't drop it */
	if (ref->bytenr == bytenr)
		goto out;

	if (head->extent_op) {
		if (!head->must_insert_reserved)
			goto out;
		kfree(head->extent_op);
		head->extent_op = NULL;
	}

	/*
	 * waiting for the lock here would deadlock.  If someone else has it
	 * locked they are already in the process of dropping it anyway
	 */
	if (!mutex_trylock(&head->mutex))
		goto out;

	/*
	 * at this point we have a head with no other entries.  Go
	 * ahead and process it.
	 */
	head->node.in_tree = 0;
	rb_erase(&head->node.rb_node, &delayed_refs->root);

	delayed_refs->num_entries--;

	/*
	 * we don't take a ref on the node because we're removing it from the
	 * tree, so we just steal the ref the tree was holding.
	 */
	delayed_refs->num_heads--;
	if (list_empty(&head->cluster))
		delayed_refs->num_heads_ready--;

	list_del_init(&head->cluster);
	spin_unlock(&delayed_refs->lock);

	ret = run_one_delayed_ref(trans, root->fs_info->tree_root,
				  &head->node, head->extent_op,
				  head->must_insert_reserved);
	BUG_ON(ret);
	btrfs_put_delayed_ref(&head->node);
	return 0;
out:
	spin_unlock(&delayed_refs->lock);
	return 0;
}

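/*
 * Entry point for dropping one reference on an extent.  Tree log
 * blocks never reach the extent tree, so they are pinned directly;
 * tree blocks and data extents are queued as delayed refs, and for
 * tree blocks we also give check_ref_cleanup a chance to reap the ref
 * head early.
 */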
int btrfs_free_extent(struct btrfs_trans_handle *trans,
		      struct btrfs_root *root,
		      u64 bytenr, u64 num_bytes, u64 parent,
		      u64 root_objectid, u64 owner, u64 offset)
{
	int ret;

	/*
	 * tree log blocks never actually go into the extent allocation
	 * tree, just update pinning info and exit early.
	 */
	if (root_objectid == BTRFS_TREE_LOG_OBJECTID) {
		WARN_ON(owner >= BTRFS_FIRST_FREE_OBJECTID);
		/* unlocks the pinned mutex */
		btrfs_pin_extent(root, bytenr, num_bytes, 1);
		ret = 0;
	} else if (owner < BTRFS_FIRST_FREE_OBJECTID) {
		ret = btrfs_add_delayed_tree_ref(trans, bytenr, num_bytes,
					parent, root_objectid, (int)owner,
					BTRFS_DROP_DELAYED_REF, NULL);
		BUG_ON(ret);
		ret = check_ref_cleanup(trans, root, bytenr);
		BUG_ON(ret);
	} else {
		ret = btrfs_add_delayed_data_ref(trans, bytenr, num_bytes,
					parent, root_objectid, owner,
					offset, BTRFS_DROP_DELAYED_REF, NULL);
		BUG_ON(ret);
	}
	return ret;
}

int btrfs_free_tree_block(struct btrfs_trans_handle *trans,
			  struct btrfs_root *root,
			  u64 bytenr, u32 blocksize,
			  u64 parent, u64 root_objectid, int level)
{
	u64 used;
	spin_lock(&root->node_lock);
	used = btrfs_root_used(&root->root_item) - blocksize;
	btrfs_set_root_used(&root->root_item, used);
	spin_unlock(&root->node_lock);

	return btrfs_free_extent(trans, root, bytenr, blocksize,
				 parent, root_objectid, level, 0);
}

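/*
 * Round val up to the next stripe boundary.  The mask arithmetic
 * assumes stripesize is a power of two: with a 64K stripe, for
 * example, val = 65537 yields (65537 + 65535) & ~65535 = 131072.
 */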
static u64 stripe_align(struct btrfs_root *root, u64 val)
{
	u64 mask = ((u64)root->stripesize - 1);
	u64 ret = (val + mask) & ~mask;
	return ret;
}

/*
 * when we wait for progress in the block group caching, it's because
 * our allocation attempt failed at least once.  So, we must sleep
 * and let some progress happen before we try again.
 *
 * This function will sleep at least once waiting for new free space to
 * show up, and then it will check the block group free space numbers
 * for our min num_bytes.  Another option is to have it go ahead
 * and look in the rbtree for a free extent of a given size, but this
 * is a good start.
 */
static noinline int
wait_block_group_cache_progress(struct btrfs_block_group_cache *cache,
				u64 num_bytes)
{
	struct btrfs_caching_control *caching_ctl;
	DEFINE_WAIT(wait);

	caching_ctl = get_caching_control(cache);
	if (!caching_ctl)
		return 0;

	wait_event(caching_ctl->wait, block_group_cache_done(cache) ||
		   (cache->free_space >= num_bytes));

	put_caching_control(caching_ctl);
	return 0;
}

static noinline int
wait_block_group_cache_done(struct btrfs_block_group_cache *cache)
{
	struct btrfs_caching_control *caching_ctl;
	DEFINE_WAIT(wait);

	caching_ctl = get_caching_control(cache);
	if (!caching_ctl)
		return 0;

	wait_event(caching_ctl->wait, block_group_cache_done(cache));

	put_caching_control(caching_ctl);
	return 0;
}

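/*
 * Map a block group's raid flags to the index of its per-type list in
 * space_info->block_groups[]: RAID10 first, then RAID1, DUP, RAID0,
 * and finally index 4 for groups with no raid flag set.  The allocator
 * below walks the lists in this order.
 */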
static int get_block_group_index(struct btrfs_block_group_cache *cache)
{
	int index;
	if (cache->flags & BTRFS_BLOCK_GROUP_RAID10)
		index = 0;
	else if (cache->flags & BTRFS_BLOCK_GROUP_RAID1)
		index = 1;
	else if (cache->flags & BTRFS_BLOCK_GROUP_DUP)
		index = 2;
	else if (cache->flags & BTRFS_BLOCK_GROUP_RAID0)
		index = 3;
	else
		index = 4;
	return index;
}

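/*
 * Allocation escalates through these stages: pick an ideal block group
 * to cache, allocate without waiting on caching, wait for caching
 * progress, force a chunk allocation, and as a last resort retry with
 * empty_size and empty_cluster forced to zero.
 */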
enum btrfs_loop_type {
	LOOP_FIND_IDEAL = 0,
	LOOP_CACHING_NOWAIT = 1,
	LOOP_CACHING_WAIT = 2,
	LOOP_ALLOC_CHUNK = 3,
	LOOP_NO_EMPTY_SIZE = 4,
};

/*
 * walks the btree of allocated extents and finds a hole of a given size.
 * The key ins is changed to record the hole:
 * ins->objectid == block start
 * ins->flags = BTRFS_EXTENT_ITEM_KEY
 * ins->offset == number of blocks
 * Any available blocks before search_start are skipped.
 */
static noinline int find_free_extent(struct btrfs_trans_handle *trans,
				     struct btrfs_root *orig_root,
				     u64 num_bytes, u64 empty_size,
				     u64 search_start, u64 search_end,
				     u64 hint_byte, struct btrfs_key *ins,
				     u64 exclude_start, u64 exclude_nr,
				     int data)
{
	int ret = 0;
	struct btrfs_root *root = orig_root->fs_info->extent_root;
	struct btrfs_free_cluster *last_ptr = NULL;
	struct btrfs_block_group_cache *block_group = NULL;
	int empty_cluster = 2 * 1024 * 1024;
	int allowed_chunk_alloc = 0;
	int done_chunk_alloc = 0;
	struct btrfs_space_info *space_info;
	int last_ptr_loop = 0;
	int index = 0;
	int loop = 0;
	bool found_uncached_bg = false;
	bool failed_cluster_refill = false;
	bool failed_alloc = false;
	u64 ideal_cache_percent = 0;
	u64 ideal_cache_offset = 0;

	WARN_ON(num_bytes < root->sectorsize);
	btrfs_set_key_type(ins, BTRFS_EXTENT_ITEM_KEY);
	ins->objectid = 0;
	ins->offset = 0;

	space_info = __find_space_info(root->fs_info, data);
	if (!space_info) {
		printk(KERN_ERR "No space info for %d\n", data);
		return -ENOSPC;
	}

	if (orig_root->ref_cows || empty_size)
		allowed_chunk_alloc = 1;

	if (data & BTRFS_BLOCK_GROUP_METADATA) {
		last_ptr = &root->fs_info->meta_alloc_cluster;
		if (!btrfs_test_opt(root, SSD))
			empty_cluster = 64 * 1024;
	}

	if ((data & BTRFS_BLOCK_GROUP_DATA) && btrfs_test_opt(root, SSD)) {
		last_ptr = &root->fs_info->data_alloc_cluster;
	}

	if (last_ptr) {
		spin_lock(&last_ptr->lock);
		if (last_ptr->block_group)
			hint_byte = last_ptr->window_start;
		spin_unlock(&last_ptr->lock);
	}

	search_start = max(search_start, first_logical_byte(root, 0));
	search_start = max(search_start, hint_byte);

	if (!last_ptr)
		empty_cluster = 0;

	if (search_start == hint_byte) {
ideal_cache:
		block_group = btrfs_lookup_block_group(root->fs_info,
						       search_start);
		/*
		 * we don't want to use the block group if it doesn't match our
		 * allocation bits, or if it's not cached.
		 *
		 * However if we are re-searching with an ideal block group
		 * picked out then we don't care that the block group is cached.
		 */
		if (block_group && block_group_bits(block_group, data) &&
		    (block_group->cached != BTRFS_CACHE_NO ||
		     search_start == ideal_cache_offset)) {
			down_read(&space_info->groups_sem);
			if (list_empty(&block_group->list) ||
			    block_group->ro) {
				/*
				 * someone is removing this block group,
				 * we can't jump into the have_block_group
				 * target because our list pointers are not
				 * valid
				 */
				btrfs_put_block_group(block_group);
				up_read(&space_info->groups_sem);
			} else {
				index = get_block_group_index(block_group);
				goto have_block_group;
			}
		} else if (block_group) {
			btrfs_put_block_group(block_group);
		}
	}
search:
	down_read(&space_info->groups_sem);
	list_for_each_entry(block_group, &space_info->block_groups[index],
			    list) {
		u64 offset;
		int cached;

		btrfs_get_block_group(block_group);
		search_start = block_group->key.objectid;

have_block_group:
		if (unlikely(block_group->cached == BTRFS_CACHE_NO)) {
			u64 free_percent;

			free_percent = btrfs_block_group_used(&block_group->item);
			free_percent *= 100;
			free_percent = div64_u64(free_percent,
						 block_group->key.offset);
			free_percent = 100 - free_percent;
			if (free_percent > ideal_cache_percent &&
			    likely(!block_group->ro)) {
				ideal_cache_offset = block_group->key.objectid;
				ideal_cache_percent = free_percent;
			}

			/*
			 * We only want to start kthread caching if we are at
			 * the point where we will wait for caching to make
			 * progress, or if our ideal search is over and we've
			 * found somebody to start caching.
			 */
			if (loop > LOOP_CACHING_NOWAIT ||
			    (loop > LOOP_FIND_IDEAL &&
			     atomic_read(&space_info->caching_threads) < 2)) {
				ret = cache_block_group(block_group);
				BUG_ON(ret);
			}
			found_uncached_bg = true;

			/*
			 * If loop is set for cached only, try the next block
			 * group.
			 */
			if (loop == LOOP_FIND_IDEAL)
				goto loop;
		}

		cached = block_group_cache_done(block_group);
		if (unlikely(!cached))
			found_uncached_bg = true;

		if (unlikely(block_group->ro))
			goto loop;

		/*
		 * Ok we want to try and use the cluster allocator, so lets look
		 * there, unless we are on LOOP_NO_EMPTY_SIZE, since we will
		 * have tried the cluster allocator plenty of times at this
		 * point and not have found anything, so we are likely way too
		 * fragmented for the clustering stuff to find anything, so lets
		 * just skip it and let the allocator find whatever block it can
		 * find
		 */
		if (last_ptr && loop < LOOP_NO_EMPTY_SIZE) {
			/*
			 * the refill lock keeps out other
			 * people trying to start a new cluster
			 */
			spin_lock(&last_ptr->refill_lock);
			if (last_ptr->block_group &&
			    (last_ptr->block_group->ro ||
			    !block_group_bits(last_ptr->block_group, data))) {
				offset = 0;
				goto refill_cluster;
			}

			offset = btrfs_alloc_from_cluster(block_group, last_ptr,
						 num_bytes, search_start);
			if (offset) {
				/* we have a block, we're done */
				spin_unlock(&last_ptr->refill_lock);
				goto checks;
			}

			spin_lock(&last_ptr->lock);
			/*
			 * whoops, this cluster doesn't actually point to
			 * this block group.  Get a ref on the block
			 * group it does point to and try again
			 */
			if (!last_ptr_loop && last_ptr->block_group &&
			    last_ptr->block_group != block_group) {

				btrfs_put_block_group(block_group);
				block_group = last_ptr->block_group;
				btrfs_get_block_group(block_group);
				spin_unlock(&last_ptr->lock);
				spin_unlock(&last_ptr->refill_lock);

				last_ptr_loop = 1;
				search_start = block_group->key.objectid;
				/*
				 * we know this block group is properly
				 * in the list because
				 * btrfs_remove_block_group, drops the
				 * cluster before it removes the block
				 * group from the list
				 */
				goto have_block_group;
			}
			spin_unlock(&last_ptr->lock);
refill_cluster:
			/*
			 * this cluster didn't work out, free it and
			 * start over
			 */
			btrfs_return_cluster_to_free_space(NULL, last_ptr);

			last_ptr_loop = 0;

			/* allocate a cluster in this block group */
			ret = btrfs_find_space_cluster(trans, root,
					       block_group, last_ptr,
					       offset, num_bytes,
					       empty_cluster + empty_size);
			if (ret == 0) {
				/*
				 * now pull our allocation out of this
				 * cluster
				 */
				offset = btrfs_alloc_from_cluster(block_group,
						  last_ptr, num_bytes,
						  search_start);
				if (offset) {
					/* we found one, proceed */
					spin_unlock(&last_ptr->refill_lock);
					goto checks;
				}
			} else if (!cached && loop > LOOP_CACHING_NOWAIT
				   && !failed_cluster_refill) {
				spin_unlock(&last_ptr->refill_lock);

				failed_cluster_refill = true;
				wait_block_group_cache_progress(block_group,
				       num_bytes + empty_cluster + empty_size);
				goto have_block_group;
			}

			/*
			 * at this point we either didn't find a cluster
			 * or we weren't able to allocate a block from our
			 * cluster.  Free the cluster we've been trying
			 * to use, and go to the next block group
			 */
			btrfs_return_cluster_to_free_space(NULL, last_ptr);
			spin_unlock(&last_ptr->refill_lock);
			goto loop;
		}

		offset = btrfs_find_space_for_alloc(block_group, search_start,
						    num_bytes, empty_size);
		/*
		 * If we didn't find a chunk, and we haven't failed on this
		 * block group before, and this block group is in the middle of
		 * caching and we are ok with waiting, then go ahead and wait
		 * for progress to be made, and set failed_alloc to true.
		 *
		 * If failed_alloc is true then we've already waited on this
		 * block group once and should move on to the next block group.
		 */
		if (!offset && !failed_alloc && !cached &&
		    loop > LOOP_CACHING_NOWAIT) {
			wait_block_group_cache_progress(block_group,
						num_bytes + empty_size);
			failed_alloc = true;
			goto have_block_group;
		} else if (!offset) {
			goto loop;
		}
checks:
		search_start = stripe_align(root, offset);
		/* move on to the next group */
		if (search_start + num_bytes >= search_end) {
			btrfs_add_free_space(block_group, offset, num_bytes);
			goto loop;
		}

		/* move on to the next group */
		if (search_start + num_bytes >
		    block_group->key.objectid + block_group->key.offset) {
			btrfs_add_free_space(block_group, offset, num_bytes);
			goto loop;
		}

		if (exclude_nr > 0 &&
		    (search_start + num_bytes > exclude_start &&
		     search_start < exclude_start + exclude_nr)) {
			search_start = exclude_start + exclude_nr;

			btrfs_add_free_space(block_group, offset, num_bytes);
			/*
			 * if search_start is still in this block group
			 * then we just re-search this block group
			 */
			if (search_start >= block_group->key.objectid &&
			    search_start < (block_group->key.objectid +
					    block_group->key.offset))
				goto have_block_group;
			goto loop;
		}

		ins->objectid = search_start;
		ins->offset = num_bytes;

		if (offset < search_start)
			btrfs_add_free_space(block_group, offset,
					     search_start - offset);
		BUG_ON(offset > search_start);

		update_reserved_extents(block_group, num_bytes, 1);

		/* we are all good, lets return */
		break;
loop:
		failed_cluster_refill = false;
		failed_alloc = false;
		BUG_ON(index != get_block_group_index(block_group));
		btrfs_put_block_group(block_group);
	}
	up_read(&space_info->groups_sem);

	if (!ins->objectid && ++index < BTRFS_NR_RAID_TYPES)
		goto search;

	/* LOOP_FIND_IDEAL, only search caching/cached bg's, and don't wait
	 *			for them to make caching progress.  Also
	 *			determine the best possible bg to cache
	 * LOOP_CACHING_NOWAIT, search partially cached block groups, kicking
	 *			caching kthreads as we move along
	 * LOOP_CACHING_WAIT, search everything, and wait if our bg is caching
	 * LOOP_ALLOC_CHUNK, force a chunk allocation and try again
	 * LOOP_NO_EMPTY_SIZE, set empty_size and empty_cluster to 0 and try
	 *			again
	 */
	if (!ins->objectid && loop < LOOP_NO_EMPTY_SIZE &&
	    (found_uncached_bg || empty_size || empty_cluster ||
	     allowed_chunk_alloc)) {
		index = 0;
		if (loop == LOOP_FIND_IDEAL && found_uncached_bg) {
			found_uncached_bg = false;
			loop++;
			if (!ideal_cache_percent &&
			    atomic_read(&space_info->caching_threads))
				goto search;

			/*
			 * 1 of the following 2 things have happened so far
			 *
			 * 1) We found an ideal block group for caching that
			 * is mostly full and will cache quickly, so we might
			 * as well wait for it.
			 *
			 * 2) We searched for cached only and we didn't find
			 * anything, and we didn't start any caching kthreads
			 * either, so chances are we will loop through and
			 * start a couple caching kthreads, and then come back
			 * around and just wait for them.  This will be slower
			 * because we will have 2 caching kthreads reading at
			 * the same time when we could have just started one
			 * and waited for it to get far enough to give us an
			 * allocation, so go ahead and go to the wait caching
			 * loop.
			 */
			loop = LOOP_CACHING_WAIT;
			search_start = ideal_cache_offset;
			ideal_cache_percent = 0;
			goto ideal_cache;
		} else if (loop == LOOP_FIND_IDEAL) {
			/*
			 * Didn't find an uncached bg, wait on anything we find
			 * next.
			 */
			loop = LOOP_CACHING_WAIT;
			goto search;
		}

		if (loop < LOOP_CACHING_WAIT) {
			loop++;
			goto search;
		}

		if (loop == LOOP_ALLOC_CHUNK) {
			empty_size = 0;
			empty_cluster = 0;
		}

		if (allowed_chunk_alloc) {
			ret = do_chunk_alloc(trans, root, num_bytes +
					     2 * 1024 * 1024, data, 1);
			allowed_chunk_alloc = 0;
			done_chunk_alloc = 1;
		} else if (!done_chunk_alloc) {
			space_info->force_alloc = 1;
		}

		if (loop < LOOP_NO_EMPTY_SIZE) {
			loop++;
			goto search;
		}
		ret = -ENOSPC;
	} else if (!ins->objectid) {
		ret = -ENOSPC;
	}

	/* we found what we needed */
	if (ins->objectid) {
		if (!(data & BTRFS_BLOCK_GROUP_DATA))
			trans->block_group = block_group->key.objectid;

		btrfs_put_block_group(block_group);
		ret = 0;
	}

	return ret;
}

static void dump_space_info(struct btrfs_space_info *info, u64 bytes,
			    int dump_block_groups)
{
	struct btrfs_block_group_cache *cache;
	int index = 0;

	spin_lock(&info->lock);
	printk(KERN_INFO "space_info has %llu free, is %sfull\n",
	       (unsigned long long)(info->total_bytes - info->bytes_used -
				    info->bytes_pinned - info->bytes_reserved -
				    info->bytes_super),
	       (info->full) ? "" : "not ");
	printk(KERN_INFO "space_info total=%llu, pinned=%llu, delalloc=%llu,"
	       " may_use=%llu, used=%llu, root=%llu, super=%llu, reserved=%llu"
	       "\n",
	       (unsigned long long)info->total_bytes,
	       (unsigned long long)info->bytes_pinned,
	       (unsigned long long)info->bytes_delalloc,
	       (unsigned long long)info->bytes_may_use,
	       (unsigned long long)info->bytes_used,
	       (unsigned long long)info->bytes_root,
	       (unsigned long long)info->bytes_super,
	       (unsigned long long)info->bytes_reserved);
	spin_unlock(&info->lock);

	if (!dump_block_groups)
		return;

	down_read(&info->groups_sem);
again:
	list_for_each_entry(cache, &info->block_groups[index], list) {
		spin_lock(&cache->lock);
		printk(KERN_INFO "block group %llu has %llu bytes, %llu used "
		       "%llu pinned %llu reserved\n",
		       (unsigned long long)cache->key.objectid,
		       (unsigned long long)cache->key.offset,
		       (unsigned long long)btrfs_block_group_used(&cache->item),
		       (unsigned long long)cache->pinned,
		       (unsigned long long)cache->reserved);
		btrfs_dump_free_space(cache, bytes);
		spin_unlock(&cache->lock);
	}
	if (++index < BTRFS_NR_RAID_TYPES)
		goto again;
	up_read(&info->groups_sem);
}

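/*
 * Reserve an extent of at least min_alloc_size bytes.  On ENOSPC the
 * request is halved (rounded down to a sector boundary, never below
 * min_alloc_size) and retried after forcing a chunk allocation, so
 * callers that can cope with a shorter extent get one instead of an
 * error.
 */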
int btrfs_reserve_extent(struct btrfs_trans_handle *trans,
			 struct btrfs_root *root,
			 u64 num_bytes, u64 min_alloc_size,
			 u64 empty_size, u64 hint_byte,
			 u64 search_end, struct btrfs_key *ins,
			 u64 data)
{
	int ret;
	u64 search_start = 0;

	data = btrfs_get_alloc_profile(root, data);
again:
	/*
	 * the only place that sets empty_size is btrfs_realloc_node, which
	 * is not called recursively on allocations
	 */
	if (empty_size || root->ref_cows)
		ret = do_chunk_alloc(trans, root->fs_info->extent_root,
				     num_bytes + 2 * 1024 * 1024, data, 0);

	WARN_ON(num_bytes < root->sectorsize);
	ret = find_free_extent(trans, root, num_bytes, empty_size,
			       search_start, search_end, hint_byte, ins,
			       trans->alloc_exclude_start,
			       trans->alloc_exclude_nr, data);

	if (ret == -ENOSPC && num_bytes > min_alloc_size) {
		num_bytes = num_bytes >> 1;
		num_bytes = num_bytes & ~(root->sectorsize - 1);
		num_bytes = max(num_bytes, min_alloc_size);
		do_chunk_alloc(trans, root->fs_info->extent_root,
			       num_bytes, data, 1);
		goto again;
	}
	if (ret == -ENOSPC) {
		struct btrfs_space_info *sinfo;

		sinfo = __find_space_info(root->fs_info, data);
		printk(KERN_ERR "btrfs allocation failed flags %llu, "
		       "wanted %llu\n", (unsigned long long)data,
		       (unsigned long long)num_bytes);
		dump_space_info(sinfo, num_bytes, 1);
	}

	return ret;
}

int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len)
{
	struct btrfs_block_group_cache *cache;
	int ret = 0;

	cache = btrfs_lookup_block_group(root->fs_info, start);
	if (!cache) {
		printk(KERN_ERR "Unable to find block group for %llu\n",
		       (unsigned long long)start);
		return -ENOSPC;
	}

	ret = btrfs_discard_extent(root, start, len);

	btrfs_add_free_space(cache, start, len);
	update_reserved_extents(cache, len, 0);
	btrfs_put_block_group(cache);

	return ret;
}

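/*
 * Insert the extent item for a reserved data extent.  The back
 * reference is stored inline right after the item: a shared_data_ref
 * keyed by the parent block when parent > 0, otherwise an
 * extent_data_ref keyed by root, owner and offset.
 */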
static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
				      struct btrfs_root *root,
				      u64 parent, u64 root_objectid,
				      u64 flags, u64 owner, u64 offset,
				      struct btrfs_key *ins, int ref_mod)
{
	int ret;
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_extent_item *extent_item;
	struct btrfs_extent_inline_ref *iref;
	struct btrfs_path *path;
	struct extent_buffer *leaf;
	int type;
	u32 size;

	if (parent > 0)
		type = BTRFS_SHARED_DATA_REF_KEY;
	else
		type = BTRFS_EXTENT_DATA_REF_KEY;

	size = sizeof(*extent_item) + btrfs_extent_inline_ref_size(type);

	path = btrfs_alloc_path();
	BUG_ON(!path);

	path->leave_spinning = 1;
	ret = btrfs_insert_empty_item(trans, fs_info->extent_root, path,
				      ins, size);
	BUG_ON(ret);

	leaf = path->nodes[0];
	extent_item = btrfs_item_ptr(leaf, path->slots[0],
				     struct btrfs_extent_item);
	btrfs_set_extent_refs(leaf, extent_item, ref_mod);
	btrfs_set_extent_generation(leaf, extent_item, trans->transid);
	btrfs_set_extent_flags(leaf, extent_item,
			       flags | BTRFS_EXTENT_FLAG_DATA);

	iref = (struct btrfs_extent_inline_ref *)(extent_item + 1);
	btrfs_set_extent_inline_ref_type(leaf, iref, type);
	if (parent > 0) {
		struct btrfs_shared_data_ref *ref;
		ref = (struct btrfs_shared_data_ref *)(iref + 1);
		btrfs_set_extent_inline_ref_offset(leaf, iref, parent);
		btrfs_set_shared_data_ref_count(leaf, ref, ref_mod);
	} else {
		struct btrfs_extent_data_ref *ref;
		ref = (struct btrfs_extent_data_ref *)(&iref->offset);
		btrfs_set_extent_data_ref_root(leaf, ref, root_objectid);
		btrfs_set_extent_data_ref_objectid(leaf, ref, owner);
		btrfs_set_extent_data_ref_offset(leaf, ref, offset);
		btrfs_set_extent_data_ref_count(leaf, ref, ref_mod);
	}

	btrfs_mark_buffer_dirty(path->nodes[0]);
	btrfs_free_path(path);

	ret = update_block_group(trans, root, ins->objectid, ins->offset,
				 1, 0);
	if (ret) {
		printk(KERN_ERR "btrfs update block group failed for %llu "
		       "%llu\n", (unsigned long long)ins->objectid,
		       (unsigned long long)ins->offset);
		BUG();
	}
	return ret;
}

static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
				     struct btrfs_root *root,
				     u64 parent, u64 root_objectid,
				     u64 flags, struct btrfs_disk_key *key,
				     int level, struct btrfs_key *ins)
{
	int ret;
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_extent_item *extent_item;
	struct btrfs_tree_block_info *block_info;
	struct btrfs_extent_inline_ref *iref;
	struct btrfs_path *path;
	struct extent_buffer *leaf;
	u32 size = sizeof(*extent_item) + sizeof(*block_info) + sizeof(*iref);

	path = btrfs_alloc_path();
	BUG_ON(!path);

	path->leave_spinning = 1;
	ret = btrfs_insert_empty_item(trans, fs_info->extent_root, path,
				      ins, size);
	BUG_ON(ret);

	leaf = path->nodes[0];
	extent_item = btrfs_item_ptr(leaf, path->slots[0],
				     struct btrfs_extent_item);
	btrfs_set_extent_refs(leaf, extent_item, 1);
	btrfs_set_extent_generation(leaf, extent_item, trans->transid);
	btrfs_set_extent_flags(leaf, extent_item,
			       flags | BTRFS_EXTENT_FLAG_TREE_BLOCK);
	block_info = (struct btrfs_tree_block_info *)(extent_item + 1);

	btrfs_set_tree_block_key(leaf, block_info, key);
	btrfs_set_tree_block_level(leaf, block_info, level);

	iref = (struct btrfs_extent_inline_ref *)(block_info + 1);
	if (parent > 0) {
		BUG_ON(!(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF));
		btrfs_set_extent_inline_ref_type(leaf, iref,
						 BTRFS_SHARED_BLOCK_REF_KEY);
		btrfs_set_extent_inline_ref_offset(leaf, iref, parent);
	} else {
		btrfs_set_extent_inline_ref_type(leaf, iref,
						 BTRFS_TREE_BLOCK_REF_KEY);
		btrfs_set_extent_inline_ref_offset(leaf, iref, root_objectid);
	}

	btrfs_mark_buffer_dirty(leaf);
	btrfs_free_path(path);

	ret = update_block_group(trans, root, ins->objectid, ins->offset,
				 1, 0);
	if (ret) {
		printk(KERN_ERR "btrfs update block group failed for %llu "
		       "%llu\n", (unsigned long long)ins->objectid,
		       (unsigned long long)ins->offset);
		BUG();
	}
	return ret;
}

int btrfs_alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
				     struct btrfs_root *root,
				     u64 root_objectid, u64 owner,
				     u64 offset, struct btrfs_key *ins)
{
	int ret;

	BUG_ON(root_objectid == BTRFS_TREE_LOG_OBJECTID);

	ret = btrfs_add_delayed_data_ref(trans, ins->objectid, ins->offset,
					 0, root_objectid, owner, offset,
					 BTRFS_ADD_DELAYED_EXTENT, NULL);
	return ret;
}

/*
 * this is used by the tree logging recovery code.  It records that
 * an extent has been allocated and makes sure to clear the free
 * space cache bits as well
 */
int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans,
				   struct btrfs_root *root,
				   u64 root_objectid, u64 owner, u64 offset,
				   struct btrfs_key *ins)
{
	int ret;
	struct btrfs_block_group_cache *block_group;
	struct btrfs_caching_control *caching_ctl;
	u64 start = ins->objectid;
	u64 num_bytes = ins->offset;

	block_group = btrfs_lookup_block_group(root->fs_info, ins->objectid);
	cache_block_group(block_group);
	caching_ctl = get_caching_control(block_group);

	if (!caching_ctl) {
		BUG_ON(!block_group_cache_done(block_group));
		ret = btrfs_remove_free_space(block_group, start, num_bytes);
		BUG_ON(ret);
	} else {
		mutex_lock(&caching_ctl->mutex);

		if (start >= caching_ctl->progress) {
			ret = add_excluded_extent(root, start, num_bytes);
			BUG_ON(ret);
		} else if (start + num_bytes <= caching_ctl->progress) {
			ret = btrfs_remove_free_space(block_group,
						      start, num_bytes);
			BUG_ON(ret);
		} else {
			num_bytes = caching_ctl->progress - start;
			ret = btrfs_remove_free_space(block_group,
						      start, num_bytes);
			BUG_ON(ret);

			start = caching_ctl->progress;
			num_bytes = ins->objectid + ins->offset -
				    caching_ctl->progress;
			ret = add_excluded_extent(root, start, num_bytes);
			BUG_ON(ret);
		}

		mutex_unlock(&caching_ctl->mutex);
		put_caching_control(caching_ctl);
	}

	update_reserved_extents(block_group, ins->offset, 1);
	btrfs_put_block_group(block_group);
	ret = alloc_reserved_file_extent(trans, root, 0, root_objectid,
					 0, owner, offset, ins, 1);
	return ret;
}

/*
 * finds a free extent and does all the dirty work required for allocation
 * returns the key for the extent through ins, and a tree buffer for
 * the first block of the extent through buf.
 *
 * returns 0 if everything worked, non-zero otherwise.
 */
static int alloc_tree_block(struct btrfs_trans_handle *trans,
			    struct btrfs_root *root,
			    u64 num_bytes, u64 parent, u64 root_objectid,
			    struct btrfs_disk_key *key, int level,
			    u64 empty_size, u64 hint_byte, u64 search_end,
			    struct btrfs_key *ins)
{
	int ret;
	u64 flags = 0;

	ret = btrfs_reserve_extent(trans, root, num_bytes, num_bytes,
				   empty_size, hint_byte, search_end,
				   ins, 0);
	if (ret)
		return ret;

	if (root_objectid == BTRFS_TREE_RELOC_OBJECTID) {
		if (parent == 0)
			parent = ins->objectid;
		flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF;
	} else
		BUG_ON(parent > 0);

	if (root_objectid != BTRFS_TREE_LOG_OBJECTID) {
		struct btrfs_delayed_extent_op *extent_op;
		extent_op = kmalloc(sizeof(*extent_op), GFP_NOFS);
		BUG_ON(!extent_op);
		if (key)
			memcpy(&extent_op->key, key, sizeof(extent_op->key));
		else
			memset(&extent_op->key, 0, sizeof(extent_op->key));
		extent_op->flags_to_set = flags;
		extent_op->update_key = 1;
		extent_op->update_flags = 1;
		extent_op->is_data = 0;

		ret = btrfs_add_delayed_tree_ref(trans, ins->objectid,
					ins->offset, parent, root_objectid,
					level, BTRFS_ADD_DELAYED_EXTENT,
					extent_op);
		BUG_ON(ret);
	}

	if (root_objectid == root->root_key.objectid) {
		u64 used;
		spin_lock(&root->node_lock);
		used = btrfs_root_used(&root->root_item) + num_bytes;
		btrfs_set_root_used(&root->root_item, used);
		spin_unlock(&root->node_lock);
	}
	return ret;
}

struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans,
					    struct btrfs_root *root,
					    u64 bytenr, u32 blocksize,
					    int level)
{
	struct extent_buffer *buf;

	buf = btrfs_find_create_tree_block(root, bytenr, blocksize);
	if (!buf)
		return ERR_PTR(-ENOMEM);
	btrfs_set_header_generation(buf, trans->transid);
	btrfs_set_buffer_lockdep_class(buf, level);
	btrfs_tree_lock(buf);
	clean_tree_block(trans, root, buf);

	btrfs_set_lock_blocking(buf);
	btrfs_set_buffer_uptodate(buf);

	if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID) {
		/*
		 * we allow two log transactions at a time, use different
		 * EXTENT bit to differentiate dirty pages.
		 */
		if (root->log_transid % 2 == 0)
			set_extent_dirty(&root->dirty_log_pages, buf->start,
					 buf->start + buf->len - 1, GFP_NOFS);
		else
			set_extent_new(&root->dirty_log_pages, buf->start,
				       buf->start + buf->len - 1, GFP_NOFS);
	} else {
		set_extent_dirty(&trans->transaction->dirty_pages, buf->start,
				 buf->start + buf->len - 1, GFP_NOFS);
	}
	trans->blocks_used++;
	/* this returns a buffer locked for blocking */
	return buf;
}

/*
 * helper function to allocate a block for a given tree
 * returns the tree buffer or NULL.
 */
struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
					struct btrfs_root *root, u32 blocksize,
					u64 parent, u64 root_objectid,
					struct btrfs_disk_key *key, int level,
					u64 hint, u64 empty_size)
{
	struct btrfs_key ins;
	int ret;
	struct extent_buffer *buf;

	ret = alloc_tree_block(trans, root, blocksize, parent, root_objectid,
			       key, level, empty_size, hint, (u64)-1, &ins);
	if (ret) {
		BUG_ON(ret > 0);
		return ERR_PTR(ret);
	}

	buf = btrfs_init_new_buffer(trans, root, ins.objectid,
				    blocksize, level);
	return buf;
}

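/*
 * State for the snapshot-deletion walk.  refs[] and flags[] cache the
 * reference count and flags seen at each level of the path.  The walk
 * normally runs in the DROP_REFERENCE stage; when it meets a shared
 * subtree whose back refs still need rewriting it switches to
 * UPDATE_BACKREF for the subtree rooted at wc->shared_level, then
 * returns to dropping references.
 */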
struct walk_control {
	u64 refs[BTRFS_MAX_LEVEL];
	u64 flags[BTRFS_MAX_LEVEL];
	struct btrfs_key update_progress;
	int stage;
	int level;
	int shared_level;
	int update_ref;
	int keep_locks;
	int reada_slot;
	int reada_count;
};

#define DROP_REFERENCE	1
#define UPDATE_BACKREF	2

static noinline void reada_walk_down(struct btrfs_trans_handle *trans,
				     struct btrfs_root *root,
				     struct walk_control *wc,
				     struct btrfs_path *path)
{
	u64 bytenr;
	u64 generation;
	u64 refs;
	u64 flags;
	u64 last = 0;
	u32 nritems;
	u32 blocksize;
	struct btrfs_key key;
	struct extent_buffer *eb;
	int ret;
	int slot;
	int nread = 0;

	if (path->slots[wc->level] < wc->reada_slot) {
		wc->reada_count = wc->reada_count * 2 / 3;
		wc->reada_count = max(wc->reada_count, 2);
	} else {
		wc->reada_count = wc->reada_count * 3 / 2;
		wc->reada_count = min_t(int, wc->reada_count,
					BTRFS_NODEPTRS_PER_BLOCK(root));
	}

	eb = path->nodes[wc->level];
	nritems = btrfs_header_nritems(eb);
	blocksize = btrfs_level_size(root, wc->level - 1);

	for (slot = path->slots[wc->level]; slot < nritems; slot++) {
		if (nread >= wc->reada_count)
			break;

		cond_resched();
		bytenr = btrfs_node_blockptr(eb, slot);
		generation = btrfs_node_ptr_generation(eb, slot);

		if (slot == path->slots[wc->level])
			goto reada;

		if (wc->stage == UPDATE_BACKREF &&
		    generation <= root->root_key.offset)
			continue;

		/* We don't lock the tree block, it's OK to be racy here */
		ret = btrfs_lookup_extent_info(trans, root, bytenr, blocksize,
					       &refs, &flags);
		BUG_ON(ret);
		BUG_ON(refs == 0);

		if (wc->stage == DROP_REFERENCE) {
			if (refs == 1)
				goto reada;

			if (wc->level == 1 &&
			    (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF))
				continue;
			if (!wc->update_ref ||
			    generation <= root->root_key.offset)
				continue;
			btrfs_node_key_to_cpu(eb, &key, slot);
			ret = btrfs_comp_cpu_keys(&key,
						  &wc->update_progress);
			if (ret < 0)
				continue;
		} else {
			if (wc->level == 1 &&
			    (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF))
				continue;
		}
reada:
		ret = readahead_tree_block(root, bytenr, blocksize,
					   generation);
		if (ret)
			break;
		last = bytenr + blocksize;
		nread++;
	}
	wc->reada_slot = slot;
}
2c47e605 5160
f82d02d9 5161/*
2c47e605
YZ
5162 * hepler to process tree block while walking down the tree.
5163 *
2c47e605
YZ
5164 * when wc->stage == UPDATE_BACKREF, this function updates
5165 * back refs for pointers in the block.
5166 *
5167 * NOTE: return value 1 means we should stop walking down.
f82d02d9 5168 */
2c47e605 5169static noinline int walk_down_proc(struct btrfs_trans_handle *trans,
5d4f98a2 5170 struct btrfs_root *root,
2c47e605 5171 struct btrfs_path *path,
94fcca9f 5172 struct walk_control *wc, int lookup_info)
f82d02d9 5173{
2c47e605
YZ
5174 int level = wc->level;
5175 struct extent_buffer *eb = path->nodes[level];
2c47e605 5176 u64 flag = BTRFS_BLOCK_FLAG_FULL_BACKREF;
f82d02d9
YZ
5177 int ret;
5178
2c47e605
YZ
5179 if (wc->stage == UPDATE_BACKREF &&
5180 btrfs_header_owner(eb) != root->root_key.objectid)
5181 return 1;
f82d02d9 5182
2c47e605
YZ
5183 /*
5184 * when reference count of tree block is 1, it won't increase
5185 * again. once full backref flag is set, we never clear it.
5186 */
94fcca9f
YZ
5187 if (lookup_info &&
5188 ((wc->stage == DROP_REFERENCE && wc->refs[level] != 1) ||
5189 (wc->stage == UPDATE_BACKREF && !(wc->flags[level] & flag)))) {
2c47e605
YZ
5190 BUG_ON(!path->locks[level]);
5191 ret = btrfs_lookup_extent_info(trans, root,
5192 eb->start, eb->len,
5193 &wc->refs[level],
5194 &wc->flags[level]);
5195 BUG_ON(ret);
5196 BUG_ON(wc->refs[level] == 0);
5197 }
5d4f98a2 5198
2c47e605
YZ
5199 if (wc->stage == DROP_REFERENCE) {
5200 if (wc->refs[level] > 1)
5201 return 1;
f82d02d9 5202
2c47e605
YZ
5203 if (path->locks[level] && !wc->keep_locks) {
5204 btrfs_tree_unlock(eb);
5205 path->locks[level] = 0;
5206 }
5207 return 0;
5208 }
f82d02d9 5209
2c47e605
YZ
5210 /* wc->stage == UPDATE_BACKREF */
5211 if (!(wc->flags[level] & flag)) {
5212 BUG_ON(!path->locks[level]);
5213 ret = btrfs_inc_ref(trans, root, eb, 1);
f82d02d9 5214 BUG_ON(ret);
2c47e605
YZ
5215 ret = btrfs_dec_ref(trans, root, eb, 0);
5216 BUG_ON(ret);
5217 ret = btrfs_set_disk_extent_flags(trans, root, eb->start,
5218 eb->len, flag, 0);
5219 BUG_ON(ret);
5220 wc->flags[level] |= flag;
5221 }
5222
5223 /*
5224 * the block is shared by multiple trees, so it's not good to
5225 * keep the tree lock
5226 */
5227 if (path->locks[level] && level > 0) {
5228 btrfs_tree_unlock(eb);
5229 path->locks[level] = 0;
5230 }
5231 return 0;
5232}
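#if 0
/*
 * The walk is a two-stage state machine: DROP_REFERENCE frees blocks
 * whose reference count reaches zero; hitting a shared block with
 * wc->update_ref set flips the stage to UPDATE_BACKREF for that
 * subtree, and leaving the subtree flips it back. A compact sketch of
 * the transitions (the sketch_* names are illustrative only):
 */
enum sketch_stage { SKETCH_DROP_REFERENCE, SKETCH_UPDATE_BACKREF };

static enum sketch_stage sketch_next_stage(enum sketch_stage stage,
					   int found_shared, int update_ref,
					   int left_shared_subtree)
{
	if (stage == SKETCH_DROP_REFERENCE && found_shared && update_ref)
		return SKETCH_UPDATE_BACKREF;
	if (stage == SKETCH_UPDATE_BACKREF && left_shared_subtree)
		return SKETCH_DROP_REFERENCE;
	return stage;
}
#endif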
5233
1c4850e2
YZ
5234/*
5235 * helper to process a tree block pointer.
5236 *
5237 * when wc->stage == DROP_REFERENCE, this function checks the
5238 * reference count of the block pointed to. if the block
5239 * is shared and we need to update back refs for the subtree
5240 * rooted at the block, this function changes wc->stage to
5241 * UPDATE_BACKREF. if the block is shared and there is no
5242 * need to update back refs, this function drops the reference
5243 * to the block.
5244 *
5245 * NOTE: return value 1 means we should stop walking down.
5246 */
5247static noinline int do_walk_down(struct btrfs_trans_handle *trans,
5248 struct btrfs_root *root,
5249 struct btrfs_path *path,
94fcca9f 5250 struct walk_control *wc, int *lookup_info)
1c4850e2
YZ
5251{
5252 u64 bytenr;
5253 u64 generation;
5254 u64 parent;
5255 u32 blocksize;
5256 struct btrfs_key key;
5257 struct extent_buffer *next;
5258 int level = wc->level;
5259 int reada = 0;
5260 int ret = 0;
5261
5262 generation = btrfs_node_ptr_generation(path->nodes[level],
5263 path->slots[level]);
5264 /*
5265 * if the lower level block was created before the snapshot
5266 * was created, we know there is no need to update back refs
5267 * for the subtree
5268 */
5269 if (wc->stage == UPDATE_BACKREF &&
94fcca9f
YZ
5270 generation <= root->root_key.offset) {
5271 *lookup_info = 1;
1c4850e2 5272 return 1;
94fcca9f 5273 }
1c4850e2
YZ
5274
5275 bytenr = btrfs_node_blockptr(path->nodes[level], path->slots[level]);
5276 blocksize = btrfs_level_size(root, level - 1);
5277
5278 next = btrfs_find_tree_block(root, bytenr, blocksize);
5279 if (!next) {
5280 next = btrfs_find_create_tree_block(root, bytenr, blocksize);
90d2c51d
MX
5281 if (!next)
5282 return -ENOMEM;
1c4850e2
YZ
5283 reada = 1;
5284 }
5285 btrfs_tree_lock(next);
5286 btrfs_set_lock_blocking(next);
5287
94fcca9f
YZ
5288 ret = btrfs_lookup_extent_info(trans, root, bytenr, blocksize,
5289 &wc->refs[level - 1],
5290 &wc->flags[level - 1]);
5291 BUG_ON(ret);
5292 BUG_ON(wc->refs[level - 1] == 0);
5293 *lookup_info = 0;
1c4850e2 5294
94fcca9f 5295 if (wc->stage == DROP_REFERENCE) {
1c4850e2 5296 if (wc->refs[level - 1] > 1) {
94fcca9f
YZ
5297 if (level == 1 &&
5298 (wc->flags[0] & BTRFS_BLOCK_FLAG_FULL_BACKREF))
5299 goto skip;
5300
1c4850e2
YZ
5301 if (!wc->update_ref ||
5302 generation <= root->root_key.offset)
5303 goto skip;
5304
5305 btrfs_node_key_to_cpu(path->nodes[level], &key,
5306 path->slots[level]);
5307 ret = btrfs_comp_cpu_keys(&key, &wc->update_progress);
5308 if (ret < 0)
5309 goto skip;
5310
5311 wc->stage = UPDATE_BACKREF;
5312 wc->shared_level = level - 1;
5313 }
94fcca9f
YZ
5314 } else {
5315 if (level == 1 &&
5316 (wc->flags[0] & BTRFS_BLOCK_FLAG_FULL_BACKREF))
5317 goto skip;
1c4850e2
YZ
5318 }
5319
5320 if (!btrfs_buffer_uptodate(next, generation)) {
5321 btrfs_tree_unlock(next);
5322 free_extent_buffer(next);
5323 next = NULL;
94fcca9f 5324 *lookup_info = 1;
1c4850e2
YZ
5325 }
5326
5327 if (!next) {
5328 if (reada && level == 1)
5329 reada_walk_down(trans, root, wc, path);
5330 next = read_tree_block(root, bytenr, blocksize, generation);
5331 btrfs_tree_lock(next);
5332 btrfs_set_lock_blocking(next);
5333 }
5334
5335 level--;
5336 BUG_ON(level != btrfs_header_level(next));
5337 path->nodes[level] = next;
5338 path->slots[level] = 0;
5339 path->locks[level] = 1;
5340 wc->level = level;
5341 if (wc->level == 1)
5342 wc->reada_slot = 0;
5343 return 0;
5344skip:
5345 wc->refs[level - 1] = 0;
5346 wc->flags[level - 1] = 0;
94fcca9f
YZ
5347 if (wc->stage == DROP_REFERENCE) {
5348 if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF) {
5349 parent = path->nodes[level]->start;
5350 } else {
5351 BUG_ON(root->root_key.objectid !=
5352 btrfs_header_owner(path->nodes[level]));
5353 parent = 0;
5354 }
1c4850e2 5355
94fcca9f
YZ
5356 ret = btrfs_free_extent(trans, root, bytenr, blocksize, parent,
5357 root->root_key.objectid, level - 1, 0);
5358 BUG_ON(ret);
1c4850e2 5359 }
1c4850e2
YZ
5360 btrfs_tree_unlock(next);
5361 free_extent_buffer(next);
94fcca9f 5362 *lookup_info = 1;
1c4850e2
YZ
5363 return 1;
5364}
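#if 0
/*
 * The 'parent' argument passed to btrfs_free_extent() in the skip path
 * above follows one rule: an extent carrying FULL_BACKREF is keyed by
 * the parent block's bytenr, a normal backref by root objectid alone
 * (parent == 0). A sketch of that rule:
 */
static u64 sketch_parent_hint(u64 parent_start, u64 parent_flags)
{
	if (parent_flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)
		return parent_start;
	return 0;
}
#endif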
5365
2c47e605
YZ
5366/*
5367 * helper to process a tree block while walking up the tree.
5368 *
5369 * when wc->stage == DROP_REFERENCE, this function drops
5370 * reference count on the block.
5371 *
5372 * when wc->stage == UPDATE_BACKREF, this function changes
5373 * wc->stage back to DROP_REFERENCE if we changed wc->stage
5374 * to UPDATE_BACKREF previously while processing the block.
5375 *
5376 * NOTE: return value 1 means we should stop walking up.
5377 */
5378static noinline int walk_up_proc(struct btrfs_trans_handle *trans,
5379 struct btrfs_root *root,
5380 struct btrfs_path *path,
5381 struct walk_control *wc)
5382{
5383 int ret = 0;
5384 int level = wc->level;
5385 struct extent_buffer *eb = path->nodes[level];
5386 u64 parent = 0;
5387
5388 if (wc->stage == UPDATE_BACKREF) {
5389 BUG_ON(wc->shared_level < level);
5390 if (level < wc->shared_level)
5391 goto out;
5392
2c47e605
YZ
5393 ret = find_next_key(path, level + 1, &wc->update_progress);
5394 if (ret > 0)
5395 wc->update_ref = 0;
5396
5397 wc->stage = DROP_REFERENCE;
5398 wc->shared_level = -1;
5399 path->slots[level] = 0;
5400
5401 /*
5402 * check reference count again if the block isn't locked.
5403 * we should start walking down the tree again if reference
5404 * count is one.
5405 */
5406 if (!path->locks[level]) {
5407 BUG_ON(level == 0);
5408 btrfs_tree_lock(eb);
5409 btrfs_set_lock_blocking(eb);
5410 path->locks[level] = 1;
5411
5412 ret = btrfs_lookup_extent_info(trans, root,
5413 eb->start, eb->len,
5414 &wc->refs[level],
5415 &wc->flags[level]);
f82d02d9 5416 BUG_ON(ret);
2c47e605
YZ
5417 BUG_ON(wc->refs[level] == 0);
5418 if (wc->refs[level] == 1) {
5419 btrfs_tree_unlock(eb);
5420 path->locks[level] = 0;
5421 return 1;
5422 }
f82d02d9 5423 }
2c47e605 5424 }
f82d02d9 5425
2c47e605
YZ
5426 /* wc->stage == DROP_REFERENCE */
5427 BUG_ON(wc->refs[level] > 1 && !path->locks[level]);
5d4f98a2 5428
2c47e605
YZ
5429 if (wc->refs[level] == 1) {
5430 if (level == 0) {
5431 if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF)
5432 ret = btrfs_dec_ref(trans, root, eb, 1);
5433 else
5434 ret = btrfs_dec_ref(trans, root, eb, 0);
5435 BUG_ON(ret);
5436 }
5437 /* make the block-locked assertion in clean_tree_block happy */
5438 if (!path->locks[level] &&
5439 btrfs_header_generation(eb) == trans->transid) {
5440 btrfs_tree_lock(eb);
5441 btrfs_set_lock_blocking(eb);
5442 path->locks[level] = 1;
5443 }
5444 clean_tree_block(trans, root, eb);
5445 }
5446
5447 if (eb == root->node) {
5448 if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF)
5449 parent = eb->start;
5450 else
5451 BUG_ON(root->root_key.objectid !=
5452 btrfs_header_owner(eb));
5453 } else {
5454 if (wc->flags[level + 1] & BTRFS_BLOCK_FLAG_FULL_BACKREF)
5455 parent = path->nodes[level + 1]->start;
5456 else
5457 BUG_ON(root->root_key.objectid !=
5458 btrfs_header_owner(path->nodes[level + 1]));
f82d02d9 5459 }
f82d02d9 5460
2c47e605
YZ
5461 ret = btrfs_free_extent(trans, root, eb->start, eb->len, parent,
5462 root->root_key.objectid, level, 0);
f82d02d9 5463 BUG_ON(ret);
2c47e605
YZ
5464out:
5465 wc->refs[level] = 0;
5466 wc->flags[level] = 0;
5467 return ret;
5468}
5469
5470static noinline int walk_down_tree(struct btrfs_trans_handle *trans,
5471 struct btrfs_root *root,
5472 struct btrfs_path *path,
5473 struct walk_control *wc)
5474{
2c47e605 5475 int level = wc->level;
94fcca9f 5476 int lookup_info = 1;
2c47e605
YZ
5477 int ret;
5478
5479 while (level >= 0) {
94fcca9f 5480 ret = walk_down_proc(trans, root, path, wc, lookup_info);
2c47e605
YZ
5481 if (ret > 0)
5482 break;
5483
5484 if (level == 0)
5485 break;
5486
7a7965f8
YZ
5487 if (path->slots[level] >=
5488 btrfs_header_nritems(path->nodes[level]))
5489 break;
5490
94fcca9f 5491 ret = do_walk_down(trans, root, path, wc, &lookup_info);
1c4850e2
YZ
5492 if (ret > 0) {
5493 path->slots[level]++;
5494 continue;
90d2c51d
MX
5495 } else if (ret < 0)
5496 return ret;
1c4850e2 5497 level = wc->level;
f82d02d9 5498 }
f82d02d9
YZ
5499 return 0;
5500}
5501
d397712b 5502static noinline int walk_up_tree(struct btrfs_trans_handle *trans,
98ed5174 5503 struct btrfs_root *root,
f82d02d9 5504 struct btrfs_path *path,
2c47e605 5505 struct walk_control *wc, int max_level)
20524f02 5506{
2c47e605 5507 int level = wc->level;
20524f02 5508 int ret;
9f3a7427 5509
2c47e605
YZ
5510 path->slots[level] = btrfs_header_nritems(path->nodes[level]);
5511 while (level < max_level && path->nodes[level]) {
5512 wc->level = level;
5513 if (path->slots[level] + 1 <
5514 btrfs_header_nritems(path->nodes[level])) {
5515 path->slots[level]++;
20524f02
CM
5516 return 0;
5517 } else {
2c47e605
YZ
5518 ret = walk_up_proc(trans, root, path, wc);
5519 if (ret > 0)
5520 return 0;
bd56b302 5521
2c47e605
YZ
5522 if (path->locks[level]) {
5523 btrfs_tree_unlock(path->nodes[level]);
5524 path->locks[level] = 0;
f82d02d9 5525 }
2c47e605
YZ
5526 free_extent_buffer(path->nodes[level]);
5527 path->nodes[level] = NULL;
5528 level++;
20524f02
CM
5529 }
5530 }
5531 return 1;
5532}
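#if 0
/*
 * Together, walk_down_tree() and walk_up_tree() form an iterative
 * post-order traversal: descend while unvisited slots remain, process
 * and free each node on the way back up. A self-contained sketch of
 * that shape (struct sketch_node is hypothetical; the depth bound of 8
 * mirrors BTRFS_MAX_LEVEL):
 */
struct sketch_node {
	struct sketch_node *child[16];
	int nr_children;
};

static void sketch_drop_tree(struct sketch_node *root)
{
	struct sketch_node *nodes[8];
	int slots[8];
	int level = 0;

	nodes[0] = root;
	slots[0] = 0;
	while (level >= 0) {
		if (slots[level] < nodes[level]->nr_children) {
			/* walk down into the next unvisited child */
			nodes[level + 1] = nodes[level]->child[slots[level]];
			slots[level]++;
			slots[level + 1] = 0;
			level++;
		} else {
			/* walk up: all children done, drop this node */
			/* sketch_free_node(nodes[level]); -- hypothetical */
			level--;
		}
	}
}
#endif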
5533
9aca1d51 5534/*
2c47e605
YZ
5535 * drop a subvolume tree.
5536 *
5537 * this function traverses the tree freeing any blocks that are only
5538 * referenced by the tree.
5539 *
5540 * when a shared tree block is found, this function decreases its
5541 * reference count by one. if update_ref is true, this function
5542 * also makes sure backrefs for the shared block and all lower level
5543 * blocks are properly updated.
9aca1d51 5544 */
2c47e605 5545int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref)
20524f02 5546{
5caf2a00 5547 struct btrfs_path *path;
2c47e605
YZ
5548 struct btrfs_trans_handle *trans;
5549 struct btrfs_root *tree_root = root->fs_info->tree_root;
9f3a7427 5550 struct btrfs_root_item *root_item = &root->root_item;
2c47e605
YZ
5551 struct walk_control *wc;
5552 struct btrfs_key key;
5553 int err = 0;
5554 int ret;
5555 int level;
20524f02 5556
5caf2a00
CM
5557 path = btrfs_alloc_path();
5558 BUG_ON(!path);
20524f02 5559
2c47e605
YZ
5560 wc = kzalloc(sizeof(*wc), GFP_NOFS);
5561 BUG_ON(!wc);
5562
5563 trans = btrfs_start_transaction(tree_root, 1);
5564
9f3a7427 5565 if (btrfs_disk_key_objectid(&root_item->drop_progress) == 0) {
2c47e605 5566 level = btrfs_header_level(root->node);
5d4f98a2
YZ
5567 path->nodes[level] = btrfs_lock_root_node(root);
5568 btrfs_set_lock_blocking(path->nodes[level]);
9f3a7427 5569 path->slots[level] = 0;
5d4f98a2 5570 path->locks[level] = 1;
2c47e605
YZ
5571 memset(&wc->update_progress, 0,
5572 sizeof(wc->update_progress));
9f3a7427 5573 } else {
9f3a7427 5574 btrfs_disk_key_to_cpu(&key, &root_item->drop_progress);
2c47e605
YZ
5575 memcpy(&wc->update_progress, &key,
5576 sizeof(wc->update_progress));
5577
6702ed49 5578 level = root_item->drop_level;
2c47e605 5579 BUG_ON(level == 0);
6702ed49 5580 path->lowest_level = level;
2c47e605
YZ
5581 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
5582 path->lowest_level = 0;
5583 if (ret < 0) {
5584 err = ret;
9f3a7427
CM
5585 goto out;
5586 }
1c4850e2 5587 WARN_ON(ret > 0);
2c47e605 5588
7d9eb12c
CM
5589 /*
5590 * unlock our path; this is safe because only this
5591 * function is allowed to delete this snapshot
5592 */
5d4f98a2 5593 btrfs_unlock_up_safe(path, 0);
2c47e605
YZ
5594
5595 level = btrfs_header_level(root->node);
5596 while (1) {
5597 btrfs_tree_lock(path->nodes[level]);
5598 btrfs_set_lock_blocking(path->nodes[level]);
5599
5600 ret = btrfs_lookup_extent_info(trans, root,
5601 path->nodes[level]->start,
5602 path->nodes[level]->len,
5603 &wc->refs[level],
5604 &wc->flags[level]);
5605 BUG_ON(ret);
5606 BUG_ON(wc->refs[level] == 0);
5607
5608 if (level == root_item->drop_level)
5609 break;
5610
5611 btrfs_tree_unlock(path->nodes[level]);
5612 WARN_ON(wc->refs[level] != 1);
5613 level--;
5614 }
9f3a7427 5615 }
2c47e605
YZ
5616
5617 wc->level = level;
5618 wc->shared_level = -1;
5619 wc->stage = DROP_REFERENCE;
5620 wc->update_ref = update_ref;
5621 wc->keep_locks = 0;
1c4850e2 5622 wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(root);
2c47e605 5623
d397712b 5624 while (1) {
2c47e605
YZ
5625 ret = walk_down_tree(trans, root, path, wc);
5626 if (ret < 0) {
5627 err = ret;
20524f02 5628 break;
2c47e605 5629 }
9aca1d51 5630
2c47e605
YZ
5631 ret = walk_up_tree(trans, root, path, wc, BTRFS_MAX_LEVEL);
5632 if (ret < 0) {
5633 err = ret;
20524f02 5634 break;
2c47e605
YZ
5635 }
5636
5637 if (ret > 0) {
5638 BUG_ON(wc->stage != DROP_REFERENCE);
e7a84565
CM
5639 break;
5640 }
2c47e605
YZ
5641
5642 if (wc->stage == DROP_REFERENCE) {
5643 level = wc->level;
5644 btrfs_node_key(path->nodes[level],
5645 &root_item->drop_progress,
5646 path->slots[level]);
5647 root_item->drop_level = level;
5648 }
5649
5650 BUG_ON(wc->level == 0);
5651 if (trans->transaction->in_commit ||
5652 trans->transaction->delayed_refs.flushing) {
5653 ret = btrfs_update_root(trans, tree_root,
5654 &root->root_key,
5655 root_item);
5656 BUG_ON(ret);
5657
5658 btrfs_end_transaction(trans, tree_root);
5659 trans = btrfs_start_transaction(tree_root, 1);
5660 } else {
5661 unsigned long update;
c3e69d58
CM
5662 update = trans->delayed_ref_updates;
5663 trans->delayed_ref_updates = 0;
5664 if (update)
2c47e605
YZ
5665 btrfs_run_delayed_refs(trans, tree_root,
5666 update);
c3e69d58 5667 }
20524f02 5668 }
2c47e605
YZ
5669 btrfs_release_path(root, path);
5670 BUG_ON(err);
5671
5672 ret = btrfs_del_root(trans, tree_root, &root->root_key);
5673 BUG_ON(ret);
5674
76dda93c
YZ
5675 if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) {
5676 ret = btrfs_find_last_root(tree_root, root->root_key.objectid,
5677 NULL, NULL);
5678 BUG_ON(ret < 0);
5679 if (ret > 0) {
5680 ret = btrfs_del_orphan_item(trans, tree_root,
5681 root->root_key.objectid);
5682 BUG_ON(ret);
5683 }
5684 }
5685
5686 if (root->in_radix) {
5687 btrfs_free_fs_root(tree_root->fs_info, root);
5688 } else {
5689 free_extent_buffer(root->node);
5690 free_extent_buffer(root->commit_root);
5691 kfree(root);
5692 }
9f3a7427 5693out:
2c47e605
YZ
5694 btrfs_end_transaction(trans, tree_root);
5695 kfree(wc);
5caf2a00 5696 btrfs_free_path(path);
2c47e605 5697 return err;
20524f02 5698}
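#if 0
/*
 * btrfs_drop_snapshot() above splits the work across transactions:
 * progress (drop_progress/drop_level) is persisted in the root item
 * before each commit, so a crash or remount resumes the drop rather
 * than restarting it. The generic shape of that pattern (all sketch_*
 * helpers are hypothetical):
 */
static int sketch_long_running_delete(void)
{
	for (;;) {
		if (sketch_do_one_batch())	/* nonzero when finished */
			break;
		sketch_record_progress();	/* ~ drop_progress */
		if (sketch_commit_needed())
			sketch_commit_and_restart_transaction();
	}
	return 0;
}
#endif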
9078a3e1 5699
2c47e605
YZ
5700/*
5701 * drop subtree rooted at tree block 'node'.
5702 *
5703 * NOTE: this function will unlock and release tree block 'node'
5704 */
f82d02d9
YZ
5705int btrfs_drop_subtree(struct btrfs_trans_handle *trans,
5706 struct btrfs_root *root,
5707 struct extent_buffer *node,
5708 struct extent_buffer *parent)
5709{
5710 struct btrfs_path *path;
2c47e605 5711 struct walk_control *wc;
f82d02d9
YZ
5712 int level;
5713 int parent_level;
5714 int ret = 0;
5715 int wret;
5716
2c47e605
YZ
5717 BUG_ON(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID);
5718
f82d02d9
YZ
5719 path = btrfs_alloc_path();
5720 BUG_ON(!path);
5721
2c47e605
YZ
5722 wc = kzalloc(sizeof(*wc), GFP_NOFS);
5723 BUG_ON(!wc);
5724
b9447ef8 5725 btrfs_assert_tree_locked(parent);
f82d02d9
YZ
5726 parent_level = btrfs_header_level(parent);
5727 extent_buffer_get(parent);
5728 path->nodes[parent_level] = parent;
5729 path->slots[parent_level] = btrfs_header_nritems(parent);
5730
b9447ef8 5731 btrfs_assert_tree_locked(node);
f82d02d9 5732 level = btrfs_header_level(node);
f82d02d9
YZ
5733 path->nodes[level] = node;
5734 path->slots[level] = 0;
2c47e605
YZ
5735 path->locks[level] = 1;
5736
5737 wc->refs[parent_level] = 1;
5738 wc->flags[parent_level] = BTRFS_BLOCK_FLAG_FULL_BACKREF;
5739 wc->level = level;
5740 wc->shared_level = -1;
5741 wc->stage = DROP_REFERENCE;
5742 wc->update_ref = 0;
5743 wc->keep_locks = 1;
1c4850e2 5744 wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(root);
f82d02d9
YZ
5745
5746 while (1) {
2c47e605
YZ
5747 wret = walk_down_tree(trans, root, path, wc);
5748 if (wret < 0) {
f82d02d9 5749 ret = wret;
f82d02d9 5750 break;
2c47e605 5751 }
f82d02d9 5752
2c47e605 5753 wret = walk_up_tree(trans, root, path, wc, parent_level);
f82d02d9
YZ
5754 if (wret < 0)
5755 ret = wret;
5756 if (wret != 0)
5757 break;
5758 }
5759
2c47e605 5760 kfree(wc);
f82d02d9
YZ
5761 btrfs_free_path(path);
5762 return ret;
5763}
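#if 0
/*
 * Calling convention sketch for btrfs_drop_subtree(): both buffers
 * must be locked by the caller, and 'node' is consumed -- the walk
 * unlocks and releases it, per the comment above. Hypothetical usage:
 */
static int sketch_drop_one_subtree(struct btrfs_trans_handle *trans,
				   struct btrfs_root *reloc_root,
				   struct extent_buffer *node,
				   struct extent_buffer *parent)
{
	/* caller already holds btrfs_tree_lock() on node and parent */
	return btrfs_drop_subtree(trans, reloc_root, node, parent);
}
#endif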
5764
5d4f98a2 5765#if 0
8e7bf94f
CM
5766static unsigned long calc_ra(unsigned long start, unsigned long last,
5767 unsigned long nr)
5768{
5769 return min(last, start + nr - 1);
5770}
5771
d397712b 5772static noinline int relocate_inode_pages(struct inode *inode, u64 start,
98ed5174 5773 u64 len)
edbd8d4e
CM
5774{
5775 u64 page_start;
5776 u64 page_end;
1a40e23b 5777 unsigned long first_index;
edbd8d4e 5778 unsigned long last_index;
edbd8d4e
CM
5779 unsigned long i;
5780 struct page *page;
d1310b2e 5781 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
4313b399 5782 struct file_ra_state *ra;
3eaa2885 5783 struct btrfs_ordered_extent *ordered;
1a40e23b
ZY
5784 unsigned int total_read = 0;
5785 unsigned int total_dirty = 0;
5786 int ret = 0;
4313b399
CM
5787
5788 ra = kzalloc(sizeof(*ra), GFP_NOFS);
edbd8d4e
CM
5789
5790 mutex_lock(&inode->i_mutex);
1a40e23b 5791 first_index = start >> PAGE_CACHE_SHIFT;
edbd8d4e
CM
5792 last_index = (start + len - 1) >> PAGE_CACHE_SHIFT;
5793
1a40e23b
ZY
5794 /* make sure the dirty trick played by the caller works */
5795 ret = invalidate_inode_pages2_range(inode->i_mapping,
5796 first_index, last_index);
5797 if (ret)
5798 goto out_unlock;
8e7bf94f 5799
4313b399 5800 file_ra_state_init(ra, inode->i_mapping);
edbd8d4e 5801
1a40e23b
ZY
5802 for (i = first_index ; i <= last_index; i++) {
5803 if (total_read % ra->ra_pages == 0) {
8e7bf94f 5804 btrfs_force_ra(inode->i_mapping, ra, NULL, i,
1a40e23b 5805 calc_ra(i, last_index, ra->ra_pages));
8e7bf94f
CM
5806 }
5807 total_read++;
3eaa2885
CM
5808again:
5809 if (((u64)i << PAGE_CACHE_SHIFT) > i_size_read(inode))
1a40e23b 5810 BUG_ON(1);
edbd8d4e 5811 page = grab_cache_page(inode->i_mapping, i);
a061fc8d 5812 if (!page) {
1a40e23b 5813 ret = -ENOMEM;
edbd8d4e 5814 goto out_unlock;
a061fc8d 5815 }
edbd8d4e
CM
5816 if (!PageUptodate(page)) {
5817 btrfs_readpage(NULL, page);
5818 lock_page(page);
5819 if (!PageUptodate(page)) {
5820 unlock_page(page);
5821 page_cache_release(page);
1a40e23b 5822 ret = -EIO;
edbd8d4e
CM
5823 goto out_unlock;
5824 }
5825 }
ec44a35c 5826 wait_on_page_writeback(page);
3eaa2885 5827
edbd8d4e
CM
5828 page_start = (u64)page->index << PAGE_CACHE_SHIFT;
5829 page_end = page_start + PAGE_CACHE_SIZE - 1;
d1310b2e 5830 lock_extent(io_tree, page_start, page_end, GFP_NOFS);
edbd8d4e 5831
3eaa2885
CM
5832 ordered = btrfs_lookup_ordered_extent(inode, page_start);
5833 if (ordered) {
5834 unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
5835 unlock_page(page);
5836 page_cache_release(page);
5837 btrfs_start_ordered_extent(inode, ordered, 1);
5838 btrfs_put_ordered_extent(ordered);
5839 goto again;
5840 }
5841 set_page_extent_mapped(page);
5842
1a40e23b
ZY
5843 if (i == first_index)
5844 set_extent_bits(io_tree, page_start, page_end,
5845 EXTENT_BOUNDARY, GFP_NOFS);
1f80e4db 5846 btrfs_set_extent_delalloc(inode, page_start, page_end);
1a40e23b 5847
a061fc8d 5848 set_page_dirty(page);
1a40e23b 5849 total_dirty++;
edbd8d4e 5850
d1310b2e 5851 unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
edbd8d4e
CM
5852 unlock_page(page);
5853 page_cache_release(page);
5854 }
5855
5856out_unlock:
ec44a35c 5857 kfree(ra);
edbd8d4e 5858 mutex_unlock(&inode->i_mutex);
1a40e23b
ZY
5859 balance_dirty_pages_ratelimited_nr(inode->i_mapping, total_dirty);
5860 return ret;
edbd8d4e
CM
5861}
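#if 0
/*
 * The 'again:' loop above is the usual ordered-extent dance: lock the
 * range, and if an in-flight ordered extent overlaps it, drop all
 * locks, wait for the I/O, and retry from scratch. The shape of the
 * pattern (sketch_* helpers are hypothetical):
 */
static void sketch_lock_range_vs_ordered(void)
{
	for (;;) {
		sketch_lock_range();
		if (!sketch_overlapping_ordered_extent())
			break;		/* the range is ours */
		sketch_unlock_range();
		sketch_wait_for_ordered_io();
		/* retry: the page may have changed while we slept */
	}
}
#endif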
5862
d397712b 5863static noinline int relocate_data_extent(struct inode *reloc_inode,
1a40e23b
ZY
5864 struct btrfs_key *extent_key,
5865 u64 offset)
5866{
5867 struct btrfs_root *root = BTRFS_I(reloc_inode)->root;
5868 struct extent_map_tree *em_tree = &BTRFS_I(reloc_inode)->extent_tree;
5869 struct extent_map *em;
6643558d
YZ
5870 u64 start = extent_key->objectid - offset;
5871 u64 end = start + extent_key->offset - 1;
bf4ef679 5872
1a40e23b
ZY
5873 em = alloc_extent_map(GFP_NOFS);
5874 BUG_ON(!em || IS_ERR(em));
bf4ef679 5875
6643558d 5876 em->start = start;
1a40e23b 5877 em->len = extent_key->offset;
c8b97818 5878 em->block_len = extent_key->offset;
1a40e23b
ZY
5879 em->block_start = extent_key->objectid;
5880 em->bdev = root->fs_info->fs_devices->latest_bdev;
5881 set_bit(EXTENT_FLAG_PINNED, &em->flags);
5882
5883 /* setup extent map to cheat btrfs_readpage */
6643558d 5884 lock_extent(&BTRFS_I(reloc_inode)->io_tree, start, end, GFP_NOFS);
1a40e23b
ZY
5885 while (1) {
5886 int ret;
890871be 5887 write_lock(&em_tree->lock);
1a40e23b 5888 ret = add_extent_mapping(em_tree, em);
890871be 5889 write_unlock(&em_tree->lock);
1a40e23b
ZY
5890 if (ret != -EEXIST) {
5891 free_extent_map(em);
bf4ef679
CM
5892 break;
5893 }
6643558d 5894 btrfs_drop_extent_cache(reloc_inode, start, end, 0);
bf4ef679 5895 }
6643558d 5896 unlock_extent(&BTRFS_I(reloc_inode)->io_tree, start, end, GFP_NOFS);
bf4ef679 5897
6643558d 5898 return relocate_inode_pages(reloc_inode, start, extent_key->offset);
1a40e23b 5899}
edbd8d4e 5900
1a40e23b
ZY
5901struct btrfs_ref_path {
5902 u64 extent_start;
5903 u64 nodes[BTRFS_MAX_LEVEL];
5904 u64 root_objectid;
5905 u64 root_generation;
5906 u64 owner_objectid;
1a40e23b
ZY
5907 u32 num_refs;
5908 int lowest_level;
5909 int current_level;
f82d02d9
YZ
5910 int shared_level;
5911
5912 struct btrfs_key node_keys[BTRFS_MAX_LEVEL];
5913 u64 new_nodes[BTRFS_MAX_LEVEL];
1a40e23b 5914};
7d9eb12c 5915
1a40e23b 5916struct disk_extent {
c8b97818 5917 u64 ram_bytes;
1a40e23b
ZY
5918 u64 disk_bytenr;
5919 u64 disk_num_bytes;
5920 u64 offset;
5921 u64 num_bytes;
c8b97818
CM
5922 u8 compression;
5923 u8 encryption;
5924 u16 other_encoding;
1a40e23b 5925};
4313b399 5926
1a40e23b
ZY
5927static int is_cowonly_root(u64 root_objectid)
5928{
5929 if (root_objectid == BTRFS_ROOT_TREE_OBJECTID ||
5930 root_objectid == BTRFS_EXTENT_TREE_OBJECTID ||
5931 root_objectid == BTRFS_CHUNK_TREE_OBJECTID ||
5932 root_objectid == BTRFS_DEV_TREE_OBJECTID ||
0403e47e
YZ
5933 root_objectid == BTRFS_TREE_LOG_OBJECTID ||
5934 root_objectid == BTRFS_CSUM_TREE_OBJECTID)
1a40e23b
ZY
5935 return 1;
5936 return 0;
5937}
edbd8d4e 5938
d397712b 5939static noinline int __next_ref_path(struct btrfs_trans_handle *trans,
1a40e23b
ZY
5940 struct btrfs_root *extent_root,
5941 struct btrfs_ref_path *ref_path,
5942 int first_time)
5943{
5944 struct extent_buffer *leaf;
5945 struct btrfs_path *path;
5946 struct btrfs_extent_ref *ref;
5947 struct btrfs_key key;
5948 struct btrfs_key found_key;
5949 u64 bytenr;
5950 u32 nritems;
5951 int level;
5952 int ret = 1;
edbd8d4e 5953
1a40e23b
ZY
5954 path = btrfs_alloc_path();
5955 if (!path)
5956 return -ENOMEM;
bf4ef679 5957
1a40e23b
ZY
5958 if (first_time) {
5959 ref_path->lowest_level = -1;
5960 ref_path->current_level = -1;
f82d02d9 5961 ref_path->shared_level = -1;
1a40e23b
ZY
5962 goto walk_up;
5963 }
5964walk_down:
5965 level = ref_path->current_level - 1;
5966 while (level >= -1) {
5967 u64 parent;
5968 if (level < ref_path->lowest_level)
5969 break;
bf4ef679 5970
d397712b 5971 if (level >= 0)
1a40e23b 5972 bytenr = ref_path->nodes[level];
d397712b 5973 else
1a40e23b 5974 bytenr = ref_path->extent_start;
1a40e23b 5975 BUG_ON(bytenr == 0);
bf4ef679 5976
1a40e23b
ZY
5977 parent = ref_path->nodes[level + 1];
5978 ref_path->nodes[level + 1] = 0;
5979 ref_path->current_level = level;
5980 BUG_ON(parent == 0);
0ef3e66b 5981
1a40e23b
ZY
5982 key.objectid = bytenr;
5983 key.offset = parent + 1;
5984 key.type = BTRFS_EXTENT_REF_KEY;
edbd8d4e 5985
1a40e23b
ZY
5986 ret = btrfs_search_slot(trans, extent_root, &key, path, 0, 0);
5987 if (ret < 0)
edbd8d4e 5988 goto out;
1a40e23b 5989 BUG_ON(ret == 0);
7d9eb12c 5990
1a40e23b
ZY
5991 leaf = path->nodes[0];
5992 nritems = btrfs_header_nritems(leaf);
5993 if (path->slots[0] >= nritems) {
5994 ret = btrfs_next_leaf(extent_root, path);
5995 if (ret < 0)
5996 goto out;
5997 if (ret > 0)
5998 goto next;
5999 leaf = path->nodes[0];
6000 }
0ef3e66b 6001
1a40e23b
ZY
6002 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
6003 if (found_key.objectid == bytenr &&
f82d02d9
YZ
6004 found_key.type == BTRFS_EXTENT_REF_KEY) {
6005 if (level < ref_path->shared_level)
6006 ref_path->shared_level = level;
1a40e23b 6007 goto found;
f82d02d9 6008 }
1a40e23b
ZY
6009next:
6010 level--;
6011 btrfs_release_path(extent_root, path);
d899e052 6012 cond_resched();
1a40e23b
ZY
6013 }
6014 /* reached lowest level */
6015 ret = 1;
6016 goto out;
6017walk_up:
6018 level = ref_path->current_level;
6019 while (level < BTRFS_MAX_LEVEL - 1) {
6020 u64 ref_objectid;
d397712b
CM
6021
6022 if (level >= 0)
1a40e23b 6023 bytenr = ref_path->nodes[level];
d397712b 6024 else
1a40e23b 6025 bytenr = ref_path->extent_start;
d397712b 6026
1a40e23b 6027 BUG_ON(bytenr == 0);
edbd8d4e 6028
1a40e23b
ZY
6029 key.objectid = bytenr;
6030 key.offset = 0;
6031 key.type = BTRFS_EXTENT_REF_KEY;
edbd8d4e 6032
1a40e23b
ZY
6033 ret = btrfs_search_slot(trans, extent_root, &key, path, 0, 0);
6034 if (ret < 0)
6035 goto out;
edbd8d4e 6036
1a40e23b
ZY
6037 leaf = path->nodes[0];
6038 nritems = btrfs_header_nritems(leaf);
6039 if (path->slots[0] >= nritems) {
6040 ret = btrfs_next_leaf(extent_root, path);
6041 if (ret < 0)
6042 goto out;
6043 if (ret > 0) {
6044 /* the extent was freed by someone */
6045 if (ref_path->lowest_level == level)
6046 goto out;
6047 btrfs_release_path(extent_root, path);
6048 goto walk_down;
6049 }
6050 leaf = path->nodes[0];
6051 }
edbd8d4e 6052
1a40e23b
ZY
6053 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
6054 if (found_key.objectid != bytenr ||
6055 found_key.type != BTRFS_EXTENT_REF_KEY) {
6056 /* the extent was freed by someone */
6057 if (ref_path->lowest_level == level) {
6058 ret = 1;
6059 goto out;
6060 }
6061 btrfs_release_path(extent_root, path);
6062 goto walk_down;
6063 }
6064found:
6065 ref = btrfs_item_ptr(leaf, path->slots[0],
6066 struct btrfs_extent_ref);
6067 ref_objectid = btrfs_ref_objectid(leaf, ref);
6068 if (ref_objectid < BTRFS_FIRST_FREE_OBJECTID) {
6069 if (first_time) {
6070 level = (int)ref_objectid;
6071 BUG_ON(level >= BTRFS_MAX_LEVEL);
6072 ref_path->lowest_level = level;
6073 ref_path->current_level = level;
6074 ref_path->nodes[level] = bytenr;
6075 } else {
6076 WARN_ON(ref_objectid != level);
6077 }
6078 } else {
6079 WARN_ON(level != -1);
6080 }
6081 first_time = 0;
bf4ef679 6082
1a40e23b
ZY
6083 if (ref_path->lowest_level == level) {
6084 ref_path->owner_objectid = ref_objectid;
1a40e23b
ZY
6085 ref_path->num_refs = btrfs_ref_num_refs(leaf, ref);
6086 }
bf4ef679 6087
7d9eb12c 6088 /*
1a40e23b
ZY
6089 * the block is a tree root or the block isn't in a reference
6090 * counted tree.
7d9eb12c 6091 */
1a40e23b
ZY
6092 if (found_key.objectid == found_key.offset ||
6093 is_cowonly_root(btrfs_ref_root(leaf, ref))) {
6094 ref_path->root_objectid = btrfs_ref_root(leaf, ref);
6095 ref_path->root_generation =
6096 btrfs_ref_generation(leaf, ref);
6097 if (level < 0) {
6098 /* special reference from the tree log */
6099 ref_path->nodes[0] = found_key.offset;
6100 ref_path->current_level = 0;
6101 }
6102 ret = 0;
6103 goto out;
6104 }
7d9eb12c 6105
1a40e23b
ZY
6106 level++;
6107 BUG_ON(ref_path->nodes[level] != 0);
6108 ref_path->nodes[level] = found_key.offset;
6109 ref_path->current_level = level;
bf4ef679 6110
1a40e23b
ZY
6111 /*
6112 * the reference was created in the running transaction,
6113 * no need to continue walking up.
6114 */
6115 if (btrfs_ref_generation(leaf, ref) == trans->transid) {
6116 ref_path->root_objectid = btrfs_ref_root(leaf, ref);
6117 ref_path->root_generation =
6118 btrfs_ref_generation(leaf, ref);
6119 ret = 0;
6120 goto out;
7d9eb12c
CM
6121 }
6122
1a40e23b 6123 btrfs_release_path(extent_root, path);
d899e052 6124 cond_resched();
7d9eb12c 6125 }
1a40e23b
ZY
6126 /* reached max tree level, but no tree root found. */
6127 BUG();
edbd8d4e 6128out:
1a40e23b
ZY
6129 btrfs_free_path(path);
6130 return ret;
edbd8d4e
CM
6131}
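#if 0
/*
 * btrfs_ref_path is a cursor over one reference chain: nodes[level]
 * records the tree block at each level of the current path from the
 * extent up to a tree root. walk_up extends the path one level at a
 * time; walk_down backtracks to try the next reference. A small debug
 * sketch over that layout:
 */
static void sketch_dump_ref_path(const struct btrfs_ref_path *rp)
{
	int level;

	for (level = 0; level < BTRFS_MAX_LEVEL; level++)
		if (rp->nodes[level])
			printk(KERN_INFO "level %d: block %llu\n", level,
			       (unsigned long long)rp->nodes[level]);
}
#endif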
6132
1a40e23b
ZY
6133static int btrfs_first_ref_path(struct btrfs_trans_handle *trans,
6134 struct btrfs_root *extent_root,
6135 struct btrfs_ref_path *ref_path,
6136 u64 extent_start)
a061fc8d 6137{
1a40e23b
ZY
6138 memset(ref_path, 0, sizeof(*ref_path));
6139 ref_path->extent_start = extent_start;
a061fc8d 6140
1a40e23b 6141 return __next_ref_path(trans, extent_root, ref_path, 1);
a061fc8d
CM
6142}
6143
1a40e23b
ZY
6144static int btrfs_next_ref_path(struct btrfs_trans_handle *trans,
6145 struct btrfs_root *extent_root,
6146 struct btrfs_ref_path *ref_path)
edbd8d4e 6147{
1a40e23b
ZY
6148 return __next_ref_path(trans, extent_root, ref_path, 0);
6149}
6150
d397712b 6151static noinline int get_new_locations(struct inode *reloc_inode,
1a40e23b
ZY
6152 struct btrfs_key *extent_key,
6153 u64 offset, int no_fragment,
6154 struct disk_extent **extents,
6155 int *nr_extents)
6156{
6157 struct btrfs_root *root = BTRFS_I(reloc_inode)->root;
6158 struct btrfs_path *path;
6159 struct btrfs_file_extent_item *fi;
edbd8d4e 6160 struct extent_buffer *leaf;
1a40e23b
ZY
6161 struct disk_extent *exts = *extents;
6162 struct btrfs_key found_key;
6163 u64 cur_pos;
6164 u64 last_byte;
edbd8d4e 6165 u32 nritems;
1a40e23b
ZY
6166 int nr = 0;
6167 int max = *nr_extents;
6168 int ret;
edbd8d4e 6169
1a40e23b
ZY
6170 WARN_ON(!no_fragment && *extents);
6171 if (!exts) {
6172 max = 1;
6173 exts = kmalloc(sizeof(*exts) * max, GFP_NOFS);
6174 if (!exts)
6175 return -ENOMEM;
a061fc8d 6176 }
edbd8d4e 6177
1a40e23b
ZY
6178 path = btrfs_alloc_path();
6179 BUG_ON(!path);
edbd8d4e 6180
1a40e23b
ZY
6181 cur_pos = extent_key->objectid - offset;
6182 last_byte = extent_key->objectid + extent_key->offset;
6183 ret = btrfs_lookup_file_extent(NULL, root, path, reloc_inode->i_ino,
6184 cur_pos, 0);
6185 if (ret < 0)
6186 goto out;
6187 if (ret > 0) {
6188 ret = -ENOENT;
6189 goto out;
6190 }
edbd8d4e 6191
1a40e23b 6192 while (1) {
edbd8d4e
CM
6193 leaf = path->nodes[0];
6194 nritems = btrfs_header_nritems(leaf);
1a40e23b
ZY
6195 if (path->slots[0] >= nritems) {
6196 ret = btrfs_next_leaf(root, path);
a061fc8d
CM
6197 if (ret < 0)
6198 goto out;
1a40e23b
ZY
6199 if (ret > 0)
6200 break;
bf4ef679 6201 leaf = path->nodes[0];
a061fc8d 6202 }
edbd8d4e
CM
6203
6204 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
1a40e23b
ZY
6205 if (found_key.offset != cur_pos ||
6206 found_key.type != BTRFS_EXTENT_DATA_KEY ||
6207 found_key.objectid != reloc_inode->i_ino)
edbd8d4e
CM
6208 break;
6209
1a40e23b
ZY
6210 fi = btrfs_item_ptr(leaf, path->slots[0],
6211 struct btrfs_file_extent_item);
6212 if (btrfs_file_extent_type(leaf, fi) !=
6213 BTRFS_FILE_EXTENT_REG ||
6214 btrfs_file_extent_disk_bytenr(leaf, fi) == 0)
edbd8d4e 6215 break;
1a40e23b
ZY
6216
6217 if (nr == max) {
6218 struct disk_extent *old = exts;
6219 max *= 2;
6220 exts = kzalloc(sizeof(*exts) * max, GFP_NOFS);
6221 memcpy(exts, old, sizeof(*exts) * nr);
6222 if (old != *extents)
6223 kfree(old);
a061fc8d 6224 }
edbd8d4e 6225
1a40e23b
ZY
6226 exts[nr].disk_bytenr =
6227 btrfs_file_extent_disk_bytenr(leaf, fi);
6228 exts[nr].disk_num_bytes =
6229 btrfs_file_extent_disk_num_bytes(leaf, fi);
6230 exts[nr].offset = btrfs_file_extent_offset(leaf, fi);
6231 exts[nr].num_bytes = btrfs_file_extent_num_bytes(leaf, fi);
c8b97818
CM
6232 exts[nr].ram_bytes = btrfs_file_extent_ram_bytes(leaf, fi);
6233 exts[nr].compression = btrfs_file_extent_compression(leaf, fi);
6234 exts[nr].encryption = btrfs_file_extent_encryption(leaf, fi);
6235 exts[nr].other_encoding = btrfs_file_extent_other_encoding(leaf,
6236 fi);
d899e052
YZ
6237 BUG_ON(exts[nr].offset > 0);
6238 BUG_ON(exts[nr].compression || exts[nr].encryption);
6239 BUG_ON(exts[nr].num_bytes != exts[nr].disk_num_bytes);
edbd8d4e 6240
1a40e23b
ZY
6241 cur_pos += exts[nr].num_bytes;
6242 nr++;
6243
6244 if (cur_pos + offset >= last_byte)
6245 break;
6246
6247 if (no_fragment) {
6248 ret = 1;
edbd8d4e 6249 goto out;
1a40e23b
ZY
6250 }
6251 path->slots[0]++;
6252 }
6253
1f80e4db 6254 BUG_ON(cur_pos + offset > last_byte);
1a40e23b
ZY
6255 if (cur_pos + offset < last_byte) {
6256 ret = -ENOENT;
6257 goto out;
edbd8d4e
CM
6258 }
6259 ret = 0;
6260out:
1a40e23b
ZY
6261 btrfs_free_path(path);
6262 if (ret) {
6263 if (exts != *extents)
6264 kfree(exts);
6265 } else {
6266 *extents = exts;
6267 *nr_extents = nr;
6268 }
6269 return ret;
6270}
6271
d397712b 6272static noinline int replace_one_extent(struct btrfs_trans_handle *trans,
1a40e23b
ZY
6273 struct btrfs_root *root,
6274 struct btrfs_path *path,
6275 struct btrfs_key *extent_key,
6276 struct btrfs_key *leaf_key,
6277 struct btrfs_ref_path *ref_path,
6278 struct disk_extent *new_extents,
6279 int nr_extents)
6280{
6281 struct extent_buffer *leaf;
6282 struct btrfs_file_extent_item *fi;
6283 struct inode *inode = NULL;
6284 struct btrfs_key key;
6285 u64 lock_start = 0;
6286 u64 lock_end = 0;
6287 u64 num_bytes;
6288 u64 ext_offset;
86288a19 6289 u64 search_end = (u64)-1;
1a40e23b 6290 u32 nritems;
3bb1a1bc 6291 int nr_scaned = 0;
1a40e23b 6292 int extent_locked = 0;
d899e052 6293 int extent_type;
1a40e23b
ZY
6294 int ret;
6295
3bb1a1bc 6296 memcpy(&key, leaf_key, sizeof(key));
1a40e23b 6297 if (ref_path->owner_objectid != BTRFS_MULTIPLE_OBJECTIDS) {
3bb1a1bc
YZ
6298 if (key.objectid < ref_path->owner_objectid ||
6299 (key.objectid == ref_path->owner_objectid &&
6300 key.type < BTRFS_EXTENT_DATA_KEY)) {
6301 key.objectid = ref_path->owner_objectid;
6302 key.type = BTRFS_EXTENT_DATA_KEY;
6303 key.offset = 0;
6304 }
1a40e23b
ZY
6305 }
6306
6307 while (1) {
6308 ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
6309 if (ret < 0)
6310 goto out;
6311
6312 leaf = path->nodes[0];
6313 nritems = btrfs_header_nritems(leaf);
6314next:
6315 if (extent_locked && ret > 0) {
6316 /*
6317 * the file extent item was modified by someone
6318 * before the extent got locked.
6319 */
1a40e23b
ZY
6320 unlock_extent(&BTRFS_I(inode)->io_tree, lock_start,
6321 lock_end, GFP_NOFS);
6322 extent_locked = 0;
6323 }
6324
6325 if (path->slots[0] >= nritems) {
3bb1a1bc 6326 if (++nr_scaned > 2)
1a40e23b
ZY
6327 break;
6328
6329 BUG_ON(extent_locked);
6330 ret = btrfs_next_leaf(root, path);
6331 if (ret < 0)
6332 goto out;
6333 if (ret > 0)
6334 break;
6335 leaf = path->nodes[0];
6336 nritems = btrfs_header_nritems(leaf);
6337 }
6338
6339 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
6340
6341 if (ref_path->owner_objectid != BTRFS_MULTIPLE_OBJECTIDS) {
6342 if ((key.objectid > ref_path->owner_objectid) ||
6343 (key.objectid == ref_path->owner_objectid &&
6344 key.type > BTRFS_EXTENT_DATA_KEY) ||
86288a19 6345 key.offset >= search_end)
1a40e23b
ZY
6346 break;
6347 }
6348
6349 if (inode && key.objectid != inode->i_ino) {
6350 BUG_ON(extent_locked);
6351 btrfs_release_path(root, path);
6352 mutex_unlock(&inode->i_mutex);
6353 iput(inode);
6354 inode = NULL;
6355 continue;
6356 }
6357
6358 if (key.type != BTRFS_EXTENT_DATA_KEY) {
6359 path->slots[0]++;
6360 ret = 1;
6361 goto next;
6362 }
6363 fi = btrfs_item_ptr(leaf, path->slots[0],
6364 struct btrfs_file_extent_item);
d899e052
YZ
6365 extent_type = btrfs_file_extent_type(leaf, fi);
6366 if ((extent_type != BTRFS_FILE_EXTENT_REG &&
6367 extent_type != BTRFS_FILE_EXTENT_PREALLOC) ||
1a40e23b
ZY
6368 (btrfs_file_extent_disk_bytenr(leaf, fi) !=
6369 extent_key->objectid)) {
6370 path->slots[0]++;
6371 ret = 1;
6372 goto next;
6373 }
6374
6375 num_bytes = btrfs_file_extent_num_bytes(leaf, fi);
6376 ext_offset = btrfs_file_extent_offset(leaf, fi);
6377
86288a19
YZ
6378 if (search_end == (u64)-1) {
6379 search_end = key.offset - ext_offset +
6380 btrfs_file_extent_ram_bytes(leaf, fi);
6381 }
1a40e23b
ZY
6382
6383 if (!extent_locked) {
6384 lock_start = key.offset;
6385 lock_end = lock_start + num_bytes - 1;
6386 } else {
6643558d
YZ
6387 if (lock_start > key.offset ||
6388 lock_end + 1 < key.offset + num_bytes) {
6389 unlock_extent(&BTRFS_I(inode)->io_tree,
6390 lock_start, lock_end, GFP_NOFS);
6391 extent_locked = 0;
6392 }
1a40e23b
ZY
6393 }
6394
6395 if (!inode) {
6396 btrfs_release_path(root, path);
6397
6398 inode = btrfs_iget_locked(root->fs_info->sb,
6399 key.objectid, root);
6400 if (inode->i_state & I_NEW) {
6401 BTRFS_I(inode)->root = root;
6402 BTRFS_I(inode)->location.objectid =
6403 key.objectid;
6404 BTRFS_I(inode)->location.type =
6405 BTRFS_INODE_ITEM_KEY;
6406 BTRFS_I(inode)->location.offset = 0;
6407 btrfs_read_locked_inode(inode);
6408 unlock_new_inode(inode);
6409 }
6410 /*
6411 * some code calls btrfs_commit_transaction while
6412 * holding the i_mutex, so we can't use mutex_lock
6413 * here.
6414 */
6415 if (is_bad_inode(inode) ||
6416 !mutex_trylock(&inode->i_mutex)) {
6417 iput(inode);
6418 inode = NULL;
6419 key.offset = (u64)-1;
6420 goto skip;
6421 }
6422 }
6423
6424 if (!extent_locked) {
6425 struct btrfs_ordered_extent *ordered;
6426
6427 btrfs_release_path(root, path);
6428
6429 lock_extent(&BTRFS_I(inode)->io_tree, lock_start,
6430 lock_end, GFP_NOFS);
6431 ordered = btrfs_lookup_first_ordered_extent(inode,
6432 lock_end);
6433 if (ordered &&
6434 ordered->file_offset <= lock_end &&
6435 ordered->file_offset + ordered->len > lock_start) {
6436 unlock_extent(&BTRFS_I(inode)->io_tree,
6437 lock_start, lock_end, GFP_NOFS);
6438 btrfs_start_ordered_extent(inode, ordered, 1);
6439 btrfs_put_ordered_extent(ordered);
6440 key.offset += num_bytes;
6441 goto skip;
6442 }
6443 if (ordered)
6444 btrfs_put_ordered_extent(ordered);
6445
1a40e23b
ZY
6446 extent_locked = 1;
6447 continue;
6448 }
6449
6450 if (nr_extents == 1) {
6451 /* update extent pointer in place */
1a40e23b
ZY
6452 btrfs_set_file_extent_disk_bytenr(leaf, fi,
6453 new_extents[0].disk_bytenr);
6454 btrfs_set_file_extent_disk_num_bytes(leaf, fi,
6455 new_extents[0].disk_num_bytes);
1a40e23b
ZY
6456 btrfs_mark_buffer_dirty(leaf);
6457
6458 btrfs_drop_extent_cache(inode, key.offset,
6459 key.offset + num_bytes - 1, 0);
6460
6461 ret = btrfs_inc_extent_ref(trans, root,
6462 new_extents[0].disk_bytenr,
6463 new_extents[0].disk_num_bytes,
6464 leaf->start,
6465 root->root_key.objectid,
6466 trans->transid,
3bb1a1bc 6467 key.objectid);
1a40e23b
ZY
6468 BUG_ON(ret);
6469
6470 ret = btrfs_free_extent(trans, root,
6471 extent_key->objectid,
6472 extent_key->offset,
6473 leaf->start,
6474 btrfs_header_owner(leaf),
6475 btrfs_header_generation(leaf),
3bb1a1bc 6476 key.objectid, 0);
1a40e23b
ZY
6477 BUG_ON(ret);
6478
6479 btrfs_release_path(root, path);
6480 key.offset += num_bytes;
6481 } else {
d899e052
YZ
6482 BUG_ON(1);
6483#if 0
1a40e23b
ZY
6484 u64 alloc_hint;
6485 u64 extent_len;
6486 int i;
6487 /*
6488 * drop the old extent pointer first, then insert the
6489 * new pointers one by one
6490 */
6491 btrfs_release_path(root, path);
6492 ret = btrfs_drop_extents(trans, root, inode, key.offset,
6493 key.offset + num_bytes,
6494 key.offset, &alloc_hint);
6495 BUG_ON(ret);
6496
6497 for (i = 0; i < nr_extents; i++) {
6498 if (ext_offset >= new_extents[i].num_bytes) {
6499 ext_offset -= new_extents[i].num_bytes;
6500 continue;
6501 }
6502 extent_len = min(new_extents[i].num_bytes -
6503 ext_offset, num_bytes);
6504
6505 ret = btrfs_insert_empty_item(trans, root,
6506 path, &key,
6507 sizeof(*fi));
6508 BUG_ON(ret);
6509
6510 leaf = path->nodes[0];
6511 fi = btrfs_item_ptr(leaf, path->slots[0],
6512 struct btrfs_file_extent_item);
6513 btrfs_set_file_extent_generation(leaf, fi,
6514 trans->transid);
6515 btrfs_set_file_extent_type(leaf, fi,
6516 BTRFS_FILE_EXTENT_REG);
6517 btrfs_set_file_extent_disk_bytenr(leaf, fi,
6518 new_extents[i].disk_bytenr);
6519 btrfs_set_file_extent_disk_num_bytes(leaf, fi,
6520 new_extents[i].disk_num_bytes);
c8b97818
CM
6521 btrfs_set_file_extent_ram_bytes(leaf, fi,
6522 new_extents[i].ram_bytes);
6523
6524 btrfs_set_file_extent_compression(leaf, fi,
6525 new_extents[i].compression);
6526 btrfs_set_file_extent_encryption(leaf, fi,
6527 new_extents[i].encryption);
6528 btrfs_set_file_extent_other_encoding(leaf, fi,
6529 new_extents[i].other_encoding);
6530
1a40e23b
ZY
6531 btrfs_set_file_extent_num_bytes(leaf, fi,
6532 extent_len);
6533 ext_offset += new_extents[i].offset;
6534 btrfs_set_file_extent_offset(leaf, fi,
6535 ext_offset);
6536 btrfs_mark_buffer_dirty(leaf);
6537
6538 btrfs_drop_extent_cache(inode, key.offset,
6539 key.offset + extent_len - 1, 0);
6540
6541 ret = btrfs_inc_extent_ref(trans, root,
6542 new_extents[i].disk_bytenr,
6543 new_extents[i].disk_num_bytes,
6544 leaf->start,
6545 root->root_key.objectid,
3bb1a1bc 6546 trans->transid, key.objectid);
1a40e23b
ZY
6547 BUG_ON(ret);
6548 btrfs_release_path(root, path);
6549
a76a3cd4 6550 inode_add_bytes(inode, extent_len);
1a40e23b
ZY
6551
6552 ext_offset = 0;
6553 num_bytes -= extent_len;
6554 key.offset += extent_len;
6555
6556 if (num_bytes == 0)
6557 break;
6558 }
6559 BUG_ON(i >= nr_extents);
d899e052 6560#endif
1a40e23b
ZY
6561 }
6562
6563 if (extent_locked) {
1a40e23b
ZY
6564 unlock_extent(&BTRFS_I(inode)->io_tree, lock_start,
6565 lock_end, GFP_NOFS);
6566 extent_locked = 0;
6567 }
6568skip:
6569 if (ref_path->owner_objectid != BTRFS_MULTIPLE_OBJECTIDS &&
86288a19 6570 key.offset >= search_end)
1a40e23b
ZY
6571 break;
6572
6573 cond_resched();
6574 }
6575 ret = 0;
6576out:
6577 btrfs_release_path(root, path);
6578 if (inode) {
6579 mutex_unlock(&inode->i_mutex);
6580 if (extent_locked) {
1a40e23b
ZY
6581 unlock_extent(&BTRFS_I(inode)->io_tree, lock_start,
6582 lock_end, GFP_NOFS);
6583 }
6584 iput(inode);
6585 }
6586 return ret;
6587}
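#if 0
/*
 * Note the ordering in the in-place update above: the reference on the
 * new extent is taken *before* the reference on the old extent is
 * dropped, so the data is never left without an owner. The shape of
 * that hand-over (sketch_* helpers are hypothetical):
 */
static void sketch_swap_extent_ref(void)
{
	sketch_point_file_extent_at_new_location();
	sketch_inc_ref_on_new_extent();		/* take the new ref first */
	sketch_drop_ref_on_old_extent();	/* only then drop the old */
}
#endif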
6588
1a40e23b
ZY
6589int btrfs_reloc_tree_cache_ref(struct btrfs_trans_handle *trans,
6590 struct btrfs_root *root,
6591 struct extent_buffer *buf, u64 orig_start)
6592{
6593 int level;
6594 int ret;
6595
6596 BUG_ON(btrfs_header_generation(buf) != trans->transid);
6597 BUG_ON(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID);
6598
6599 level = btrfs_header_level(buf);
6600 if (level == 0) {
6601 struct btrfs_leaf_ref *ref;
6602 struct btrfs_leaf_ref *orig_ref;
6603
6604 orig_ref = btrfs_lookup_leaf_ref(root, orig_start);
6605 if (!orig_ref)
6606 return -ENOENT;
6607
6608 ref = btrfs_alloc_leaf_ref(root, orig_ref->nritems);
6609 if (!ref) {
6610 btrfs_free_leaf_ref(root, orig_ref);
6611 return -ENOMEM;
6612 }
6613
6614 ref->nritems = orig_ref->nritems;
6615 memcpy(ref->extents, orig_ref->extents,
6616 sizeof(ref->extents[0]) * ref->nritems);
6617
6618 btrfs_free_leaf_ref(root, orig_ref);
6619
6620 ref->root_gen = trans->transid;
6621 ref->bytenr = buf->start;
6622 ref->owner = btrfs_header_owner(buf);
6623 ref->generation = btrfs_header_generation(buf);
bd56b302 6624
1a40e23b
ZY
6625 ret = btrfs_add_leaf_ref(root, ref, 0);
6626 WARN_ON(ret);
6627 btrfs_free_leaf_ref(root, ref);
6628 }
6629 return 0;
6630}
6631
d397712b 6632static noinline int invalidate_extent_cache(struct btrfs_root *root,
1a40e23b
ZY
6633 struct extent_buffer *leaf,
6634 struct btrfs_block_group_cache *group,
6635 struct btrfs_root *target_root)
6636{
6637 struct btrfs_key key;
6638 struct inode *inode = NULL;
6639 struct btrfs_file_extent_item *fi;
2ac55d41 6640 struct extent_state *cached_state = NULL;
1a40e23b
ZY
6641 u64 num_bytes;
6642 u64 skip_objectid = 0;
6643 u32 nritems;
6644 u32 i;
6645
6646 nritems = btrfs_header_nritems(leaf);
6647 for (i = 0; i < nritems; i++) {
6648 btrfs_item_key_to_cpu(leaf, &key, i);
6649 if (key.objectid == skip_objectid ||
6650 key.type != BTRFS_EXTENT_DATA_KEY)
6651 continue;
6652 fi = btrfs_item_ptr(leaf, i, struct btrfs_file_extent_item);
6653 if (btrfs_file_extent_type(leaf, fi) ==
6654 BTRFS_FILE_EXTENT_INLINE)
6655 continue;
6656 if (btrfs_file_extent_disk_bytenr(leaf, fi) == 0)
6657 continue;
6658 if (!inode || inode->i_ino != key.objectid) {
6659 iput(inode);
6660 inode = btrfs_ilookup(target_root->fs_info->sb,
6661 key.objectid, target_root, 1);
6662 }
6663 if (!inode) {
6664 skip_objectid = key.objectid;
6665 continue;
6666 }
6667 num_bytes = btrfs_file_extent_num_bytes(leaf, fi);
6668
2ac55d41
JB
6669 lock_extent_bits(&BTRFS_I(inode)->io_tree, key.offset,
6670 key.offset + num_bytes - 1, 0, &cached_state,
6671 GFP_NOFS);
1a40e23b
ZY
6672 btrfs_drop_extent_cache(inode, key.offset,
6673 key.offset + num_bytes - 1, 1);
2ac55d41
JB
6674 unlock_extent_cached(&BTRFS_I(inode)->io_tree, key.offset,
6675 key.offset + num_bytes - 1, &cached_state,
6676 GFP_NOFS);
1a40e23b
ZY
6677 cond_resched();
6678 }
6679 iput(inode);
6680 return 0;
6681}
6682
d397712b 6683static noinline int replace_extents_in_leaf(struct btrfs_trans_handle *trans,
1a40e23b
ZY
6684 struct btrfs_root *root,
6685 struct extent_buffer *leaf,
6686 struct btrfs_block_group_cache *group,
6687 struct inode *reloc_inode)
6688{
6689 struct btrfs_key key;
6690 struct btrfs_key extent_key;
6691 struct btrfs_file_extent_item *fi;
6692 struct btrfs_leaf_ref *ref;
6693 struct disk_extent *new_extent;
6694 u64 bytenr;
6695 u64 num_bytes;
6696 u32 nritems;
6697 u32 i;
6698 int ext_index;
6699 int nr_extent;
6700 int ret;
6701
6702 new_extent = kmalloc(sizeof(*new_extent), GFP_NOFS);
6703 BUG_ON(!new_extent);
6704
6705 ref = btrfs_lookup_leaf_ref(root, leaf->start);
6706 BUG_ON(!ref);
6707
6708 ext_index = -1;
6709 nritems = btrfs_header_nritems(leaf);
6710 for (i = 0; i < nritems; i++) {
6711 btrfs_item_key_to_cpu(leaf, &key, i);
6712 if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY)
6713 continue;
6714 fi = btrfs_item_ptr(leaf, i, struct btrfs_file_extent_item);
6715 if (btrfs_file_extent_type(leaf, fi) ==
6716 BTRFS_FILE_EXTENT_INLINE)
6717 continue;
6718 bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
6719 num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi);
6720 if (bytenr == 0)
6721 continue;
6722
6723 ext_index++;
6724 if (bytenr >= group->key.objectid + group->key.offset ||
6725 bytenr + num_bytes <= group->key.objectid)
6726 continue;
6727
6728 extent_key.objectid = bytenr;
6729 extent_key.offset = num_bytes;
6730 extent_key.type = BTRFS_EXTENT_ITEM_KEY;
6731 nr_extent = 1;
6732 ret = get_new_locations(reloc_inode, &extent_key,
6733 group->key.objectid, 1,
6734 &new_extent, &nr_extent);
6735 if (ret > 0)
6736 continue;
6737 BUG_ON(ret < 0);
6738
6739 BUG_ON(ref->extents[ext_index].bytenr != bytenr);
6740 BUG_ON(ref->extents[ext_index].num_bytes != num_bytes);
6741 ref->extents[ext_index].bytenr = new_extent->disk_bytenr;
6742 ref->extents[ext_index].num_bytes = new_extent->disk_num_bytes;
6743
1a40e23b
ZY
6744 btrfs_set_file_extent_disk_bytenr(leaf, fi,
6745 new_extent->disk_bytenr);
6746 btrfs_set_file_extent_disk_num_bytes(leaf, fi,
6747 new_extent->disk_num_bytes);
1a40e23b
ZY
6748 btrfs_mark_buffer_dirty(leaf);
6749
6750 ret = btrfs_inc_extent_ref(trans, root,
6751 new_extent->disk_bytenr,
6752 new_extent->disk_num_bytes,
6753 leaf->start,
6754 root->root_key.objectid,
3bb1a1bc 6755 trans->transid, key.objectid);
1a40e23b 6756 BUG_ON(ret);
56bec294 6757
1a40e23b
ZY
6758 ret = btrfs_free_extent(trans, root,
6759 bytenr, num_bytes, leaf->start,
6760 btrfs_header_owner(leaf),
6761 btrfs_header_generation(leaf),
3bb1a1bc 6762 key.objectid, 0);
1a40e23b
ZY
6763 BUG_ON(ret);
6764 cond_resched();
6765 }
6766 kfree(new_extent);
6767 BUG_ON(ext_index + 1 != ref->nritems);
6768 btrfs_free_leaf_ref(root, ref);
6769 return 0;
6770}
6771
f82d02d9
YZ
6772int btrfs_free_reloc_root(struct btrfs_trans_handle *trans,
6773 struct btrfs_root *root)
1a40e23b
ZY
6774{
6775 struct btrfs_root *reloc_root;
f82d02d9 6776 int ret;
1a40e23b
ZY
6777
6778 if (root->reloc_root) {
6779 reloc_root = root->reloc_root;
6780 root->reloc_root = NULL;
6781 list_add(&reloc_root->dead_list,
6782 &root->fs_info->dead_reloc_roots);
f82d02d9
YZ
6783
6784 btrfs_set_root_bytenr(&reloc_root->root_item,
6785 reloc_root->node->start);
6786 btrfs_set_root_level(&root->root_item,
6787 btrfs_header_level(reloc_root->node));
6788 memset(&reloc_root->root_item.drop_progress, 0,
6789 sizeof(struct btrfs_disk_key));
6790 reloc_root->root_item.drop_level = 0;
6791
6792 ret = btrfs_update_root(trans, root->fs_info->tree_root,
6793 &reloc_root->root_key,
6794 &reloc_root->root_item);
6795 BUG_ON(ret);
1a40e23b
ZY
6796 }
6797 return 0;
6798}
6799
6800int btrfs_drop_dead_reloc_roots(struct btrfs_root *root)
6801{
6802 struct btrfs_trans_handle *trans;
6803 struct btrfs_root *reloc_root;
6804 struct btrfs_root *prev_root = NULL;
6805 struct list_head dead_roots;
6806 int ret;
6807 unsigned long nr;
6808
6809 INIT_LIST_HEAD(&dead_roots);
6810 list_splice_init(&root->fs_info->dead_reloc_roots, &dead_roots);
6811
6812 while (!list_empty(&dead_roots)) {
6813 reloc_root = list_entry(dead_roots.prev,
6814 struct btrfs_root, dead_list);
6815 list_del_init(&reloc_root->dead_list);
6816
6817 BUG_ON(reloc_root->commit_root != NULL);
6818 while (1) {
6819 trans = btrfs_join_transaction(root, 1);
6820 BUG_ON(!trans);
6821
6822 mutex_lock(&root->fs_info->drop_mutex);
6823 ret = btrfs_drop_snapshot(trans, reloc_root);
6824 if (ret != -EAGAIN)
6825 break;
6826 mutex_unlock(&root->fs_info->drop_mutex);
6827
6828 nr = trans->blocks_used;
6829 ret = btrfs_end_transaction(trans, root);
6830 BUG_ON(ret);
6831 btrfs_btree_balance_dirty(root, nr);
6832 }
6833
6834 free_extent_buffer(reloc_root->node);
6835
6836 ret = btrfs_del_root(trans, root->fs_info->tree_root,
6837 &reloc_root->root_key);
6838 BUG_ON(ret);
6839 mutex_unlock(&root->fs_info->drop_mutex);
6840
6841 nr = trans->blocks_used;
6842 ret = btrfs_end_transaction(trans, root);
6843 BUG_ON(ret);
6844 btrfs_btree_balance_dirty(root, nr);
6845
6846 kfree(prev_root);
6847 prev_root = reloc_root;
6848 }
6849 if (prev_root) {
6850 btrfs_remove_leaf_refs(prev_root, (u64)-1, 0);
6851 kfree(prev_root);
6852 }
6853 return 0;
6854}
6855
6856int btrfs_add_dead_reloc_root(struct btrfs_root *root)
6857{
6858 list_add(&root->dead_list, &root->fs_info->dead_reloc_roots);
6859 return 0;
6860}
6861
6862int btrfs_cleanup_reloc_trees(struct btrfs_root *root)
6863{
6864 struct btrfs_root *reloc_root;
6865 struct btrfs_trans_handle *trans;
6866 struct btrfs_key location;
6867 int found;
6868 int ret;
6869
6870 mutex_lock(&root->fs_info->tree_reloc_mutex);
6871 ret = btrfs_find_dead_roots(root, BTRFS_TREE_RELOC_OBJECTID, NULL);
6872 BUG_ON(ret);
6873 found = !list_empty(&root->fs_info->dead_reloc_roots);
6874 mutex_unlock(&root->fs_info->tree_reloc_mutex);
6875
6876 if (found) {
6877 trans = btrfs_start_transaction(root, 1);
6878 BUG_ON(!trans);
6879 ret = btrfs_commit_transaction(trans, root);
6880 BUG_ON(ret);
6881 }
6882
6883 location.objectid = BTRFS_DATA_RELOC_TREE_OBJECTID;
6884 location.offset = (u64)-1;
6885 location.type = BTRFS_ROOT_ITEM_KEY;
6886
6887 reloc_root = btrfs_read_fs_root_no_name(root->fs_info, &location);
6888 BUG_ON(!reloc_root);
6889 btrfs_orphan_cleanup(reloc_root);
6890 return 0;
6891}
6892
d397712b 6893static noinline int init_reloc_tree(struct btrfs_trans_handle *trans,
1a40e23b
ZY
6894 struct btrfs_root *root)
6895{
6896 struct btrfs_root *reloc_root;
6897 struct extent_buffer *eb;
6898 struct btrfs_root_item *root_item;
6899 struct btrfs_key root_key;
6900 int ret;
6901
6902 BUG_ON(!root->ref_cows);
6903 if (root->reloc_root)
6904 return 0;
6905
6906 root_item = kmalloc(sizeof(*root_item), GFP_NOFS);
6907 BUG_ON(!root_item);
6908
6909 ret = btrfs_copy_root(trans, root, root->commit_root,
6910 &eb, BTRFS_TREE_RELOC_OBJECTID);
6911 BUG_ON(ret);
6912
6913 root_key.objectid = BTRFS_TREE_RELOC_OBJECTID;
6914 root_key.offset = root->root_key.objectid;
6915 root_key.type = BTRFS_ROOT_ITEM_KEY;
6916
6917 memcpy(root_item, &root->root_item, sizeof(*root_item));
6918 btrfs_set_root_refs(root_item, 0);
6919 btrfs_set_root_bytenr(root_item, eb->start);
6920 btrfs_set_root_level(root_item, btrfs_header_level(eb));
84234f3a 6921 btrfs_set_root_generation(root_item, trans->transid);
1a40e23b
ZY
6922
6923 btrfs_tree_unlock(eb);
6924 free_extent_buffer(eb);
6925
6926 ret = btrfs_insert_root(trans, root->fs_info->tree_root,
6927 &root_key, root_item);
6928 BUG_ON(ret);
6929 kfree(root_item);
6930
6931 reloc_root = btrfs_read_fs_root_no_radix(root->fs_info->tree_root,
6932 &root_key);
6933 BUG_ON(!reloc_root);
6934 reloc_root->last_trans = trans->transid;
6935 reloc_root->commit_root = NULL;
6936 reloc_root->ref_tree = &root->fs_info->reloc_ref_tree;
6937
6938 root->reloc_root = reloc_root;
6939 return 0;
6940}
6941
6942/*
6943 * Core function of space balance.
6944 *
6945 * The idea is to use reloc trees to relocate tree blocks in reference
f82d02d9
YZ
6946 * counted roots. There is one reloc tree for each subvol, and all
6947 * reloc trees share the same root key objectid. Reloc trees are snapshots
6948 * of the latest committed roots of subvols (root->commit_root).
6949 *
6950 * To relocate a tree block referenced by a subvol, there are two steps:
6951 * COW the block through the subvol's reloc tree, then update the block
6952 * pointer in the subvol to point to the new block. Since all reloc trees
6953 * share the same root key objectid, special handling for tree blocks
6954 * owned by them is easy. Once a tree block has been COWed in one reloc
6955 * tree, we can use the resulting new block directly when the same block
6956 * is required to COW again through other reloc trees. In this way, relocated
6957 * tree blocks are shared between reloc trees, so they are also shared
6958 * between subvols.
1a40e23b 6959 */
d397712b 6960static noinline int relocate_one_path(struct btrfs_trans_handle *trans,
1a40e23b
ZY
6961 struct btrfs_root *root,
6962 struct btrfs_path *path,
6963 struct btrfs_key *first_key,
6964 struct btrfs_ref_path *ref_path,
6965 struct btrfs_block_group_cache *group,
6966 struct inode *reloc_inode)
6967{
6968 struct btrfs_root *reloc_root;
6969 struct extent_buffer *eb = NULL;
6970 struct btrfs_key *keys;
6971 u64 *nodes;
6972 int level;
f82d02d9 6973 int shared_level;
1a40e23b 6974 int lowest_level = 0;
1a40e23b
ZY
6975 int ret;
6976
6977 if (ref_path->owner_objectid < BTRFS_FIRST_FREE_OBJECTID)
6978 lowest_level = ref_path->owner_objectid;
6979
f82d02d9 6980 if (!root->ref_cows) {
1a40e23b
ZY
6981 path->lowest_level = lowest_level;
6982 ret = btrfs_search_slot(trans, root, first_key, path, 0, 1);
6983 BUG_ON(ret < 0);
6984 path->lowest_level = 0;
6985 btrfs_release_path(root, path);
6986 return 0;
6987 }
6988
1a40e23b
ZY
6989 mutex_lock(&root->fs_info->tree_reloc_mutex);
6990 ret = init_reloc_tree(trans, root);
6991 BUG_ON(ret);
6992 reloc_root = root->reloc_root;
6993
f82d02d9
YZ
6994 shared_level = ref_path->shared_level;
6995 ref_path->shared_level = BTRFS_MAX_LEVEL - 1;
1a40e23b 6996
f82d02d9
YZ
6997 keys = ref_path->node_keys;
6998 nodes = ref_path->new_nodes;
6999 memset(&keys[shared_level + 1], 0,
7000 sizeof(*keys) * (BTRFS_MAX_LEVEL - shared_level - 1));
7001 memset(&nodes[shared_level + 1], 0,
7002 sizeof(*nodes) * (BTRFS_MAX_LEVEL - shared_level - 1));
1a40e23b 7003
f82d02d9
YZ
7004 if (nodes[lowest_level] == 0) {
7005 path->lowest_level = lowest_level;
7006 ret = btrfs_search_slot(trans, reloc_root, first_key, path,
7007 0, 1);
7008 BUG_ON(ret);
7009 for (level = lowest_level; level < BTRFS_MAX_LEVEL; level++) {
7010 eb = path->nodes[level];
7011 if (!eb || eb == reloc_root->node)
7012 break;
7013 nodes[level] = eb->start;
7014 if (level == 0)
7015 btrfs_item_key_to_cpu(eb, &keys[level], 0);
7016 else
7017 btrfs_node_key_to_cpu(eb, &keys[level], 0);
7018 }
2b82032c
YZ
7019 if (nodes[0] &&
7020 ref_path->owner_objectid >= BTRFS_FIRST_FREE_OBJECTID) {
f82d02d9
YZ
7021 eb = path->nodes[0];
7022 ret = replace_extents_in_leaf(trans, reloc_root, eb,
7023 group, reloc_inode);
7024 BUG_ON(ret);
7025 }
7026 btrfs_release_path(reloc_root, path);
7027 } else {
1a40e23b 7028 ret = btrfs_merge_path(trans, reloc_root, keys, nodes,
f82d02d9 7029 lowest_level);
1a40e23b
ZY
7030 BUG_ON(ret);
7031 }
7032
1a40e23b
ZY
7033 /*
7034 * replace tree blocks in the fs tree with tree blocks in
7035 * the reloc tree.
7036 */
7037 ret = btrfs_merge_path(trans, root, keys, nodes, lowest_level);
7038 BUG_ON(ret < 0);
7039
7040 if (ref_path->owner_objectid >= BTRFS_FIRST_FREE_OBJECTID) {
f82d02d9
YZ
7041 ret = btrfs_search_slot(trans, reloc_root, first_key, path,
7042 0, 0);
7043 BUG_ON(ret);
7044 extent_buffer_get(path->nodes[0]);
7045 eb = path->nodes[0];
7046 btrfs_release_path(reloc_root, path);
1a40e23b
ZY
7047 ret = invalidate_extent_cache(reloc_root, eb, group, root);
7048 BUG_ON(ret);
7049 free_extent_buffer(eb);
7050 }
1a40e23b 7051
f82d02d9 7052 mutex_unlock(&root->fs_info->tree_reloc_mutex);
1a40e23b 7053 path->lowest_level = 0;
1a40e23b
ZY
7054 return 0;
7055}
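#if 0
/*
 * As the comment before relocate_one_path() says, a block COWed
 * through any reloc tree can be reused directly when another reloc
 * tree needs to COW the same block -- relocation is effectively
 * memoized. A toy model of that idea (lookup/record helpers are
 * hypothetical):
 */
static u64 sketch_relocate_block(u64 old_bytenr)
{
	u64 new_bytenr = sketch_lookup_new_location(old_bytenr);

	if (!new_bytenr) {
		new_bytenr = sketch_cow_block(old_bytenr);
		sketch_record_new_location(old_bytenr, new_bytenr);
	}
	return new_bytenr;
}
#endif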

static noinline int relocate_tree_block(struct btrfs_trans_handle *trans,
					struct btrfs_root *root,
					struct btrfs_path *path,
					struct btrfs_key *first_key,
					struct btrfs_ref_path *ref_path)
{
	int ret;

	ret = relocate_one_path(trans, root, path, first_key,
				ref_path, NULL, NULL);
	BUG_ON(ret);

	return 0;
}

static noinline int del_extent_zero(struct btrfs_trans_handle *trans,
				    struct btrfs_root *extent_root,
				    struct btrfs_path *path,
				    struct btrfs_key *extent_key)
{
	int ret;

	ret = btrfs_search_slot(trans, extent_root, extent_key, path, -1, 1);
	if (ret)
		goto out;
	ret = btrfs_del_item(trans, extent_root, path);
out:
	btrfs_release_path(extent_root, path);
	return ret;
}

static noinline struct btrfs_root *read_ref_root(struct btrfs_fs_info *fs_info,
					struct btrfs_ref_path *ref_path)
{
	struct btrfs_key root_key;

	root_key.objectid = ref_path->root_objectid;
	root_key.type = BTRFS_ROOT_ITEM_KEY;
	if (is_cowonly_root(ref_path->root_objectid))
		root_key.offset = 0;
	else
		root_key.offset = (u64)-1;

	return btrfs_read_fs_root_no_name(fs_info, &root_key);
}
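
/*
 * Note (added for clarity, not in the original): cow-only roots (extent
 * tree, chunk tree, ...) are keyed with offset 0, while fs/subvolume
 * roots are looked up with offset (u64)-1, which finds the latest
 * version of the root item.
 */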

static noinline int relocate_one_extent(struct btrfs_root *extent_root,
					struct btrfs_path *path,
					struct btrfs_key *extent_key,
					struct btrfs_block_group_cache *group,
					struct inode *reloc_inode, int pass)
{
	struct btrfs_trans_handle *trans;
	struct btrfs_root *found_root;
	struct btrfs_ref_path *ref_path = NULL;
	struct disk_extent *new_extents = NULL;
	int nr_extents = 0;
	int loops;
	int ret;
	int level;
	struct btrfs_key first_key;
	u64 prev_block = 0;

	trans = btrfs_start_transaction(extent_root, 1);
	BUG_ON(!trans);

	if (extent_key->objectid == 0) {
		ret = del_extent_zero(trans, extent_root, path, extent_key);
		goto out;
	}

	ref_path = kmalloc(sizeof(*ref_path), GFP_NOFS);
	if (!ref_path) {
		ret = -ENOMEM;
		goto out;
	}

	for (loops = 0; ; loops++) {
		if (loops == 0) {
			ret = btrfs_first_ref_path(trans, extent_root, ref_path,
						   extent_key->objectid);
		} else {
			ret = btrfs_next_ref_path(trans, extent_root, ref_path);
		}
		if (ret < 0)
			goto out;
		if (ret > 0)
			break;

		if (ref_path->root_objectid == BTRFS_TREE_LOG_OBJECTID ||
		    ref_path->root_objectid == BTRFS_TREE_RELOC_OBJECTID)
			continue;

		found_root = read_ref_root(extent_root->fs_info, ref_path);
		BUG_ON(!found_root);
		/*
		 * for reference counted trees, only process reference paths
		 * rooted at the latest committed root.
		 */
		if (found_root->ref_cows &&
		    ref_path->root_generation != found_root->root_key.offset)
			continue;

		if (ref_path->owner_objectid >= BTRFS_FIRST_FREE_OBJECTID) {
			if (pass == 0) {
				/*
				 * copy data extents to new locations
				 */
				u64 group_start = group->key.objectid;
				ret = relocate_data_extent(reloc_inode,
							   extent_key,
							   group_start);
				if (ret < 0)
					goto out;
				break;
			}
			level = 0;
		} else {
			level = ref_path->owner_objectid;
		}

		if (prev_block != ref_path->nodes[level]) {
			struct extent_buffer *eb;
			u64 block_start = ref_path->nodes[level];
			u64 block_size = btrfs_level_size(found_root, level);

			eb = read_tree_block(found_root, block_start,
					     block_size, 0);
			btrfs_tree_lock(eb);
			BUG_ON(level != btrfs_header_level(eb));

			if (level == 0)
				btrfs_item_key_to_cpu(eb, &first_key, 0);
			else
				btrfs_node_key_to_cpu(eb, &first_key, 0);

			btrfs_tree_unlock(eb);
			free_extent_buffer(eb);
			prev_block = block_start;
		}

		mutex_lock(&extent_root->fs_info->trans_mutex);
		btrfs_record_root_in_trans(found_root);
		mutex_unlock(&extent_root->fs_info->trans_mutex);
		if (ref_path->owner_objectid >= BTRFS_FIRST_FREE_OBJECTID) {
			/*
			 * try to update data extent references while
			 * keeping metadata shared between snapshots.
			 */
			if (pass == 1) {
				ret = relocate_one_path(trans, found_root,
						path, &first_key, ref_path,
						group, reloc_inode);
				if (ret < 0)
					goto out;
				continue;
			}
			/*
			 * use fallback method to process the remaining
			 * references.
			 */
			if (!new_extents) {
				u64 group_start = group->key.objectid;
				new_extents = kmalloc(sizeof(*new_extents),
						      GFP_NOFS);
				if (!new_extents) {
					ret = -ENOMEM;
					goto out;
				}
				nr_extents = 1;
				ret = get_new_locations(reloc_inode,
							extent_key,
							group_start, 1,
							&new_extents,
							&nr_extents);
				if (ret)
					goto out;
			}
			ret = replace_one_extent(trans, found_root,
						path, extent_key,
						&first_key, ref_path,
						new_extents, nr_extents);
		} else {
			ret = relocate_tree_block(trans, found_root, path,
						  &first_key, ref_path);
		}
		if (ret < 0)
			goto out;
	}
	ret = 0;
out:
	btrfs_end_transaction(trans, extent_root);
	kfree(new_extents);
	kfree(ref_path);
	return ret;
}
#endif
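
/*
 * Note (added for clarity, not in the original): the #endif above closes
 * the #if 0 region that preserves the old reference-path based relocation
 * code; the live relocation implementation moved to fs/btrfs/relocation.c.
 */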

static u64 update_block_group_flags(struct btrfs_root *root, u64 flags)
{
	u64 num_devices;
	u64 stripped = BTRFS_BLOCK_GROUP_RAID0 |
		BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10;

	num_devices = root->fs_info->fs_devices->rw_devices;
	if (num_devices == 1) {
		stripped |= BTRFS_BLOCK_GROUP_DUP;
		stripped = flags & ~stripped;

		/* turn raid0 into single device chunks */
		if (flags & BTRFS_BLOCK_GROUP_RAID0)
			return stripped;

		/* turn mirroring into duplication */
		if (flags & (BTRFS_BLOCK_GROUP_RAID1 |
			     BTRFS_BLOCK_GROUP_RAID10))
			return stripped | BTRFS_BLOCK_GROUP_DUP;
		return flags;
	} else {
		/* they already had raid on here, just return */
		if (flags & stripped)
			return flags;

		stripped |= BTRFS_BLOCK_GROUP_DUP;
		stripped = flags & ~stripped;

		/* switch duplicated blocks with raid1 */
		if (flags & BTRFS_BLOCK_GROUP_DUP)
			return stripped | BTRFS_BLOCK_GROUP_RAID1;

		/* turn single device chunks into raid0 */
		return stripped | BTRFS_BLOCK_GROUP_RAID0;
	}
	return flags;
}
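
#if 0
/*
 * Worked example (added sketch, not part of the original file): on a
 * filesystem with a single rw device, a block group that was mirrored
 * across devices degrades to duplication on the one device.
 */
{
	u64 new_flags;

	/* assume root->fs_info->fs_devices->rw_devices == 1 here */
	new_flags = update_block_group_flags(root, BTRFS_BLOCK_GROUP_DATA |
					     BTRFS_BLOCK_GROUP_RAID1);
	/* new_flags == (BTRFS_BLOCK_GROUP_DATA | BTRFS_BLOCK_GROUP_DUP) */
}
#endif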

static int __alloc_chunk_for_shrink(struct btrfs_root *root,
		    struct btrfs_block_group_cache *shrink_block_group,
		    int force)
{
	struct btrfs_trans_handle *trans;
	u64 new_alloc_flags;
	u64 calc;

	spin_lock(&shrink_block_group->lock);
	if (btrfs_block_group_used(&shrink_block_group->item) +
	    shrink_block_group->reserved > 0) {
		spin_unlock(&shrink_block_group->lock);

		trans = btrfs_start_transaction(root, 1);
		spin_lock(&shrink_block_group->lock);

		new_alloc_flags = update_block_group_flags(root,
						   shrink_block_group->flags);
		if (new_alloc_flags != shrink_block_group->flags) {
			calc =
			     btrfs_block_group_used(&shrink_block_group->item);
		} else {
			calc = shrink_block_group->key.offset;
		}
		spin_unlock(&shrink_block_group->lock);

		do_chunk_alloc(trans, root->fs_info->extent_root,
			       calc + 2 * 1024 * 1024, new_alloc_flags, force);

		btrfs_end_transaction(trans, root);
	} else
		spin_unlock(&shrink_block_group->lock);
	return 0;
}
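
/*
 * Note (added for clarity, not in the original): if the raid profile is
 * about to change (e.g. raid1 falling back to dup on a one-device fs),
 * only the used bytes of the group need to fit in the new chunk, so
 * "calc" is the used byte count; otherwise a chunk as large as the whole
 * group is requested.  The extra 2MB is slack on top of that target.
 */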

int btrfs_prepare_block_group_relocation(struct btrfs_root *root,
					 struct btrfs_block_group_cache *group)
{
	__alloc_chunk_for_shrink(root, group, 1);
	set_block_group_readonly(group);
	return 0;
}
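
/*
 * Note (added for clarity, not in the original): relocation is prepared
 * by force-allocating a replacement chunk first, so the data has
 * somewhere to go, and only then marking the group read-only so no new
 * allocations land in it while it is being drained.
 */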

/*
 * checks to see if it's even possible to relocate this block group.
 *
 * @return - -1 if it's not a good idea to relocate this block group, 0 if it's
 * ok to go ahead and try.
 */
int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr)
{
	struct btrfs_block_group_cache *block_group;
	struct btrfs_space_info *space_info;
	struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices;
	struct btrfs_device *device;
	int full = 0;
	int ret = 0;

	block_group = btrfs_lookup_block_group(root->fs_info, bytenr);

	/* odd, couldn't find the block group, leave it alone */
	if (!block_group)
		return -1;

	/* no bytes used, we're good */
	if (!btrfs_block_group_used(&block_group->item))
		goto out;

	space_info = block_group->space_info;
	spin_lock(&space_info->lock);

	full = space_info->full;

	/*
	 * if this is the last block group we have in this space, we can't
	 * relocate it unless we're able to allocate a new chunk below.
	 *
	 * Otherwise, we need to make sure we have room in the space to handle
	 * all of the extents from this block group.  If we can, we're good
	 */
	if ((space_info->total_bytes != block_group->key.offset) &&
	    (space_info->bytes_used + space_info->bytes_reserved +
	     space_info->bytes_pinned + space_info->bytes_readonly +
	     btrfs_block_group_used(&block_group->item) <
	     space_info->total_bytes)) {
		spin_unlock(&space_info->lock);
		goto out;
	}
	spin_unlock(&space_info->lock);

	/*
	 * ok we don't have enough space, but maybe we have free space on our
	 * devices to allocate new chunks for relocation, so loop through our
	 * alloc devices and guess if we have enough space.  However, if we
	 * were marked as full, then we know there aren't enough chunks, and we
	 * can just return.
	 */
	ret = -1;
	if (full)
		goto out;

	mutex_lock(&root->fs_info->chunk_mutex);
	list_for_each_entry(device, &fs_devices->alloc_list, dev_alloc_list) {
		u64 min_free = btrfs_block_group_used(&block_group->item);
		u64 dev_offset, max_avail;

		/*
		 * check to make sure we can actually find a chunk with enough
		 * space to fit our block group in.
		 */
		if (device->total_bytes > device->bytes_used + min_free) {
			ret = find_free_dev_extent(NULL, device, min_free,
						   &dev_offset, &max_avail);
			if (!ret)
				break;
			ret = -1;
		}
	}
	mutex_unlock(&root->fs_info->chunk_mutex);
out:
	btrfs_put_block_group(block_group);
	return ret;
}
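
#if 0
/*
 * Usage sketch (hypothetical caller, not part of the original file): a
 * relocation path would check feasibility first and skip chunks that are
 * unlikely to relocate successfully.
 */
	ret = btrfs_can_relocate(extent_root, chunk_offset);
	if (ret)
		return -ENOSPC;	/* don't bother trying to relocate */
	/* ... proceed with relocating the block group at chunk_offset ... */
#endif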

static int find_first_block_group(struct btrfs_root *root,
				  struct btrfs_path *path,
				  struct btrfs_key *key)
{
	int ret = 0;
	struct btrfs_key found_key;
	struct extent_buffer *leaf;
	int slot;

	ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
	if (ret < 0)
		goto out;

	while (1) {
		slot = path->slots[0];
		leaf = path->nodes[0];
		if (slot >= btrfs_header_nritems(leaf)) {
			ret = btrfs_next_leaf(root, path);
			if (ret == 0)
				continue;
			if (ret < 0)
				goto out;
			break;
		}
		btrfs_item_key_to_cpu(leaf, &found_key, slot);

		if (found_key.objectid >= key->objectid &&
		    found_key.type == BTRFS_BLOCK_GROUP_ITEM_KEY) {
			ret = 0;
			goto out;
		}
		path->slots[0]++;
	}
out:
	return ret;
}

int btrfs_free_block_groups(struct btrfs_fs_info *info)
{
	struct btrfs_block_group_cache *block_group;
	struct btrfs_space_info *space_info;
	struct btrfs_caching_control *caching_ctl;
	struct rb_node *n;

	down_write(&info->extent_commit_sem);
	while (!list_empty(&info->caching_block_groups)) {
		caching_ctl = list_entry(info->caching_block_groups.next,
					 struct btrfs_caching_control, list);
		list_del(&caching_ctl->list);
		put_caching_control(caching_ctl);
	}
	up_write(&info->extent_commit_sem);

	spin_lock(&info->block_group_cache_lock);
	while ((n = rb_last(&info->block_group_cache_tree)) != NULL) {
		block_group = rb_entry(n, struct btrfs_block_group_cache,
				       cache_node);
		rb_erase(&block_group->cache_node,
			 &info->block_group_cache_tree);
		spin_unlock(&info->block_group_cache_lock);

		down_write(&block_group->space_info->groups_sem);
		list_del(&block_group->list);
		up_write(&block_group->space_info->groups_sem);

		if (block_group->cached == BTRFS_CACHE_STARTED)
			wait_block_group_cache_done(block_group);

		btrfs_remove_free_space_cache(block_group);
		btrfs_put_block_group(block_group);

		spin_lock(&info->block_group_cache_lock);
	}
	spin_unlock(&info->block_group_cache_lock);

	/* now that all the block groups are freed, go through and
	 * free all the space_info structs.  This is only called during
	 * the final stages of unmount, and so we know nobody is
	 * using them.  We call synchronize_rcu() once before we start,
	 * just to be on the safe side.
	 */
	synchronize_rcu();

	while (!list_empty(&info->space_info)) {
		space_info = list_entry(info->space_info.next,
					struct btrfs_space_info,
					list);

		list_del(&space_info->list);
		kfree(space_info);
	}
	return 0;
}

static void __link_block_group(struct btrfs_space_info *space_info,
			       struct btrfs_block_group_cache *cache)
{
	int index = get_block_group_index(cache);

	down_write(&space_info->groups_sem);
	list_add_tail(&cache->list, &space_info->block_groups[index]);
	up_write(&space_info->groups_sem);
}
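
/*
 * Note (added for clarity, not in the original): block_groups[] is an
 * array of per-raid-profile lists; with the index order used by
 * get_block_group_index(), 0..4 correspond to RAID10, RAID1, DUP, RAID0
 * and single.  That is why the read-only pass at the end of
 * btrfs_read_block_groups() below walks indexes 3 and 4.
 */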

int btrfs_read_block_groups(struct btrfs_root *root)
{
	struct btrfs_path *path;
	int ret;
	struct btrfs_block_group_cache *cache;
	struct btrfs_fs_info *info = root->fs_info;
	struct btrfs_space_info *space_info;
	struct btrfs_key key;
	struct btrfs_key found_key;
	struct extent_buffer *leaf;

	root = info->extent_root;
	key.objectid = 0;
	key.offset = 0;
	btrfs_set_key_type(&key, BTRFS_BLOCK_GROUP_ITEM_KEY);
	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	while (1) {
		ret = find_first_block_group(root, path, &key);
		if (ret > 0)
			break;
		if (ret != 0)
			goto error;

		leaf = path->nodes[0];
		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
		cache = kzalloc(sizeof(*cache), GFP_NOFS);
		if (!cache) {
			ret = -ENOMEM;
			break;
		}

		atomic_set(&cache->count, 1);
		spin_lock_init(&cache->lock);
		spin_lock_init(&cache->tree_lock);
		cache->fs_info = info;
		INIT_LIST_HEAD(&cache->list);
		INIT_LIST_HEAD(&cache->cluster_list);

		/*
		 * we only want to have 32k of ram per block group for keeping
		 * track of free space, and if we pass 1/2 of that we want to
		 * start converting things over to using bitmaps
		 */
		cache->extents_thresh = ((1024 * 32) / 2) /
			sizeof(struct btrfs_free_space);
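		/*
		 * Worked example (added, not in the original): on a 64-bit
		 * build where sizeof(struct btrfs_free_space) is roughly
		 * 40 bytes, this is (32768 / 2) / 40, i.e. around 400
		 * individual extent entries before switching to bitmaps.
		 */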

		read_extent_buffer(leaf, &cache->item,
				   btrfs_item_ptr_offset(leaf, path->slots[0]),
				   sizeof(cache->item));
		memcpy(&cache->key, &found_key, sizeof(found_key));

		key.objectid = found_key.objectid + found_key.offset;
		btrfs_release_path(root, path);
		cache->flags = btrfs_block_group_flags(&cache->item);
		cache->sectorsize = root->sectorsize;

		/*
		 * check for two cases, either we are full, and therefore
		 * don't need to bother with the caching work since we won't
		 * find any space, or we are empty, and we can just add all
		 * the space in and be done with it.  This saves us a lot of
		 * time, particularly in the full case.
		 */
		if (found_key.offset == btrfs_block_group_used(&cache->item)) {
			exclude_super_stripes(root, cache);
			cache->last_byte_to_unpin = (u64)-1;
			cache->cached = BTRFS_CACHE_FINISHED;
			free_excluded_extents(root, cache);
		} else if (btrfs_block_group_used(&cache->item) == 0) {
			exclude_super_stripes(root, cache);
			cache->last_byte_to_unpin = (u64)-1;
			cache->cached = BTRFS_CACHE_FINISHED;
			add_new_free_space(cache, root->fs_info,
					   found_key.objectid,
					   found_key.objectid +
					   found_key.offset);
			free_excluded_extents(root, cache);
		}

		ret = update_space_info(info, cache->flags, found_key.offset,
					btrfs_block_group_used(&cache->item),
					&space_info);
		BUG_ON(ret);
		cache->space_info = space_info;
		spin_lock(&cache->space_info->lock);
		cache->space_info->bytes_super += cache->bytes_super;
		spin_unlock(&cache->space_info->lock);

		__link_block_group(space_info, cache);

		ret = btrfs_add_block_group_cache(root->fs_info, cache);
		BUG_ON(ret);

		set_avail_alloc_bits(root->fs_info, cache->flags);
		if (btrfs_chunk_readonly(root, cache->key.objectid))
			set_block_group_readonly(cache);
	}

	list_for_each_entry_rcu(space_info, &root->fs_info->space_info, list) {
		if (!(get_alloc_profile(root, space_info->flags) &
		      (BTRFS_BLOCK_GROUP_RAID10 |
		       BTRFS_BLOCK_GROUP_RAID1 |
		       BTRFS_BLOCK_GROUP_DUP)))
			continue;
		/*
		 * avoid allocating from un-mirrored block group if there are
		 * mirrored block groups.
		 */
		list_for_each_entry(cache, &space_info->block_groups[3], list)
			set_block_group_readonly(cache);
		list_for_each_entry(cache, &space_info->block_groups[4], list)
			set_block_group_readonly(cache);
	}
	ret = 0;
error:
	btrfs_free_path(path);
	return ret;
}

int btrfs_make_block_group(struct btrfs_trans_handle *trans,
			   struct btrfs_root *root, u64 bytes_used,
			   u64 type, u64 chunk_objectid, u64 chunk_offset,
			   u64 size)
{
	int ret;
	struct btrfs_root *extent_root;
	struct btrfs_block_group_cache *cache;

	extent_root = root->fs_info->extent_root;

	root->fs_info->last_trans_log_full_commit = trans->transid;

	cache = kzalloc(sizeof(*cache), GFP_NOFS);
	if (!cache)
		return -ENOMEM;

	cache->key.objectid = chunk_offset;
	cache->key.offset = size;
	cache->key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
	cache->sectorsize = root->sectorsize;

	/*
	 * we only want to have 32k of ram per block group for keeping track
	 * of free space, and if we pass 1/2 of that we want to start
	 * converting things over to using bitmaps
	 */
	cache->extents_thresh = ((1024 * 32) / 2) /
		sizeof(struct btrfs_free_space);
	atomic_set(&cache->count, 1);
	spin_lock_init(&cache->lock);
	spin_lock_init(&cache->tree_lock);
	INIT_LIST_HEAD(&cache->list);
	INIT_LIST_HEAD(&cache->cluster_list);

	btrfs_set_block_group_used(&cache->item, bytes_used);
	btrfs_set_block_group_chunk_objectid(&cache->item, chunk_objectid);
	cache->flags = type;
	btrfs_set_block_group_flags(&cache->item, type);

	cache->last_byte_to_unpin = (u64)-1;
	cache->cached = BTRFS_CACHE_FINISHED;
	exclude_super_stripes(root, cache);

	add_new_free_space(cache, root->fs_info, chunk_offset,
			   chunk_offset + size);

	free_excluded_extents(root, cache);

	ret = update_space_info(root->fs_info, cache->flags, size, bytes_used,
				&cache->space_info);
	BUG_ON(ret);

	spin_lock(&cache->space_info->lock);
	cache->space_info->bytes_super += cache->bytes_super;
	spin_unlock(&cache->space_info->lock);

	__link_block_group(cache->space_info, cache);

	ret = btrfs_add_block_group_cache(root->fs_info, cache);
	BUG_ON(ret);

	ret = btrfs_insert_item(trans, extent_root, &cache->key, &cache->item,
				sizeof(cache->item));
	BUG_ON(ret);

	set_avail_alloc_bits(extent_root->fs_info, type);

	return 0;
}
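
#if 0
/*
 * Usage sketch (hypothetical values, not part of the original file): this
 * is roughly how chunk allocation creates the block group item that
 * describes a freshly allocated chunk.
 */
	ret = btrfs_make_block_group(trans, extent_root, 0, type,
				     BTRFS_FIRST_CHUNK_TREE_OBJECTID,
				     chunk_offset, chunk_size);
	BUG_ON(ret);
#endif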

int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
			     struct btrfs_root *root, u64 group_start)
{
	struct btrfs_path *path;
	struct btrfs_block_group_cache *block_group;
	struct btrfs_free_cluster *cluster;
	struct btrfs_key key;
	int ret;

	root = root->fs_info->extent_root;

	block_group = btrfs_lookup_block_group(root->fs_info, group_start);
	BUG_ON(!block_group);
	BUG_ON(!block_group->ro);

	memcpy(&key, &block_group->key, sizeof(key));

	/* make sure this block group isn't part of an allocation cluster */
	cluster = &root->fs_info->data_alloc_cluster;
	spin_lock(&cluster->refill_lock);
	btrfs_return_cluster_to_free_space(block_group, cluster);
	spin_unlock(&cluster->refill_lock);

	/*
	 * make sure this block group isn't part of a metadata
	 * allocation cluster
	 */
	cluster = &root->fs_info->meta_alloc_cluster;
	spin_lock(&cluster->refill_lock);
	btrfs_return_cluster_to_free_space(block_group, cluster);
	spin_unlock(&cluster->refill_lock);

	path = btrfs_alloc_path();
	BUG_ON(!path);

	spin_lock(&root->fs_info->block_group_cache_lock);
	rb_erase(&block_group->cache_node,
		 &root->fs_info->block_group_cache_tree);
	spin_unlock(&root->fs_info->block_group_cache_lock);

	down_write(&block_group->space_info->groups_sem);
	/*
	 * we must use list_del_init so people can check to see if they
	 * are still on the list after taking the semaphore
	 */
	list_del_init(&block_group->list);
	up_write(&block_group->space_info->groups_sem);

	if (block_group->cached == BTRFS_CACHE_STARTED)
		wait_block_group_cache_done(block_group);

	btrfs_remove_free_space_cache(block_group);

	spin_lock(&block_group->space_info->lock);
	block_group->space_info->total_bytes -= block_group->key.offset;
	block_group->space_info->bytes_readonly -= block_group->key.offset;
	spin_unlock(&block_group->space_info->lock);

	btrfs_clear_space_info_full(root->fs_info);

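	/*
	 * Note (added for clarity, not in the original): both puts are
	 * intentional -- one drops the reference taken by
	 * btrfs_lookup_block_group() above, the other drops the reference
	 * that was held by the block group cache rbtree.
	 */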
	btrfs_put_block_group(block_group);
	btrfs_put_block_group(block_group);

	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
	if (ret > 0)
		ret = -EIO;
	if (ret < 0)
		goto out;

	ret = btrfs_del_item(trans, root, path);
out:
	btrfs_free_path(path);
	return ret;
}