/*
   drbd_worker.c

   This file is part of DRBD by Philipp Reisner and Lars Ellenberg.

   Copyright (C) 2001-2008, LINBIT Information Technologies GmbH.
   Copyright (C) 1999-2008, Philipp Reisner <philipp.reisner@linbit.com>.
   Copyright (C) 2002-2008, Lars Ellenberg <lars.ellenberg@linbit.com>.

   drbd is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 2, or (at your option)
   any later version.

   drbd is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with drbd; see the file COPYING.  If not, write to
   the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.

 */

#include <linux/module.h>
#include <linux/drbd.h>
#include <linux/sched.h>
#include <linux/smp_lock.h>
#include <linux/wait.h>
#include <linux/mm.h>
#include <linux/memcontrol.h>
#include <linux/mm_inline.h>
#include <linux/slab.h>
#include <linux/random.h>
#include <linux/string.h>
#include <linux/scatterlist.h>

#include "drbd_int.h"
#include "drbd_req.h"

static int w_make_ov_request(struct drbd_conf *mdev, struct drbd_work *w, int cancel);



/* defined here:
   drbd_md_io_complete
   drbd_endio_sec
   drbd_endio_pri

 * more endio handlers:
   atodb_endio in drbd_actlog.c
   drbd_bm_async_io_complete in drbd_bitmap.c

 * For all these callbacks, note the following:
 * The callbacks will be called in irq context by the IDE drivers,
 * and in Softirqs/Tasklets/BH context by the SCSI drivers.
 * Try to get the locking right :)
 *
 */


/* About the global_state_lock
   Each state transition on a device holds a read lock. In case we have
   to evaluate the sync after dependencies, we grab a write lock, because
   we need stable states on all devices for that. */
rwlock_t global_state_lock;

/* used for synchronous meta data and bitmap IO
 * submitted by drbd_md_sync_page_io()
 */
void drbd_md_io_complete(struct bio *bio, int error)
{
	struct drbd_md_io *md_io;

	md_io = (struct drbd_md_io *)bio->bi_private;
	md_io->error = error;

	complete(&md_io->event);
}

/* reads on behalf of the partner,
 * "submitted" by the receiver
 */
void drbd_endio_read_sec_final(struct drbd_epoch_entry *e) __releases(local)
{
	unsigned long flags = 0;
	struct drbd_conf *mdev = e->mdev;

	D_ASSERT(e->block_id != ID_VACANT);

	spin_lock_irqsave(&mdev->req_lock, flags);
	mdev->read_cnt += e->size >> 9;
	list_del(&e->w.list);
	if (list_empty(&mdev->read_ee))
		wake_up(&mdev->ee_wait);
	if (test_bit(__EE_WAS_ERROR, &e->flags))
		__drbd_chk_io_error(mdev, FALSE);
	spin_unlock_irqrestore(&mdev->req_lock, flags);

	drbd_queue_work(&mdev->data.work, &e->w);
	put_ldev(mdev);
}

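/* A failed barrier request is recognized by this flag combination:
 * EE_IS_BARRIER and EE_WAS_ERROR set, but not yet EE_RESUBMITTED.
 * Such a request gets rescheduled without the barrier, see
 * drbd_endio_write_sec_final() below. */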
static int is_failed_barrier(int ee_flags)
{
	return (ee_flags & (EE_IS_BARRIER|EE_WAS_ERROR|EE_RESUBMITTED))
			== (EE_IS_BARRIER|EE_WAS_ERROR);
}

/* writes on behalf of the partner, or resync writes,
 * "submitted" by the receiver, final stage. */
static void drbd_endio_write_sec_final(struct drbd_epoch_entry *e) __releases(local)
{
	unsigned long flags = 0;
	struct drbd_conf *mdev = e->mdev;
	sector_t e_sector;
	int do_wake;
	int is_syncer_req;
	int do_al_complete_io;

	/* if this is a failed barrier request, disable use of barriers,
	 * and schedule for resubmission */
	if (is_failed_barrier(e->flags)) {
		drbd_bump_write_ordering(mdev, WO_bdev_flush);
		spin_lock_irqsave(&mdev->req_lock, flags);
		list_del(&e->w.list);
		e->flags = (e->flags & ~EE_WAS_ERROR) | EE_RESUBMITTED;
		e->w.cb = w_e_reissue;
		/* put_ldev actually happens below, once we come here again. */
		__release(local);
		spin_unlock_irqrestore(&mdev->req_lock, flags);
		drbd_queue_work(&mdev->data.work, &e->w);
		return;
	}

	D_ASSERT(e->block_id != ID_VACANT);

	/* after we moved e to done_ee,
	 * we may no longer access it,
	 * it may be freed/reused already!
	 * (as soon as we release the req_lock) */
	e_sector = e->sector;
	do_al_complete_io = e->flags & EE_CALL_AL_COMPLETE_IO;
	is_syncer_req = is_syncer_block_id(e->block_id);

	spin_lock_irqsave(&mdev->req_lock, flags);
	mdev->writ_cnt += e->size >> 9;
	list_del(&e->w.list); /* has been on active_ee or sync_ee */
	list_add_tail(&e->w.list, &mdev->done_ee);

	/* No hlist_del_init(&e->colision) here, we did not send the Ack yet,
	 * neither did we wake possibly waiting conflicting requests.
	 * done from "drbd_process_done_ee" within the appropriate w.cb
	 * (e_end_block/e_end_resync_block) or from _drbd_clear_done_ee */

	do_wake = is_syncer_req
		? list_empty(&mdev->sync_ee)
		: list_empty(&mdev->active_ee);

	if (test_bit(__EE_WAS_ERROR, &e->flags))
		__drbd_chk_io_error(mdev, FALSE);
	spin_unlock_irqrestore(&mdev->req_lock, flags);

	if (is_syncer_req)
		drbd_rs_complete_io(mdev, e_sector);

	if (do_wake)
		wake_up(&mdev->ee_wait);

	if (do_al_complete_io)
		drbd_al_complete_io(mdev, e_sector);

	wake_asender(mdev);
	put_ldev(mdev);
}

/* writes on behalf of the partner, or resync writes,
 * "submitted" by the receiver.
 */
void drbd_endio_sec(struct bio *bio, int error)
{
	struct drbd_epoch_entry *e = bio->bi_private;
	struct drbd_conf *mdev = e->mdev;
	int uptodate = bio_flagged(bio, BIO_UPTODATE);
	int is_write = bio_data_dir(bio) == WRITE;

	if (error)
		dev_warn(DEV, "%s: error=%d s=%llus\n",
				is_write ? "write" : "read", error,
				(unsigned long long)e->sector);
	if (!error && !uptodate) {
		dev_warn(DEV, "%s: setting error to -EIO s=%llus\n",
				is_write ? "write" : "read",
				(unsigned long long)e->sector);
		/* strange behavior of some lower level drivers...
		 * fail the request by clearing the uptodate flag,
		 * but do not return any error?! */
		error = -EIO;
	}

	if (error)
		set_bit(__EE_WAS_ERROR, &e->flags);

	bio_put(bio); /* no need for the bio anymore */
	if (atomic_dec_and_test(&e->pending_bios)) {
		if (is_write)
			drbd_endio_write_sec_final(e);
		else
			drbd_endio_read_sec_final(e);
	}
}

/* read, readA or write requests on R_PRIMARY coming from drbd_make_request
 */
void drbd_endio_pri(struct bio *bio, int error)
{
	unsigned long flags;
	struct drbd_request *req = bio->bi_private;
	struct drbd_conf *mdev = req->mdev;
	struct bio_and_error m;
	enum drbd_req_event what;
	int uptodate = bio_flagged(bio, BIO_UPTODATE);

	if (!error && !uptodate) {
		dev_warn(DEV, "p %s: setting error to -EIO\n",
			 bio_data_dir(bio) == WRITE ? "write" : "read");
		/* strange behavior of some lower level drivers...
		 * fail the request by clearing the uptodate flag,
		 * but do not return any error?! */
		error = -EIO;
	}

	/* to avoid recursion in __req_mod */
	if (unlikely(error)) {
		what = (bio_data_dir(bio) == WRITE)
			? write_completed_with_error
			: (bio_rw(bio) == READ)
			  ? read_completed_with_error
			  : read_ahead_completed_with_error;
	} else
		what = completed_ok;

	bio_put(req->private_bio);
	req->private_bio = ERR_PTR(error);

	spin_lock_irqsave(&mdev->req_lock, flags);
	__req_mod(req, what, &m);
	spin_unlock_irqrestore(&mdev->req_lock, flags);

	if (m.bio)
		complete_master_bio(mdev, &m);
}

int w_read_retry_remote(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
{
	struct drbd_request *req = container_of(w, struct drbd_request, w);

	/* We should not detach for read io-error,
	 * but try to WRITE the P_DATA_REPLY to the failed location,
	 * to give the disk the chance to relocate that block */

	spin_lock_irq(&mdev->req_lock);
	if (cancel || mdev->state.pdsk != D_UP_TO_DATE) {
		_req_mod(req, read_retry_remote_canceled);
		spin_unlock_irq(&mdev->req_lock);
		return 1;
	}
	spin_unlock_irq(&mdev->req_lock);

	return w_send_read_req(mdev, w, 0);
}

int w_resync_inactive(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
{
	ERR_IF(cancel) return 1;
	dev_err(DEV, "resync inactive, but callback triggered??\n");
	return 1; /* Simply ignore this! */
}

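/* Hash all payload pages of an epoch entry with the given transform.
 * The pages are chained via page_chain_next(); every page but the last
 * is fully used, the last one may be only partially filled when e->size
 * is not a multiple of PAGE_SIZE. */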
void drbd_csum_ee(struct drbd_conf *mdev, struct crypto_hash *tfm, struct drbd_epoch_entry *e, void *digest)
{
	struct hash_desc desc;
	struct scatterlist sg;
	struct page *page = e->pages;
	struct page *tmp;
	unsigned len;

	desc.tfm = tfm;
	desc.flags = 0;

	sg_init_table(&sg, 1);
	crypto_hash_init(&desc);

	while ((tmp = page_chain_next(page))) {
		/* all but the last page will be fully used */
		sg_set_page(&sg, page, PAGE_SIZE, 0);
		crypto_hash_update(&desc, &sg, sg.length);
		page = tmp;
	}
	/* and now the last, possibly only partially used page */
	len = e->size & (PAGE_SIZE - 1);
	sg_set_page(&sg, page, len ?: PAGE_SIZE, 0);
	crypto_hash_update(&desc, &sg, sg.length);
	crypto_hash_final(&desc, digest);
}

void drbd_csum_bio(struct drbd_conf *mdev, struct crypto_hash *tfm, struct bio *bio, void *digest)
{
	struct hash_desc desc;
	struct scatterlist sg;
	struct bio_vec *bvec;
	int i;

	desc.tfm = tfm;
	desc.flags = 0;

	sg_init_table(&sg, 1);
	crypto_hash_init(&desc);

	__bio_for_each_segment(bvec, bio, i, 0) {
		sg_set_page(&sg, bvec->bv_page, bvec->bv_len, bvec->bv_offset);
		crypto_hash_update(&desc, &sg, sg.length);
	}
	crypto_hash_final(&desc, digest);
}

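/* Worker callback for checksum based resync on the sync target side:
 * hash the block that read_for_csum() read locally and send the digest
 * to the peer as a P_CSUM_RS_REQUEST, so the peer can decide whether
 * the block actually needs to be transferred. */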
static int w_e_send_csum(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
{
	struct drbd_epoch_entry *e = container_of(w, struct drbd_epoch_entry, w);
	int digest_size;
	void *digest;
	int ok;

	D_ASSERT(e->block_id == DRBD_MAGIC + 0xbeef);

	if (unlikely(cancel)) {
		drbd_free_ee(mdev, e);
		return 1;
	}

	if (likely((e->flags & EE_WAS_ERROR) == 0)) {
		digest_size = crypto_hash_digestsize(mdev->csums_tfm);
		digest = kmalloc(digest_size, GFP_NOIO);
		if (digest) {
			drbd_csum_ee(mdev, mdev->csums_tfm, e, digest);

			inc_rs_pending(mdev);
			ok = drbd_send_drequest_csum(mdev,
						     e->sector,
						     e->size,
						     digest,
						     digest_size,
						     P_CSUM_RS_REQUEST);
			kfree(digest);
		} else {
			dev_err(DEV, "kmalloc() of digest failed.\n");
			ok = 0;
		}
	} else
		ok = 1;

	drbd_free_ee(mdev, e);

	if (unlikely(!ok))
		dev_err(DEV, "drbd_send_drequest(..., csum) failed\n");
	return ok;
}

#define GFP_TRY	(__GFP_HIGHMEM | __GFP_NOWARN)

static int read_for_csum(struct drbd_conf *mdev, sector_t sector, int size)
{
	struct drbd_epoch_entry *e;

	if (!get_ldev(mdev))
		return 0;

	/* GFP_TRY, because if there is no memory available right now, this may
	 * be rescheduled for later. It is "only" background resync, after all. */
	e = drbd_alloc_ee(mdev, DRBD_MAGIC+0xbeef, sector, size, GFP_TRY);
	if (!e)
		goto fail;

	spin_lock_irq(&mdev->req_lock);
	list_add(&e->w.list, &mdev->read_ee);
	spin_unlock_irq(&mdev->req_lock);

	e->w.cb = w_e_send_csum;
	if (drbd_submit_ee(mdev, e, READ, DRBD_FAULT_RS_RD) == 0)
		return 1;

	drbd_free_ee(mdev, e);
fail:
	put_ldev(mdev);
	return 2;
}

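/* Resync/online-verify pacing timer.  Unless STOP_SYNC_TIMER is set,
 * point mdev->resync_work at the callback matching the current
 * connection state (w_make_ov_request for C_VERIFY_S, otherwise
 * w_make_resync_request) and put it on the worker queue. */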
void resync_timer_fn(unsigned long data)
{
	unsigned long flags;
	struct drbd_conf *mdev = (struct drbd_conf *) data;
	int queue;

	spin_lock_irqsave(&mdev->req_lock, flags);

	if (likely(!test_and_clear_bit(STOP_SYNC_TIMER, &mdev->flags))) {
		queue = 1;
		if (mdev->state.conn == C_VERIFY_S)
			mdev->resync_work.cb = w_make_ov_request;
		else
			mdev->resync_work.cb = w_make_resync_request;
	} else {
		queue = 0;
		mdev->resync_work.cb = w_resync_inactive;
	}

	spin_unlock_irqrestore(&mdev->req_lock, flags);

	/* harmless race: list_empty outside data.work.q_lock */
	if (list_empty(&mdev->resync_work.list) && queue)
		drbd_queue_work(&mdev->data.work, &mdev->resync_work);
}

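/* Small ring buffer used by the resync controller to spread a planned
 * correction over the next few invocations: fifo_push() replaces the
 * value at the head index with a new one and returns the old value,
 * fifo_add_val() adds a per-step share to every slot, and fifo_set()
 * overwrites every slot (used to reset the plan). */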
static void fifo_set(struct fifo_buffer *fb, int value)
{
	int i;

	for (i = 0; i < fb->size; i++)
		fb->values[i] = value;
}

static int fifo_push(struct fifo_buffer *fb, int value)
{
	int ov;

	ov = fb->values[fb->head_index];
	fb->values[fb->head_index++] = value;

	if (fb->head_index >= fb->size)
		fb->head_index = 0;

	return ov;
}

static void fifo_add_val(struct fifo_buffer *fb, int value)
{
	int i;

	for (i = 0; i < fb->size; i++)
		fb->values[i] += value;
}

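/* Dynamic resync speed controller: based on the number of resync sectors
 * acknowledged since the last invocation (rs_sect_in), compute how many
 * sectors to request during the next SLEEP_TIME interval.  The target is
 * either a fixed fill level (c_fill_target) or a constant delay
 * (c_delay_target); the difference between the target and what is in
 * flight or already planned is spread over the rs_plan_s fifo
 * (c_plan_ahead steps), and the result is clamped to c_max_rate. */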
int drbd_rs_controller(struct drbd_conf *mdev)
{
	unsigned int sect_in;  /* Number of sectors that came in since the last turn */
	unsigned int want;     /* The number of sectors we want in the proxy */
	int req_sect; /* Number of sectors to request in this turn */
	int correction; /* Number of sectors more we need in the proxy*/
	int cps; /* correction per invocation of drbd_rs_controller() */
	int steps; /* Number of time steps to plan ahead */
	int curr_corr;
	int max_sect;

	sect_in = atomic_xchg(&mdev->rs_sect_in, 0); /* Number of sectors that came in */
	mdev->rs_in_flight -= sect_in;

	spin_lock(&mdev->peer_seq_lock); /* get an atomic view on mdev->rs_plan_s */

	steps = mdev->rs_plan_s.size; /* (mdev->sync_conf.c_plan_ahead * 10 * SLEEP_TIME) / HZ; */

	if (mdev->rs_in_flight + sect_in == 0) { /* At start of resync */
		want = ((mdev->sync_conf.rate * 2 * SLEEP_TIME) / HZ) * steps;
	} else { /* normal path */
		want = mdev->sync_conf.c_fill_target ? mdev->sync_conf.c_fill_target :
			sect_in * mdev->sync_conf.c_delay_target * HZ / (SLEEP_TIME * 10);
	}

	correction = want - mdev->rs_in_flight - mdev->rs_planed;

	/* Plan ahead */
	cps = correction / steps;
	fifo_add_val(&mdev->rs_plan_s, cps);
	mdev->rs_planed += cps * steps;

	/* What we do in this step */
	curr_corr = fifo_push(&mdev->rs_plan_s, 0);
	spin_unlock(&mdev->peer_seq_lock);
	mdev->rs_planed -= curr_corr;

	req_sect = sect_in + curr_corr;
	if (req_sect < 0)
		req_sect = 0;

	max_sect = (mdev->sync_conf.c_max_rate * 2 * SLEEP_TIME) / HZ;
	if (req_sect > max_sect)
		req_sect = max_sect;

	/*
	dev_warn(DEV, "si=%u if=%d wa=%u co=%d st=%d cps=%d pl=%d cc=%d rs=%d\n",
		 sect_in, mdev->rs_in_flight, want, correction,
		 steps, cps, mdev->rs_planed, curr_corr, req_sect);
	*/

	return req_sect;
}

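/* Worker callback that drives the resync as sync target: pick the next
 * dirty bits from the bitmap, merge adjacent bits into larger, aligned
 * requests, and send P_RS_DATA_REQUEST (or first read the local block
 * for checksum based resync).  The number of requests per invocation is
 * paced either by the configured rate or by drbd_rs_controller(), and
 * additionally throttled by the peer's receive buffer and our own send
 * buffer fill level. */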
int w_make_resync_request(struct drbd_conf *mdev,
		struct drbd_work *w, int cancel)
{
	unsigned long bit;
	sector_t sector;
	const sector_t capacity = drbd_get_capacity(mdev->this_bdev);
	int max_segment_size;
	int number, i, rollback_i, size, pe, mx;
	int align, queued, sndbuf;

	if (unlikely(cancel))
		return 1;

	if (unlikely(mdev->state.conn < C_CONNECTED)) {
		dev_err(DEV, "Confused in w_make_resync_request()! cstate < Connected");
		return 0;
	}

	if (mdev->state.conn != C_SYNC_TARGET)
		dev_err(DEV, "%s in w_make_resync_request\n",
			drbd_conn_str(mdev->state.conn));

	if (!get_ldev(mdev)) {
		/* Since we only need to access mdev->rsync a
		   get_ldev_if_state(mdev,D_FAILED) would be sufficient, but
		   to continue resync with a broken disk makes no sense at
		   all */
		dev_err(DEV, "Disk broke down during resync!\n");
		mdev->resync_work.cb = w_resync_inactive;
		return 1;
	}

	/* starting with drbd 8.3.8, we can handle multi-bio EEs,
	 * if it should be necessary */
	max_segment_size = mdev->agreed_pro_version < 94 ?
		queue_max_segment_size(mdev->rq_queue) : DRBD_MAX_SEGMENT_SIZE;

	if (mdev->rs_plan_s.size) { /* mdev->sync_conf.c_plan_ahead */
		number = drbd_rs_controller(mdev) >> (BM_BLOCK_SHIFT - 9);
		mdev->c_sync_rate = number * HZ * (BM_BLOCK_SIZE / 1024) / SLEEP_TIME;
	} else {
		mdev->c_sync_rate = mdev->sync_conf.rate;
		number = SLEEP_TIME * mdev->c_sync_rate / ((BM_BLOCK_SIZE / 1024) * HZ);
	}
	pe = atomic_read(&mdev->rs_pending_cnt);

	mutex_lock(&mdev->data.mutex);
	if (mdev->data.socket)
		mx = mdev->data.socket->sk->sk_rcvbuf / sizeof(struct p_block_req);
	else
		mx = 1;
	mutex_unlock(&mdev->data.mutex);

	/* For resync rates >160MB/sec, allow more pending RS requests */
	if (number > mx)
		mx = number;

	/* Limit the number of pending RS requests to no more than the peer's receive buffer */
	if ((pe + number) > mx) {
		number = mx - pe;
	}

	for (i = 0; i < number; i++) {
		/* Stop generating RS requests, when half of the send buffer is filled */
		mutex_lock(&mdev->data.mutex);
		if (mdev->data.socket) {
			queued = mdev->data.socket->sk->sk_wmem_queued;
			sndbuf = mdev->data.socket->sk->sk_sndbuf;
		} else {
			queued = 1;
			sndbuf = 0;
		}
		mutex_unlock(&mdev->data.mutex);
		if (queued > sndbuf / 2)
			goto requeue;

next_sector:
		size = BM_BLOCK_SIZE;
		bit = drbd_bm_find_next(mdev, mdev->bm_resync_fo);

		if (bit == -1UL) {
			mdev->bm_resync_fo = drbd_bm_bits(mdev);
			mdev->resync_work.cb = w_resync_inactive;
			put_ldev(mdev);
			return 1;
		}

		sector = BM_BIT_TO_SECT(bit);

		if (drbd_try_rs_begin_io(mdev, sector)) {
			mdev->bm_resync_fo = bit;
			goto requeue;
		}
		mdev->bm_resync_fo = bit + 1;

		if (unlikely(drbd_bm_test_bit(mdev, bit) == 0)) {
			drbd_rs_complete_io(mdev, sector);
			goto next_sector;
		}

#if DRBD_MAX_SEGMENT_SIZE > BM_BLOCK_SIZE
		/* try to find some adjacent bits.
		 * we stop if we have already the maximum req size.
		 *
		 * Additionally always align bigger requests, in order to
		 * be prepared for all stripe sizes of software RAIDs.
		 */
		align = 1;
		rollback_i = i;
		for (;;) {
			if (size + BM_BLOCK_SIZE > max_segment_size)
				break;

			/* Be always aligned */
			if (sector & ((1<<(align+3))-1))
				break;

			/* do not cross extent boundaries */
			if (((bit+1) & BM_BLOCKS_PER_BM_EXT_MASK) == 0)
				break;
			/* now, is it actually dirty, after all?
			 * caution, drbd_bm_test_bit is tri-state for some
			 * obscure reason; ( b == 0 ) would get the out-of-band
			 * only accidentally right because of the "oddly sized"
			 * adjustment below */
			if (drbd_bm_test_bit(mdev, bit+1) != 1)
				break;
			bit++;
			size += BM_BLOCK_SIZE;
			if ((BM_BLOCK_SIZE << align) <= size)
				align++;
			i++;
		}
		/* if we merged some,
		 * reset the offset to start the next drbd_bm_find_next from */
		if (size > BM_BLOCK_SIZE)
			mdev->bm_resync_fo = bit + 1;
#endif

		/* adjust very last sectors, in case we are oddly sized */
		if (sector + (size>>9) > capacity)
			size = (capacity-sector)<<9;
		if (mdev->agreed_pro_version >= 89 && mdev->csums_tfm) {
			switch (read_for_csum(mdev, sector, size)) {
			case 0: /* Disk failure */
				put_ldev(mdev);
				return 0;
			case 2: /* Allocation failed */
				drbd_rs_complete_io(mdev, sector);
				mdev->bm_resync_fo = BM_SECT_TO_BIT(sector);
				i = rollback_i;
				goto requeue;
			/* case 1: everything ok */
			}
		} else {
			inc_rs_pending(mdev);
			if (!drbd_send_drequest(mdev, P_RS_DATA_REQUEST,
					       sector, size, ID_SYNCER)) {
				dev_err(DEV, "drbd_send_drequest() failed, aborting...\n");
				dec_rs_pending(mdev);
				put_ldev(mdev);
				return 0;
			}
		}
	}

	if (mdev->bm_resync_fo >= drbd_bm_bits(mdev)) {
		/* last syncer _request_ was sent,
		 * but the P_RS_DATA_REPLY not yet received.  sync will end (and
		 * next sync group will resume), as soon as we receive the last
		 * resync data block, and the last bit is cleared.
		 * until then resync "work" is "inactive" ...
		 */
		mdev->resync_work.cb = w_resync_inactive;
		put_ldev(mdev);
		return 1;
	}

 requeue:
	mdev->rs_in_flight += (i << (BM_BLOCK_SHIFT - 9));
	mod_timer(&mdev->resync_timer, jiffies + SLEEP_TIME);
	put_ldev(mdev);
	return 1;
}

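/* Worker callback that drives an online verify run: walk the device
 * linearly from ov_position and send one P_OV_REQUEST per BM_BLOCK_SIZE
 * chunk, limited per invocation by the configured sync rate and the
 * number of still pending requests. */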
static int w_make_ov_request(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
{
	int number, i, size;
	sector_t sector;
	const sector_t capacity = drbd_get_capacity(mdev->this_bdev);

	if (unlikely(cancel))
		return 1;

	if (unlikely(mdev->state.conn < C_CONNECTED)) {
		dev_err(DEV, "Confused in w_make_ov_request()! cstate < Connected");
		return 0;
	}

	number = SLEEP_TIME*mdev->sync_conf.rate / ((BM_BLOCK_SIZE/1024)*HZ);
	if (atomic_read(&mdev->rs_pending_cnt) > number)
		goto requeue;

	number -= atomic_read(&mdev->rs_pending_cnt);

	sector = mdev->ov_position;
	for (i = 0; i < number; i++) {
		if (sector >= capacity) {
			mdev->resync_work.cb = w_resync_inactive;
			return 1;
		}

		size = BM_BLOCK_SIZE;

		if (drbd_try_rs_begin_io(mdev, sector)) {
			mdev->ov_position = sector;
			goto requeue;
		}

		if (sector + (size>>9) > capacity)
			size = (capacity-sector)<<9;

		inc_rs_pending(mdev);
		if (!drbd_send_ov_request(mdev, sector, size)) {
			dec_rs_pending(mdev);
			return 0;
		}
		sector += BM_SECT_PER_BIT;
	}
	mdev->ov_position = sector;

 requeue:
	mod_timer(&mdev->resync_timer, jiffies + SLEEP_TIME);
	return 1;
}


int w_ov_finished(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
{
	kfree(w);
	ov_oos_print(mdev);
	drbd_resync_finished(mdev);

	return 1;
}

static int w_resync_finished(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
{
	kfree(w);

	drbd_resync_finished(mdev);

	return 1;
}

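/* Finish a resync or online verify run: flush the resync LRU, report
 * throughput and checksum statistics, set the new disk/peer-disk states
 * and update the UUID sets, then kick off the configured helper
 * ("out-of-sync" or "after-resync-target") if appropriate. */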
int drbd_resync_finished(struct drbd_conf *mdev)
{
	unsigned long db, dt, dbdt;
	unsigned long n_oos;
	union drbd_state os, ns;
	struct drbd_work *w;
	char *khelper_cmd = NULL;

	/* Remove all elements from the resync LRU. Since future actions
	 * might set bits in the (main) bitmap, then the entries in the
	 * resync LRU would be wrong. */
	if (drbd_rs_del_all(mdev)) {
		/* In case this is not possible now, most probably because
		 * there are P_RS_DATA_REPLY Packets lingering on the worker's
		 * queue (or even the read operations for those packets
		 * is not finished by now). Retry in 100ms. */

		drbd_kick_lo(mdev);
		__set_current_state(TASK_INTERRUPTIBLE);
		schedule_timeout(HZ / 10);
		w = kmalloc(sizeof(struct drbd_work), GFP_ATOMIC);
		if (w) {
			w->cb = w_resync_finished;
			drbd_queue_work(&mdev->data.work, w);
			return 1;
		}
		dev_err(DEV, "Warn failed to drbd_rs_del_all() and to kmalloc(w).\n");
	}

	dt = (jiffies - mdev->rs_start - mdev->rs_paused) / HZ;
	if (dt <= 0)
		dt = 1;
	db = mdev->rs_total;
	dbdt = Bit2KB(db/dt);
	mdev->rs_paused /= HZ;

	if (!get_ldev(mdev))
		goto out;

	spin_lock_irq(&mdev->req_lock);
	os = mdev->state;

	/* This protects us against multiple calls (that can happen in the presence
	   of application IO), and against connectivity loss just before we arrive here. */
	if (os.conn <= C_CONNECTED)
		goto out_unlock;

	ns = os;
	ns.conn = C_CONNECTED;

	dev_info(DEV, "%s done (total %lu sec; paused %lu sec; %lu K/sec)\n",
	     (os.conn == C_VERIFY_S || os.conn == C_VERIFY_T) ?
	     "Online verify " : "Resync",
	     dt + mdev->rs_paused, mdev->rs_paused, dbdt);

	n_oos = drbd_bm_total_weight(mdev);

	if (os.conn == C_VERIFY_S || os.conn == C_VERIFY_T) {
		if (n_oos) {
			dev_alert(DEV, "Online verify found %lu %dk block out of sync!\n",
			      n_oos, Bit2KB(1));
			khelper_cmd = "out-of-sync";
		}
	} else {
		D_ASSERT((n_oos - mdev->rs_failed) == 0);

		if (os.conn == C_SYNC_TARGET || os.conn == C_PAUSED_SYNC_T)
			khelper_cmd = "after-resync-target";

		if (mdev->csums_tfm && mdev->rs_total) {
			const unsigned long s = mdev->rs_same_csum;
			const unsigned long t = mdev->rs_total;
			const int ratio =
				(t == 0)     ? 0 :
				(t < 100000) ? ((s*100)/t) : (s/(t/100));
			dev_info(DEV, "%u %% had equal check sums, eliminated: %luK; "
			     "transferred %luK total %luK\n",
			     ratio,
			     Bit2KB(mdev->rs_same_csum),
			     Bit2KB(mdev->rs_total - mdev->rs_same_csum),
			     Bit2KB(mdev->rs_total));
		}
	}

	if (mdev->rs_failed) {
		dev_info(DEV, " %lu failed blocks\n", mdev->rs_failed);

		if (os.conn == C_SYNC_TARGET || os.conn == C_PAUSED_SYNC_T) {
			ns.disk = D_INCONSISTENT;
			ns.pdsk = D_UP_TO_DATE;
		} else {
			ns.disk = D_UP_TO_DATE;
			ns.pdsk = D_INCONSISTENT;
		}
	} else {
		ns.disk = D_UP_TO_DATE;
		ns.pdsk = D_UP_TO_DATE;

		if (os.conn == C_SYNC_TARGET || os.conn == C_PAUSED_SYNC_T) {
			if (mdev->p_uuid) {
				int i;
				for (i = UI_BITMAP ; i <= UI_HISTORY_END ; i++)
					_drbd_uuid_set(mdev, i, mdev->p_uuid[i]);
				drbd_uuid_set(mdev, UI_BITMAP, mdev->ldev->md.uuid[UI_CURRENT]);
				_drbd_uuid_set(mdev, UI_CURRENT, mdev->p_uuid[UI_CURRENT]);
			} else {
				dev_err(DEV, "mdev->p_uuid is NULL! BUG\n");
			}
		}

		drbd_uuid_set_bm(mdev, 0UL);

		if (mdev->p_uuid) {
			/* Now the two UUID sets are equal, update what we
			 * know of the peer. */
			int i;
			for (i = UI_CURRENT ; i <= UI_HISTORY_END ; i++)
				mdev->p_uuid[i] = mdev->ldev->md.uuid[i];
		}
	}

	_drbd_set_state(mdev, ns, CS_VERBOSE, NULL);
out_unlock:
	spin_unlock_irq(&mdev->req_lock);
	put_ldev(mdev);
out:
	mdev->rs_total  = 0;
	mdev->rs_failed = 0;
	mdev->rs_paused = 0;
	mdev->ov_start_sector = 0;

	if (test_and_clear_bit(WRITE_BM_AFTER_RESYNC, &mdev->flags)) {
		dev_warn(DEV, "Writing the whole bitmap, due to failed kmalloc\n");
		drbd_queue_bitmap_io(mdev, &drbd_bm_write, NULL, "write from resync_finished");
	}

	if (khelper_cmd)
		drbd_khelper(mdev, khelper_cmd);

	return 1;
}

/* helper */
static void move_to_net_ee_or_free(struct drbd_conf *mdev, struct drbd_epoch_entry *e)
{
	if (drbd_ee_has_active_page(e)) {
		/* This might happen if sendpage() has not finished */
		spin_lock_irq(&mdev->req_lock);
		list_add_tail(&e->w.list, &mdev->net_ee);
		spin_unlock_irq(&mdev->req_lock);
	} else
		drbd_free_ee(mdev, e);
}

/**
 * w_e_end_data_req() - Worker callback, to send a P_DATA_REPLY packet in response to a P_DATA_REQUEST
 * @mdev:	DRBD device.
 * @w:		work object.
 * @cancel:	The connection will be closed anyways
 */
int w_e_end_data_req(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
{
	struct drbd_epoch_entry *e = container_of(w, struct drbd_epoch_entry, w);
	int ok;

	if (unlikely(cancel)) {
		drbd_free_ee(mdev, e);
		dec_unacked(mdev);
		return 1;
	}

	if (likely((e->flags & EE_WAS_ERROR) == 0)) {
		ok = drbd_send_block(mdev, P_DATA_REPLY, e);
	} else {
		if (__ratelimit(&drbd_ratelimit_state))
			dev_err(DEV, "Sending NegDReply. sector=%llus.\n",
			    (unsigned long long)e->sector);

		ok = drbd_send_ack(mdev, P_NEG_DREPLY, e);
	}

	dec_unacked(mdev);

	move_to_net_ee_or_free(mdev, e);

	if (unlikely(!ok))
		dev_err(DEV, "drbd_send_block() failed\n");
	return ok;
}

/**
 * w_e_end_rsdata_req() - Worker callback to send a P_RS_DATA_REPLY packet in response to a P_RS_DATA_REQUEST
 * @mdev:	DRBD device.
 * @w:		work object.
 * @cancel:	The connection will be closed anyways
 */
int w_e_end_rsdata_req(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
{
	struct drbd_epoch_entry *e = container_of(w, struct drbd_epoch_entry, w);
	int ok;

	if (unlikely(cancel)) {
		drbd_free_ee(mdev, e);
		dec_unacked(mdev);
		return 1;
	}

	if (get_ldev_if_state(mdev, D_FAILED)) {
		drbd_rs_complete_io(mdev, e->sector);
		put_ldev(mdev);
	}

	if (likely((e->flags & EE_WAS_ERROR) == 0)) {
		if (likely(mdev->state.pdsk >= D_INCONSISTENT)) {
			inc_rs_pending(mdev);
			ok = drbd_send_block(mdev, P_RS_DATA_REPLY, e);
		} else {
			if (__ratelimit(&drbd_ratelimit_state))
				dev_err(DEV, "Not sending RSDataReply, "
				    "partner DISKLESS!\n");
			ok = 1;
		}
	} else {
		if (__ratelimit(&drbd_ratelimit_state))
			dev_err(DEV, "Sending NegRSDReply. sector %llus.\n",
			    (unsigned long long)e->sector);

		ok = drbd_send_ack(mdev, P_NEG_RS_DREPLY, e);

		/* update resync data with failure */
		drbd_rs_failed_io(mdev, e->sector, e->size);
	}

	dec_unacked(mdev);

	move_to_net_ee_or_free(mdev, e);

	if (unlikely(!ok))
		dev_err(DEV, "drbd_send_block() failed\n");
	return ok;
}

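/* Worker callback for checksum based resync on the sync source side:
 * compare the digest sent by the peer with the digest of our local
 * block.  If they match, the block is marked in sync and only a
 * P_RS_IS_IN_SYNC ack goes back; otherwise the full block is sent as a
 * P_RS_DATA_REPLY. */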
int w_e_end_csum_rs_req(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
{
	struct drbd_epoch_entry *e = container_of(w, struct drbd_epoch_entry, w);
	struct digest_info *di;
	int digest_size;
	void *digest = NULL;
	int ok, eq = 0;

	if (unlikely(cancel)) {
		drbd_free_ee(mdev, e);
		dec_unacked(mdev);
		return 1;
	}

	drbd_rs_complete_io(mdev, e->sector);

	di = e->digest;

	if (likely((e->flags & EE_WAS_ERROR) == 0)) {
		/* quick hack to try to avoid a race against reconfiguration.
		 * a real fix would be much more involved,
		 * introducing more locking mechanisms */
		if (mdev->csums_tfm) {
			digest_size = crypto_hash_digestsize(mdev->csums_tfm);
			D_ASSERT(digest_size == di->digest_size);
			digest = kmalloc(digest_size, GFP_NOIO);
		}
		if (digest) {
			drbd_csum_ee(mdev, mdev->csums_tfm, e, digest);
			eq = !memcmp(digest, di->digest, digest_size);
			kfree(digest);
		}

		if (eq) {
			drbd_set_in_sync(mdev, e->sector, e->size);
			/* rs_same_csums unit is BM_BLOCK_SIZE */
			mdev->rs_same_csum += e->size >> BM_BLOCK_SHIFT;
			ok = drbd_send_ack(mdev, P_RS_IS_IN_SYNC, e);
		} else {
			inc_rs_pending(mdev);
			e->block_id = ID_SYNCER;
			ok = drbd_send_block(mdev, P_RS_DATA_REPLY, e);
		}
	} else {
		ok = drbd_send_ack(mdev, P_NEG_RS_DREPLY, e);
		if (__ratelimit(&drbd_ratelimit_state))
			dev_err(DEV, "Sending NegDReply. I guess it gets messy.\n");
	}

	dec_unacked(mdev);
	move_to_net_ee_or_free(mdev, e);

	if (unlikely(!ok))
		dev_err(DEV, "drbd_send_block/ack() failed\n");
	return ok;
}

int w_e_end_ov_req(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
{
	struct drbd_epoch_entry *e = container_of(w, struct drbd_epoch_entry, w);
	int digest_size;
	void *digest;
	int ok = 1;

	if (unlikely(cancel))
		goto out;

	if (unlikely((e->flags & EE_WAS_ERROR) != 0))
		goto out;

	digest_size = crypto_hash_digestsize(mdev->verify_tfm);
	/* FIXME if this allocation fails, online verify will not terminate! */
	digest = kmalloc(digest_size, GFP_NOIO);
	if (digest) {
		drbd_csum_ee(mdev, mdev->verify_tfm, e, digest);
		inc_rs_pending(mdev);
		ok = drbd_send_drequest_csum(mdev, e->sector, e->size,
					     digest, digest_size, P_OV_REPLY);
		if (!ok)
			dec_rs_pending(mdev);
		kfree(digest);
	}

out:
	drbd_free_ee(mdev, e);

	dec_unacked(mdev);

	return ok;
}

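/* Record a block found out of sync by online verify: extend the current
 * ov_last_oos_* range if the block is contiguous with it, otherwise start
 * a new range; mark the block out of sync in the bitmap and request a
 * bitmap write-out after the run via WRITE_BM_AFTER_RESYNC. */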
void drbd_ov_oos_found(struct drbd_conf *mdev, sector_t sector, int size)
{
	if (mdev->ov_last_oos_start + mdev->ov_last_oos_size == sector) {
		mdev->ov_last_oos_size += size>>9;
	} else {
		mdev->ov_last_oos_start = sector;
		mdev->ov_last_oos_size = size>>9;
	}
	drbd_set_out_of_sync(mdev, sector, size);
	set_bit(WRITE_BM_AFTER_RESYNC, &mdev->flags);
}

int w_e_end_ov_reply(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
{
	struct drbd_epoch_entry *e = container_of(w, struct drbd_epoch_entry, w);
	struct digest_info *di;
	int digest_size;
	void *digest;
	int ok, eq = 0;

	if (unlikely(cancel)) {
		drbd_free_ee(mdev, e);
		dec_unacked(mdev);
		return 1;
	}

	/* after "cancel", because after drbd_disconnect/drbd_rs_cancel_all
	 * the resync lru has been cleaned up already */
	drbd_rs_complete_io(mdev, e->sector);

	di = e->digest;

	if (likely((e->flags & EE_WAS_ERROR) == 0)) {
		digest_size = crypto_hash_digestsize(mdev->verify_tfm);
		digest = kmalloc(digest_size, GFP_NOIO);
		if (digest) {
			drbd_csum_ee(mdev, mdev->verify_tfm, e, digest);

			D_ASSERT(digest_size == di->digest_size);
			eq = !memcmp(digest, di->digest, digest_size);
			kfree(digest);
		}
	} else {
		ok = drbd_send_ack(mdev, P_NEG_RS_DREPLY, e);
		if (__ratelimit(&drbd_ratelimit_state))
			dev_err(DEV, "Sending NegDReply. I guess it gets messy.\n");
	}

	dec_unacked(mdev);
	if (!eq)
		drbd_ov_oos_found(mdev, e->sector, e->size);
	else
		ov_oos_print(mdev);

	ok = drbd_send_ack_ex(mdev, P_OV_RESULT, e->sector, e->size,
			      eq ? ID_IN_SYNC : ID_OUT_OF_SYNC);

	drbd_free_ee(mdev, e);

	if (--mdev->ov_left == 0) {
		ov_oos_print(mdev);
		drbd_resync_finished(mdev);
	}

	return ok;
}

int w_prev_work_done(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
{
	struct drbd_wq_barrier *b = container_of(w, struct drbd_wq_barrier, w);
	complete(&b->done);
	return 1;
}

int w_send_barrier(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
{
	struct drbd_tl_epoch *b = container_of(w, struct drbd_tl_epoch, w);
	struct p_barrier *p = &mdev->data.sbuf.barrier;
	int ok = 1;

	/* really avoid racing with tl_clear.  w.cb may have been referenced
	 * just before it was reassigned and re-queued, so double check that.
	 * actually, this race was harmless, since we only try to send the
	 * barrier packet here, and otherwise do nothing with the object.
	 * but compare with the head of w_clear_epoch */
	spin_lock_irq(&mdev->req_lock);
	if (w->cb != w_send_barrier || mdev->state.conn < C_CONNECTED)
		cancel = 1;
	spin_unlock_irq(&mdev->req_lock);
	if (cancel)
		return 1;

	if (!drbd_get_data_sock(mdev))
		return 0;
	p->barrier = b->br_number;
	/* inc_ap_pending was done where this was queued.
	 * dec_ap_pending will be done in got_BarrierAck
	 * or (on connection loss) in w_clear_epoch.  */
	ok = _drbd_send_cmd(mdev, mdev->data.socket, P_BARRIER,
				(struct p_header *)p, sizeof(*p), 0);
	drbd_put_data_sock(mdev);

	return ok;
}

int w_send_write_hint(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
{
	if (cancel)
		return 1;
	return drbd_send_short_cmd(mdev, P_UNPLUG_REMOTE);
}

/**
 * w_send_dblock() - Worker callback to send a P_DATA packet in order to mirror a write request
 * @mdev:	DRBD device.
 * @w:		work object.
 * @cancel:	The connection will be closed anyways
 */
int w_send_dblock(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
{
	struct drbd_request *req = container_of(w, struct drbd_request, w);
	int ok;

	if (unlikely(cancel)) {
		req_mod(req, send_canceled);
		return 1;
	}

	ok = drbd_send_dblock(mdev, req);
	req_mod(req, ok ? handed_over_to_network : send_failed);

	return ok;
}

/**
 * w_send_read_req() - Worker callback to send a read request (P_DATA_REQUEST) packet
 * @mdev:	DRBD device.
 * @w:		work object.
 * @cancel:	The connection will be closed anyways
 */
int w_send_read_req(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
{
	struct drbd_request *req = container_of(w, struct drbd_request, w);
	int ok;

	if (unlikely(cancel)) {
		req_mod(req, send_canceled);
		return 1;
	}

	ok = drbd_send_drequest(mdev, P_DATA_REQUEST, req->sector, req->size,
				(unsigned long)req);

	if (!ok) {
		/* ?? we set C_TIMEOUT or C_BROKEN_PIPE in drbd_send();
		 * so this is probably redundant */
		if (mdev->state.conn >= C_CONNECTED)
			drbd_force_state(mdev, NS(conn, C_NETWORK_FAILURE));
	}
	req_mod(req, ok ? handed_over_to_network : send_failed);

	return ok;
}

int w_restart_disk_io(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
{
	struct drbd_request *req = container_of(w, struct drbd_request, w);

	if (bio_data_dir(req->master_bio) == WRITE)
		drbd_al_begin_io(mdev, req->sector);
	/* Calling drbd_al_begin_io() out of the worker might deadlock
	   theoretically. Practically it cannot deadlock, since this is
	   only used when unfreezing IOs. All the extents of the requests
	   that made it into the TL are already active */

	drbd_req_make_private_bio(req, req->master_bio);
	req->private_bio->bi_bdev = mdev->ldev->backing_bdev;
	generic_make_request(req->private_bio);

	return 1;
}

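/* "sync after" dependencies: a device may be configured to resync only
 * after another minor has finished.  _drbd_may_sync_now() walks that
 * dependency chain; _drbd_pause_after() and _drbd_resume_next() toggle
 * the aftr_isp bit on all devices accordingly. */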
static int _drbd_may_sync_now(struct drbd_conf *mdev)
{
	struct drbd_conf *odev = mdev;

	while (1) {
		if (odev->sync_conf.after == -1)
			return 1;
		odev = minor_to_mdev(odev->sync_conf.after);
		ERR_IF(!odev) return 1;
		if ((odev->state.conn >= C_SYNC_SOURCE &&
		     odev->state.conn <= C_PAUSED_SYNC_T) ||
		    odev->state.aftr_isp || odev->state.peer_isp ||
		    odev->state.user_isp)
			return 0;
	}
}

/**
 * _drbd_pause_after() - Pause resync on all devices that may not resync now
 * @mdev:	DRBD device.
 *
 * Called from process context only (admin command and after_state_ch).
 */
static int _drbd_pause_after(struct drbd_conf *mdev)
{
	struct drbd_conf *odev;
	int i, rv = 0;

	for (i = 0; i < minor_count; i++) {
		odev = minor_to_mdev(i);
		if (!odev)
			continue;
		if (odev->state.conn == C_STANDALONE && odev->state.disk == D_DISKLESS)
			continue;
		if (!_drbd_may_sync_now(odev))
			rv |= (__drbd_set_state(_NS(odev, aftr_isp, 1), CS_HARD, NULL)
			       != SS_NOTHING_TO_DO);
	}

	return rv;
}

/**
 * _drbd_resume_next() - Resume resync on all devices that may resync now
 * @mdev:	DRBD device.
 *
 * Called from process context only (admin command and worker).
 */
static int _drbd_resume_next(struct drbd_conf *mdev)
{
	struct drbd_conf *odev;
	int i, rv = 0;

	for (i = 0; i < minor_count; i++) {
		odev = minor_to_mdev(i);
		if (!odev)
			continue;
		if (odev->state.conn == C_STANDALONE && odev->state.disk == D_DISKLESS)
			continue;
		if (odev->state.aftr_isp) {
			if (_drbd_may_sync_now(odev))
				rv |= (__drbd_set_state(_NS(odev, aftr_isp, 0),
							CS_HARD, NULL)
				       != SS_NOTHING_TO_DO) ;
		}
	}
	return rv;
}

void resume_next_sg(struct drbd_conf *mdev)
{
	write_lock_irq(&global_state_lock);
	_drbd_resume_next(mdev);
	write_unlock_irq(&global_state_lock);
}

void suspend_other_sg(struct drbd_conf *mdev)
{
	write_lock_irq(&global_state_lock);
	_drbd_pause_after(mdev);
	write_unlock_irq(&global_state_lock);
}

static int sync_after_error(struct drbd_conf *mdev, int o_minor)
{
	struct drbd_conf *odev;

	if (o_minor == -1)
		return NO_ERROR;
	if (o_minor < -1 || minor_to_mdev(o_minor) == NULL)
		return ERR_SYNC_AFTER;

	/* check for loops */
	odev = minor_to_mdev(o_minor);
	while (1) {
		if (odev == mdev)
			return ERR_SYNC_AFTER_CYCLE;

		/* dependency chain ends here, no cycles. */
		if (odev->sync_conf.after == -1)
			return NO_ERROR;

		/* follow the dependency chain */
		odev = minor_to_mdev(odev->sync_conf.after);
	}
}

int drbd_alter_sa(struct drbd_conf *mdev, int na)
{
	int changes;
	int retcode;

	write_lock_irq(&global_state_lock);
	retcode = sync_after_error(mdev, na);
	if (retcode == NO_ERROR) {
		mdev->sync_conf.after = na;
		do {
			changes  = _drbd_pause_after(mdev);
			changes |= _drbd_resume_next(mdev);
		} while (changes);
	}
	write_unlock_irq(&global_state_lock);
	return retcode;
}

static void ping_peer(struct drbd_conf *mdev)
{
	clear_bit(GOT_PING_ACK, &mdev->flags);
	request_ping(mdev);
	wait_event(mdev->misc_wait,
		   test_bit(GOT_PING_ACK, &mdev->flags) || mdev->state.conn < C_CONNECTED);
}

/**
 * drbd_start_resync() - Start the resync process
 * @mdev:	DRBD device.
 * @side:	Either C_SYNC_SOURCE or C_SYNC_TARGET
 *
 * This function might bring you directly into one of the
 * C_PAUSED_SYNC_* states.
 */
void drbd_start_resync(struct drbd_conf *mdev, enum drbd_conns side)
{
	union drbd_state ns;
	int r;

	if (mdev->state.conn >= C_SYNC_SOURCE) {
		dev_err(DEV, "Resync already running!\n");
		return;
	}

	/* In case a previous resync run was aborted by an IO error/detach on the peer. */
	drbd_rs_cancel_all(mdev);

	if (side == C_SYNC_TARGET) {
		/* Since application IO was locked out during C_WF_BITMAP_T and
		   C_WF_SYNC_UUID we are still unmodified. Before going to C_SYNC_TARGET
		   we check that we might make the data inconsistent. */
		r = drbd_khelper(mdev, "before-resync-target");
		r = (r >> 8) & 0xff;
		if (r > 0) {
			dev_info(DEV, "before-resync-target handler returned %d, "
			     "dropping connection.\n", r);
			drbd_force_state(mdev, NS(conn, C_DISCONNECTING));
			return;
		}
	}

	drbd_state_lock(mdev);

	if (!get_ldev_if_state(mdev, D_NEGOTIATING)) {
		drbd_state_unlock(mdev);
		return;
	}

	if (side == C_SYNC_TARGET) {
		mdev->bm_resync_fo = 0;
	} else /* side == C_SYNC_SOURCE */ {
		u64 uuid;

		get_random_bytes(&uuid, sizeof(u64));
		drbd_uuid_set(mdev, UI_BITMAP, uuid);
		drbd_send_sync_uuid(mdev, uuid);

		D_ASSERT(mdev->state.disk == D_UP_TO_DATE);
	}

	write_lock_irq(&global_state_lock);
	ns = mdev->state;

	ns.aftr_isp = !_drbd_may_sync_now(mdev);

	ns.conn = side;

	if (side == C_SYNC_TARGET)
		ns.disk = D_INCONSISTENT;
	else /* side == C_SYNC_SOURCE */
		ns.pdsk = D_INCONSISTENT;

	r = __drbd_set_state(mdev, ns, CS_VERBOSE, NULL);
	ns = mdev->state;

	if (ns.conn < C_CONNECTED)
		r = SS_UNKNOWN_ERROR;

	if (r == SS_SUCCESS) {
		unsigned long tw = drbd_bm_total_weight(mdev);
		unsigned long now = jiffies;
		int i;

		mdev->rs_failed    = 0;
		mdev->rs_paused    = 0;
		mdev->rs_same_csum = 0;
		mdev->rs_total     = tw;
		mdev->rs_start     = now;
		for (i = 0; i < DRBD_SYNC_MARKS; i++) {
			mdev->rs_mark_left[i] = tw;
			mdev->rs_mark_time[i] = now;
		}
		_drbd_pause_after(mdev);
	}
	write_unlock_irq(&global_state_lock);
	put_ldev(mdev);

	if (r == SS_SUCCESS) {
		dev_info(DEV, "Began resync as %s (will sync %lu KB [%lu bits set]).\n",
		     drbd_conn_str(ns.conn),
		     (unsigned long) mdev->rs_total << (BM_BLOCK_SHIFT-10),
		     (unsigned long) mdev->rs_total);

		if (mdev->rs_total == 0) {
			/* Peer still reachable? Beware of failing before-resync-target handlers! */
			ping_peer(mdev);
			drbd_resync_finished(mdev);
		}

		atomic_set(&mdev->rs_sect_in, 0);
		mdev->rs_in_flight = 0;
		mdev->rs_planed = 0;
		spin_lock(&mdev->peer_seq_lock);
		fifo_set(&mdev->rs_plan_s, 0);
		spin_unlock(&mdev->peer_seq_lock);
		/* ns.conn may already be != mdev->state.conn,
		 * we may have been paused in between, or become paused until
		 * the timer triggers.
		 * No matter, that is handled in resync_timer_fn() */
		if (ns.conn == C_SYNC_TARGET)
			mod_timer(&mdev->resync_timer, jiffies);

		drbd_md_sync(mdev);
	}
	drbd_state_unlock(mdev);
}

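/* Main loop of the per-device worker thread: wait on the work queue
 * semaphore (uncorking the data socket while idle), pop one work item
 * at a time and run its callback; a failing callback while connected
 * forces the connection into C_NETWORK_FAILURE.  On shutdown the
 * remaining queue is drained with cancel set before
 * drbd_mdev_cleanup() is called. */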
int drbd_worker(struct drbd_thread *thi)
{
	struct drbd_conf *mdev = thi->mdev;
	struct drbd_work *w = NULL;
	LIST_HEAD(work_list);
	int intr = 0, i;

	sprintf(current->comm, "drbd%d_worker", mdev_to_minor(mdev));

	while (get_t_state(thi) == Running) {
		drbd_thread_current_set_cpu(mdev);

		if (down_trylock(&mdev->data.work.s)) {
			mutex_lock(&mdev->data.mutex);
			if (mdev->data.socket && !mdev->net_conf->no_cork)
				drbd_tcp_uncork(mdev->data.socket);
			mutex_unlock(&mdev->data.mutex);

			intr = down_interruptible(&mdev->data.work.s);

			mutex_lock(&mdev->data.mutex);
			if (mdev->data.socket && !mdev->net_conf->no_cork)
				drbd_tcp_cork(mdev->data.socket);
			mutex_unlock(&mdev->data.mutex);
		}

		if (intr) {
			D_ASSERT(intr == -EINTR);
			flush_signals(current);
			ERR_IF (get_t_state(thi) == Running)
				continue;
			break;
		}

		if (get_t_state(thi) != Running)
			break;
		/* With this break, we have done a down() but not consumed
		   the entry from the list. The cleanup code takes care of
		   this...   */

		w = NULL;
		spin_lock_irq(&mdev->data.work.q_lock);
		ERR_IF(list_empty(&mdev->data.work.q)) {
			/* something terribly wrong in our logic.
			 * we were able to down() the semaphore,
			 * but the list is empty... doh.
			 *
			 * what is the best thing to do now?
			 * try again from scratch, restarting the receiver,
			 * asender, whatnot? could break even more ugly,
			 * e.g. when we are primary, but no good local data.
			 *
			 * I'll try to get away just starting over this loop.
			 */
			spin_unlock_irq(&mdev->data.work.q_lock);
			continue;
		}
		w = list_entry(mdev->data.work.q.next, struct drbd_work, list);
		list_del_init(&w->list);
		spin_unlock_irq(&mdev->data.work.q_lock);

		if (!w->cb(mdev, w, mdev->state.conn < C_CONNECTED)) {
			/* dev_warn(DEV, "worker: a callback failed! \n"); */
			if (mdev->state.conn >= C_CONNECTED)
				drbd_force_state(mdev,
						NS(conn, C_NETWORK_FAILURE));
		}
	}
	D_ASSERT(test_bit(DEVICE_DYING, &mdev->flags));
	D_ASSERT(test_bit(CONFIG_PENDING, &mdev->flags));

	spin_lock_irq(&mdev->data.work.q_lock);
	i = 0;
	while (!list_empty(&mdev->data.work.q)) {
		list_splice_init(&mdev->data.work.q, &work_list);
		spin_unlock_irq(&mdev->data.work.q_lock);

		while (!list_empty(&work_list)) {
			w = list_entry(work_list.next, struct drbd_work, list);
			list_del_init(&w->list);
			w->cb(mdev, w, 1);
			i++; /* dead debugging code */
		}

		spin_lock_irq(&mdev->data.work.q_lock);
	}
	sema_init(&mdev->data.work.s, 0);
	/* DANGEROUS race: if someone did queue his work within the spinlock,
	 * but up() ed outside the spinlock, we could get an up() on the
	 * semaphore without corresponding list entry.
	 * So don't do that.
	 */
	spin_unlock_irq(&mdev->data.work.q_lock);

	D_ASSERT(mdev->state.disk == D_DISKLESS && mdev->state.conn == C_STANDALONE);
	/* _drbd_set_state only uses stop_nowait.
	 * wait here for the Exiting receiver. */
	drbd_thread_stop(&mdev->receiver);
	drbd_mdev_cleanup(mdev);

	dev_info(DEV, "worker terminated\n");

	clear_bit(DEVICE_DYING, &mdev->flags);
	clear_bit(CONFIG_PENDING, &mdev->flags);
	wake_up(&mdev->state_wait);

	return 0;
}